In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sklearn as skl
import sklearn.feature_selection as skl_feature_selection
import sklearn.impute as skl_impute
import sklearn.linear_model as skl_linear_model

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

SCORING = "roc_auc"
RANDOM_STATE = 6

sns.set_theme(style="white")

In [None]:
def load_comp_data(input_dir='/kaggle/input'):
    """Load the train/test CSVs of a regular Kaggle competition.

    Walks ``input_dir`` recursively, looking for files named exactly
    ``train.csv`` and ``test.csv`` (if several exist, the last one visited
    wins), reads them and passes each through ``reduce_mem``.

    Args:
        input_dir: root directory to search; defaults to the Kaggle input mount.

    Returns:
        Tuple ``(train_data, test_data)`` of DataFrames.

    Raises:
        FileNotFoundError: if either file cannot be found under ``input_dir``.
    """
    train_name, test_name = 'train.csv', 'test.csv'
    found = {}
    for dirname, _, filenames in os.walk(input_dir):
        for filename in filenames:
            # Match the file name exactly so that e.g. "my_train.csv" from
            # another attached dataset cannot replace the competition file.
            if filename in (train_name, test_name):
                found[filename] = os.path.join(dirname, filename)

    missing = [name for name in (train_name, test_name) if name not in found]
    if missing:
        raise FileNotFoundError(
            f"Could not find {missing} under {input_dir!r}")

    train_data = pd.read_csv(found[train_name]).pipe(reduce_mem)
    test_data = pd.read_csv(found[test_name]).pipe(reduce_mem)
    return train_data, test_data


def reduce_mem(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to save memory.

    Integer columns are cast to the smallest of int8/16/32/64 whose range
    contains the column's min and max (bounds inclusive). Float columns are
    cast to float32 when their range fits, otherwise float64. Non-numeric
    columns are left untouched.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: when True, print the resulting memory usage and reduction.

    Returns:
        The same ``df`` object, to allow use with ``.pipe``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    before_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            for candidate in int_candidates:
                info = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's limit
                # (e.g. 127 for int8) is still representable.
                if info.min <= c_min and c_max <= info.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f32 = np.finfo(np.float32)
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN columns (comparisons are False).
                df[col] = df[col].astype(np.float64)

    after_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-sized frame to avoid dividing by zero.
        reduction = 100 * (before_mem - after_mem) / before_mem if before_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            after_mem, reduction))

    return df

In [None]:
def get_na_info(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with the column dtype, the
    number of missing values (``na_cnt``) and their percentage rounded to two
    decimals (``na_pct``), sorted by ascending ``na_pct``.
    """
    na_counts = df.isna().sum()
    na_share = na_counts.div(len(df)).mul(100).round(2)
    summary = pd.DataFrame(dict(dtype=df.dtypes,
                                na_cnt=na_counts,
                                na_pct=na_share))
    return summary.sort_values('na_pct')

def print_mean_sem(values):
    """Print the mean of ``values`` and its standard error.

    The standard error is the population standard deviation (numpy's default
    ``ddof=0``) divided by the square root of the number of values.
    """
    import math
    n_obs = len(values)
    center = np.mean(values)
    spread = np.std(values)
    std_err = spread / math.sqrt(n_obs)
    print(f"{center:.4f} (SEM: {std_err:.4f})")

def batches_of(seq, size=1000):
    """Yield consecutive lists of up to ``size`` items taken from ``seq``.

    Every batch has exactly ``size`` items except possibly the last one,
    which holds the remainder; nothing is yielded for an empty ``seq``.
    """
    pending = []
    position = 0
    for element in seq:
        position += 1
        pending.append(element)
        if position % size != 0:
            continue
        # a full batch has been collected
        yield pending
        pending = []
    if len(pending) > 0:
        yield pending

def small_multiple_dists(
    cont: pd.DataFrame, cols=6, hue=None, tails=True,
    kind="hist",
):
    """Draw one small distribution plot per column of ``cont``.

    Args:
        cont: DataFrame whose columns are plotted (one subplot each).
        cols: number of subplot columns in the grid.
        hue: optional column name used for colouring; it is not plotted itself.
        tails: when False, restrict each plot to the column's 2.5%-97.5%
            quantile range (``binrange`` for histograms, ``clip`` for KDEs).
        kind: ``"hist"`` (``sns.histplot``) or ``"kde"`` (``sns.kdeplot``).

    Raises:
        ValueError: if ``kind`` is not ``"hist"`` or ``"kde"``.
    """
    if kind not in ("hist", "kde"):
        raise ValueError(f"Unsupported kind: {kind}")

    # Assign axes by position among the *plotted* columns: indexing by the
    # position in cont.columns would skip a slot at the hue column and could
    # run past the end of the grid.
    plot_cols = [col for col in cont.columns if col != hue]
    rows = max(1, -(-len(plot_cols) // cols))  # ceil division, at least one row
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid
    __, axes = plt.subplots(rows, cols, figsize=(20, rows*2), squeeze=False)
    axes_ = axes.ravel()  # flatten; will not affect layout

    sns.despine(left=True)
    for ax, col in zip(axes_, plot_cols):
        kwargs = {}
        if not tails:
            lim = (
                cont[col].quantile(0.025),
                cont[col].quantile(0.975))
            kwargs["binrange" if kind == "hist" else "clip"] = lim

        if kind == "hist":
            sns.histplot(data=cont, x=col, hue=hue,
                         ax=ax, **kwargs)
        else:
            sns.kdeplot(data=cont, x=col, hue=hue,
                        common_norm=False, ax=ax, **kwargs)

        ax.grid(False, axis="x")

    # Lay out after drawing so the axis labels added above are accounted for.
    plt.tight_layout()

def plot_corr_heatmap(corr):
    """Plot the strict lower triangle of a correlation matrix as a heatmap.

    The figure is sized to one inch per feature; the upper triangle and the
    diagonal are masked, and cells are annotated with one decimal.
    """
    side = len(corr)
    upper_mask = np.triu(np.ones_like(corr))
    __, ax = plt.subplots(figsize=(side, side))
    sns.heatmap(
        corr,
        vmin=-1, center=0, vmax=1,
        annot=True, fmt='.1f',
        square=True, cmap='coolwarm',
        mask=upper_mask,
        ax=ax)

def plot_validation_curve(param_range, train_scores, valid_scores):
    """Plot train vs. validation scores against a hyper-parameter range.

    Args:
        param_range: sequence of the parameter values that were evaluated.
        train_scores: 2-D array; the code repeats each parameter value
            ``train_scores.shape[1]`` times, i.e. one row per parameter value
            and one column per repetition/fold.
        valid_scores: 2-D array with the same layout as ``train_scores``.
    """
    scores_long = (
        pd.DataFrame(dict(
                param_range=np.repeat(param_range, train_scores.shape[1]),
                train_score=np.ravel(train_scores),
                valid_score=np.ravel(valid_scores)))
            .pipe(pd.melt, id_vars='param_range',
                  value_vars=['train_score', 'valid_score'],
                  var_name='score_type', value_name='score')
    )
    # Point plots have no `style` semantic (that belongs to relational plots);
    # distinguish the two curves with per-hue markers and line styles instead.
    sns.catplot(data=scores_long, x='param_range', y='score',
                hue='score_type',
                markers=['o', 's'], linestyles=['-', '--'],
                aspect=3/2, kind='point')

In [None]:
raw_tr, raw_te = load_comp_data()
label_path = "/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv"
raw_y = pd.read_csv(label_path).pipe(reduce_mem)
raw_tr.shape, raw_y.shape, raw_te.shape

In [None]:
y = raw_y.state

## First glance

In [None]:
raw_tr.head()

In [None]:
raw_tr.describe().style.format(precision=4)

In [None]:
raw_y.describe()

In [None]:
# raw_te.info()
# raw_te.head()
# # No missing values

raw_te.describe().style.format(precision=4)

# non-overlapping seq & subject with tr

## MVP
- use mean for each sensor over the 60s period

In [None]:
class PreGroupKFold(skl.model_selection.GroupKFold):
    """GroupKFold variant whose groups are fixed when the splitter is created.

    Useful where a caller cannot forward ``groups`` to ``split``.
    """

    def __init__(self, n_splits=5, groups=None):
        """Store ``groups`` and configure the underlying GroupKFold.

        Raises:
            ValueError: if ``groups`` is not given.
        """
        if groups is None:
            raise ValueError("arg 'groups' not specified")
        super().__init__(n_splits=n_splits)
        self.groups = groups

    def split(self, X, y=None, groups=None):
        """Split using the groups given at init; the ``groups`` argument is ignored."""
        preset_groups = self.groups
        return super().split(X, y, preset_groups)

In [None]:
# Seeded so that repeated runs shuffle into the same folds.
cv = skl.model_selection.RepeatedStratifiedKFold(
    n_splits=3, n_repeats=3, random_state=RANDOM_STATE)
gkf_cv = skl.model_selection.GroupKFold(n_splits=3)

# balanced classes, stratified k-fold should be less useful
sgkf_cv = skl.model_selection.StratifiedGroupKFold(n_splits=3)

# One row per sequence holding its subject, indexed by sequence id; used as
# the `groups` argument so that a subject never spans train and validation.
# NOTE(review): groups are matched to X/y by position, so this relies on the
# sequences appearing here in the same order as in the feature matrices.
cv_groups = raw_tr[["sequence", "subject"]].drop_duplicates().set_index("sequence")
pgkf_cv = PreGroupKFold(n_splits=3, groups=cv_groups)

In [None]:
# raw_tr.sequence.value_counts()
# raw_tr.subject.value_counts()
# # one seq has 60 entries across the board
# # one subject can have multiple seqs

def to_mvp_X(raw):
    """Build the MVP feature matrix: per-sequence mean of every sensor column.

    Keeps only the columns whose name starts with ``seq`` or ``sensor`` and
    averages them within each ``sequence``.
    """
    seq_and_sensors = raw.filter(regex=r'^(seq|sensor)')
    per_sequence = seq_and_sensors.groupby("sequence")
    # one-off sanity check kept for reference:
    #   .merge(raw_y, on="sequence").pipe(get_na_info)  # no NA, good merge
    return per_sequence.mean()

X_mvp = raw_tr.pipe(to_mvp_X)
X_mvp_te = raw_te.pipe(to_mvp_X)
y_mvp = raw_y.state

X_mvp.shape, y_mvp.shape, X_mvp_te.shape

In [None]:
# C_ = (
#     skl_linear_model
#         .LogisticRegressionCV(cv=cv, scoring=SCORING)
#         .fit(X_mvp, y_mvp)
#         .C_
# )
# C_

# lor_mvp = skl_linear_model.LogisticRegression(C=C_[0])

# scores = skl.model_selection.cross_val_score(
#     lor_mvp, X_mvp, y_mvp, scoring=SCORING, cv=cv)

# print("MVP AUC:")
# print_mean_sem(scores)
# 0.57xx

## EDA

In [None]:
SENSOR_COLS = raw_tr.filter(regex=r'^sensor').columns

def calc_iqr(seq):
    """Return the first and third quartiles ``(q1, q3)`` of ``seq``.

    Despite the name, this returns the two quartiles, not their difference.
    """
    lower, upper = pd.Series(seq).quantile([.25, .75]).tolist()
    return lower, upper

def calc_box_whis(iqr, coef=1.5):
    """Return the box-plot whisker limits ``(lo, hi)``.

    Args:
        iqr: pair ``(q1, q3)`` of quartiles, e.g. as returned by ``calc_iqr``.
        coef: whisker length as a multiple of ``q3 - q1``.
    """
    first_q, third_q = iqr
    reach = (third_q - first_q) * coef
    return first_q - reach, third_q + reach

eda_tr = raw_tr.set_index('sequence')
eda_te = raw_te.set_index('sequence')
eda_bo = pd.concat([
    eda_tr.assign(test=False),
    eda_te.assign(test=True),
])
eda_tr.shape, eda_te.shape, eda_bo.shape

In [None]:
# # ~~generalize for custom agg functions~~
# # see add_morin_aggs function below
# eda_seq_tr = pd.concat([
#     eda_tr.groupby("sequence")[SENSOR_COLS].mean().add_suffix("_mean"),
#     eda_tr.groupby("sequence")[SENSOR_COLS].median().add_suffix("_median"),
#     eda_tr.groupby("sequence")[SENSOR_COLS].std().add_suffix("_std"),
#     eda_tr.groupby("sequence")[SENSOR_COLS].min().add_suffix("_min"),
#     eda_tr.groupby("sequence")[SENSOR_COLS].max().add_suffix("_max"),
#     raw_y.set_index("sequence")], axis=1)

# eda_seq_te = pd.concat([
#     eda_te.groupby("sequence")[SENSOR_COLS].mean().add_suffix("_mean"),
#     eda_te.groupby("sequence")[SENSOR_COLS].median().add_suffix("_median"),
#     eda_te.groupby("sequence")[SENSOR_COLS].std().add_suffix("_std"),
#     eda_te.groupby("sequence")[SENSOR_COLS].min().add_suffix("_min"),
#     eda_te.groupby("sequence")[SENSOR_COLS].max().add_suffix("_max")], axis=1)

# X_eda, y_eda = eda_seq_tr.drop(columns="state"), eda_seq_tr["state"]
# X_eda_te = eda_seq_te
# X_eda.shape, X_eda_te.shape

#### Univariate

In [None]:
raw_y.state.value_counts().plot(kind='bar')
# balanced

In [None]:
eda_bo.describe().style.format(precision=4)
# all sensor features centered

In [None]:
(
    eda_tr.reset_index().drop_duplicates(["subject", "sequence"])
        .subject.value_counts()
#         .pipe((sns.histplot, "data"))
)
# some subjects have more seqs than others in tr

In [None]:
(
    eda_te.reset_index().drop_duplicates(["subject", "sequence"])
        .subject.value_counts()
        .pipe((sns.histplot, "data"))
)
# some subjects have more seqs than others in te too

In [None]:
# %%time
# small_multiple_dists(eda_tr.reset_index()[SENSOR_COLS],
#                      cols=6)
# # > sensor_2 two modes, step-level

In [None]:
# %%time
# small_multiple_dists(eda_tr.reset_index()[SENSOR_COLS],
#                      cols=6, tails=False)

#### Bivariate

In [None]:
# %%time
# small_multiple_dists(eda_seq_tr.sort_index(axis=1),
#                      cols=5, kind="kde", hue="state")

#### Plots for sequence

In [None]:
GROUPED_SENSOR_NUMS = [  # from heatmap below
    2, 5, 8, 12,  # no corr
    4, 10,  # weak corr
    9, 0, 6, 1, 11, 3, 7  # med corr
]
GROUPED_SENSOR_COLS = [
    f"sensor_{n:02}" for n in GROUPED_SENSOR_NUMS
]

def list_random_seq_ids(n, state, subj=None, random_state=None):
    """Sample ``n`` sequence ids with the given ``state`` from ``raw_y``.

    Args:
        n: number of sequence ids to draw (without replacement).
        state: label value the sampled sequences must have.
        subj: optional subject id; when given, only that subject's sequences
            (looked up in ``raw_tr``) are eligible.
        random_state: optional seed forwarded to ``DataFrame.sample`` for
            reproducible draws.

    Returns:
        Series of sampled sequence ids.
    """
    row_filters = raw_y.state == state
    # `is not None` rather than truthiness: a subject id of 0 is a valid filter.
    if subj is not None:
        subj_seq_ids = raw_tr[raw_tr.subject == subj].sequence.drop_duplicates()
        row_filters &= raw_y.sequence.isin(subj_seq_ids)

    return raw_y[row_filters].sample(n, random_state=random_state).sequence

def list_random_seqs(n, state, subj=None):
    """Return all rows of ``n`` randomly chosen sequences with the given ``state``.

    Rows are taken from ``eda_tr``, a ``state`` column is added, and the
    sequence index is turned back into a column.
    """
    sampled_ids = list_random_seq_ids(n, state=state, subj=subj)
    rows = eda_tr.loc[sampled_ids]
    labelled = rows.assign(state=state)
    return labelled.reset_index()

neg_seq = list_random_seqs(1000, state=0)
pos_seq = list_random_seqs(1000, state=1)

In [None]:
# seq_long = (
#     pd.concat([neg_seq, pos_seq], ignore_index=True)
#         .melt(id_vars=["sequence", "state", "step"],
#               value_vars=SENSOR_COLS,
#               var_name="sensor", value_name="value")
# )
# seq_long.shape

In [None]:
# sns.relplot(x="step", y="value", hue="state", row="sensor",
#             kind="line",
#             aspect=2/1,
#             data=seq_long)

**Deeper look at sensor_02**

In [None]:
# sns.relplot(x="step", y="value", hue="state", row="sensor",
#             kind="line",
#             aspect=2/1,
#             data=seq_long[seq_long.sensor == "sensor_02"])

In [None]:
# seq_long[
#     (seq_long.sensor == "sensor_02")
#     & (seq_long.value.between(-2.1, 1.8))
# ].pipe(
#     (sns.displot, "data"), x="value", kind="kde", hue="state",
#     aspect=2/1)

**avg corrs**

In [None]:
# N_CORRS = 100
# corrs = [list_random_seqs(1, state=0)[SENSOR_COLS].corr()
#          for __ in range(N_CORRS)]

# avg_corr = sum(corrs) / N_CORRS

# avg_corr.pipe(plot_corr_heatmap)

In [None]:
# seq_single = (
#     pd.concat([list_random_seqs(2, state=0),
#                list_random_seqs(2, state=1)], ignore_index=True)
#         .melt(id_vars=["sequence", "state", "step"],
#               value_vars=SENSOR_COLS,
#               var_name="sensor", value_name="value")
# )


# def small_multiple_lineplots(seq, row_order=GROUPED_SENSOR_COLS):
#     sns.relplot(x="step", y="value", hue="state",
#                 col="sequence", facet_kws=dict(sharey=False),
#                 row="sensor", row_order=row_order,
#                 kind="line", estimator=None, units="sequence",  # show all lines
#                 height=3, aspect=3/2,
#                 data=seq)
    
# def small_multiple_lineplots_h(seq, col_order=GROUPED_SENSOR_COLS):
#     sns.relplot(x="step", y="value", hue="state",
#                 row="sequence", facet_kws=dict(sharey=False),
#                 col="sensor", col_order=col_order,
#                 kind="line", estimator=None, units="sequence",  # show all lines
#                 height=3, aspect=3/2,
#                 data=seq)


# seq_single.pipe(small_multiple_lineplots)

**seq heatmap**

In [None]:
# N_SEQS = 8
# def plot_seq_heatmap(data, ax, state):
#     sns.heatmap(vmin=-2.5, vmax=2.5, cmap="coolwarm", ax=ax,
#                 data=data)
#     plt.tight_layout()

# SUBJECT_FOR_SEQ_HEATMAP = 1
# # SUBJECT_FOR_SEQ_HEATMAP = 87
# # SUBJECT_FOR_SEQ_HEATMAP = 647
# state0_seq_ids = list_random_seq_ids(N_SEQS, state=0, subj=SUBJECT_FOR_SEQ_HEATMAP)
# state1_seq_ids = list_random_seq_ids(N_SEQS, state=1, subj=SUBJECT_FOR_SEQ_HEATMAP)

# __, axes = plt.subplots(2, N_SEQS, figsize=(N_SEQS*5, 8))
# for i, seq_id in enumerate(state0_seq_ids.tolist()):
#     data = eda_tr.loc[seq_id].set_index("step")[GROUPED_SENSOR_COLS].T
#     plot_seq_heatmap(data, axes[0][i], state=0)

# for i, seq_id in enumerate(state1_seq_ids.tolist()):
#     data = eda_tr.loc[seq_id].set_index("step")[GROUPED_SENSOR_COLS].T
#     plot_seq_heatmap(data, axes[1][i], state=0)

- Insight: sequences from different subjects look quite different, so validation folds should be split by subject
> -> `GroupKFold`

#### EDA model

In [None]:
# %%time
# C_ = (
#     skl_linear_model
#         .LogisticRegressionCV(cv=cv, scoring=SCORING, max_iter=200,
#                               solver="saga", penalty="l1",
#                              )
#         .fit(X_eda, y_eda)
#         .C_
# )
# C_

In [None]:
# # C = C_[0]
# C = 10000  # from previous run
# lor_eda = skl_linear_model.LogisticRegression(
#     C=C, max_iter=500, solver="saga", penalty="l1")

# scores = skl.model_selection.cross_val_score(
#     lor_eda, X_eda, y_eda,
#     scoring=SCORING, cv=gkf_cv, groups=cv_groups)

# print("EDA AUC:")
# print_mean_sem(scores)
# # EDA AUC with default skl LoR Solver:
# # 0.7651 (SEM: 0.0018)

# # EDA AUC with default skl LoR Solver + GroupKFold:
# # 0.7496 (SEM: 0.0036)  # closer to LB

## Others' ideas
#### TS Agg feats, by Lucas Morin
https://www.kaggle.com/code/lucasmorin/feature-engineering-aggregation-functions#feature-importance

data notebook:
https://www.kaggle.com/code/yichian/tps-apr-2022-morin-s-aggs

In [None]:
# agg_funcs_map = {  # feat col -> agg funcs
#     feat_col: all_functions
#     for feat_col in SENSOR_COLS
# }
# def add_morin_aggs(df):
#     df_feat = df.groupby('sequence').agg(agg_funcs_map)
#     df_feat.columns = ['_'.join(col) for col in df_feat.columns]  # flatten columns
#     map_sequence_subject = df.groupby(['sequence']).subject.min()
#     map_subject_count = df.groupby(['sequence']).subject.min().value_counts()
#     df_feat['count_sequence'] = df_feat.index.map(map_sequence_subject.map(map_subject_count))
#     return df_feat

In [None]:
MORIN_AGGS_TR_PATH = "/kaggle/input/tps-apr-2022-morin-s-aggs/X-morin-aggs-tr.parquet"
MORIN_AGGS_TE_PATH = "/kaggle/input/tps-apr-2022-morin-s-aggs/X-morin-aggs-te.parquet"
X_morin_aggs = pd.read_parquet(MORIN_AGGS_TR_PATH)
X_morin_aggs_te = pd.read_parquet(MORIN_AGGS_TE_PATH)

y_aggs = raw_y.set_index("sequence").state

X_morin_aggs.shape, X_morin_aggs_te.shape, y_aggs.shape

In [None]:
AGG_FEATS_TO_DROP_DUE_TO_NA = [
    "sensor_02_max_over_min",
]

imputer = skl_impute.SimpleImputer()
scaler = skl.preprocessing.StandardScaler()

# will have train-test contamination when CV, but ok for now
X_aggs = (
    X_morin_aggs
        .drop(columns=AGG_FEATS_TO_DROP_DUE_TO_NA)
        .pipe(imputer.fit_transform)
)
X_aggs = scaler.fit_transform(X_aggs)

X_aggs_te = (
    X_morin_aggs_te
        .drop(columns=AGG_FEATS_TO_DROP_DUE_TO_NA)
        .pipe(imputer.transform)
)
X_aggs_te = scaler.transform(X_aggs_te)

X_aggs.shape, X_aggs_te.shape

In [None]:
# %%time
# C_ = (
#     skl_linear_model
#         .LogisticRegressionCV(cv=cv, scoring=SCORING, max_iter=200,
#                               solver="saga", penalty="l1"
#                              )
#         .fit(X_aggs, y_aggs)
#         .C_
# )
# C_

# took ~ 1h

In [None]:
# # C = C_[0]
# C = 0.35938137  # from previous cell

# lor_aggs = skl_linear_model.LogisticRegression(
#     C=C, max_iter=500, solver="saga", penalty="l1")

# scores = skl.model_selection.cross_val_score(
#     lor_aggs, X_aggs, y_aggs,
#     scoring=SCORING, cv=gkf_cv, groups=cv_groups)

# print("LoR, Aggs by Morin AUC:")
# print_mean_sem(scores)
# # 0.9289 (SEM: 0.0020)
# # Wall time: 18.5 s

#### LightGBM

In [None]:
# %%time
# gbm_aggs = lgb.LGBMClassifier()

# scores = skl.model_selection.cross_val_score(
#     gbm_aggs, X_aggs, y_aggs,
#     scoring=SCORING, cv=gkf_cv, groups=cv_groups)

# print("GBM, Aggs by Morin AUC:")
# print_mean_sem(scores)
# # 0.9289 (SEM: 0.0020)
# # Wall time: 20.3 s

## Outlier removal
works for: 
- correlation; 
- PCA;
- ...

#### Check whether outliers are concentrated on specific seqs or subjects

In [None]:
# def _is_outlier(col):
#     return ~col.between(col.quantile(0.025),
#                         col.quantile(0.975))
# #     iqr = calc_iqr(col)
# #     lo, hi = calc_box_whis(iqr)
# #     return ~col.between(lo, hi)
    

# sens = eda_tr[SENSOR_COLS]
# (
#     sens.transform(_is_outlier)
#         .assign(subject=eda_tr["subject"])
#         .groupby("sequence")[SENSOR_COLS].sum()
#         .sum(axis=1).describe()
#         .pipe(display)
# )  # not concentrated on specific sequences

# (
#     sens.transform(_is_outlier)
#         .assign(subject=eda_tr["subject"])
#         .groupby("subject").sum()
#         # number of outliers per subject
#         .pipe(small_multiple_dists)
# )

# (
#     sens.transform(_is_outlier)
#         .assign(subject=eda_tr["subject"])
#         .groupby("subject").sum()
#         .sum(axis=1).plot(kind="hist", logy=True)
# )  # not concentrated on specific subjects

no easy fix, skipping outlier removal
## Feat Eng
- > flat-line seq
- > *TODO amplitude change
- > seq count, included in morin aggs
- > Hierarchical clustering + permutation importance + SequentialFeatureSelector
- > MAD

In [None]:
def agg_mad(ser):
    """Mean absolute deviation of ``ser`` around its mean (NaNs skipped).

    Computed explicitly because ``Series.mad()`` was deprecated in pandas 1.5
    and removed in pandas 2.0; the result is the same.
    """
    return (ser - ser.mean()).abs().mean()

def agg_skew(ser: pd.Series):
    """Skewness of ``ser``, as computed by ``Series.skew``."""
    skewness = ser.skew()
    return skewness

def agg_kurt(ser: pd.Series):
    """Kurtosis of ``ser``, as computed by ``Series.kurtosis``."""
    kurt = ser.kurtosis()
    return kurt

def coefficient_of_variation(x):
    """Return ``std(x) / mean(x)``, or NaN when the mean is exactly zero."""
    center = np.mean(x)
    if center == 0:
        # undefined when the mean is zero
        return np.nan
    return np.std(x) / center

In [None]:
# Number of independent components extracted by FastICA in _run_ica.
N_COMPONENTS = 2
# Column names given to the ICA components: "ica_0", "ica_1", ...
ICA_COLS = [f"ica_{i}" for i in range(N_COMPONENTS)]
# The slices follow the layout of GROUPED_SENSOR_NUMS: entries 0-3 are the
# "no corr" sensors, 4-5 the "weak corr" sensors and 6 onwards the "med corr"
# sensors. Changing that list requires updating these slices.
SELECTED_SENSOR_COLS = GROUPED_SENSOR_COLS[6:]
WEAKCORR_SENSOR_COLS = GROUPED_SENSOR_COLS[4:6]

def _standard_scale(df: pd.DataFrame):
    """Standardise the feature columns of ``df`` (zero mean, unit variance).

    ``subject``, ``sequence`` and ``step`` are treated as identifiers: they
    are excluded from scaling, and ``sequence``/``step`` are appended
    unchanged to the result (``subject`` is dropped).

    Returns:
        DataFrame with the scaled feature columns followed by ``sequence``
        and ``step``, sharing ``df``'s index.
    """
    id_cols = ["subject", "sequence", "step"]
    features = df.drop(columns=id_cols)
    scaler = skl.preprocessing.StandardScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(features),
        index=df.index,
        # Label with the columns actually scaled, in their own order.
        # Index.difference() sorts its result, which mislabels the values
        # whenever the feature columns are not already in sorted order.
        columns=features.columns,
    )

    seq_step = df[["sequence", "step"]]

    return pd.concat([scaled, seq_step], axis=1)

def _run_ica(df: pd.DataFrame):
    """Fit FastICA on ``df`` and return its components as a DataFrame.

    Extracts ``N_COMPONENTS`` components with a fixed ``RANDOM_STATE``; the
    result keeps ``df``'s index and uses ``ICA_COLS`` as column names.
    """
    decomposer = skl.decomposition.FastICA(
        n_components=N_COMPONENTS,
        max_iter=500, random_state=RANDOM_STATE)
    sources = decomposer.fit_transform(df)
    return pd.DataFrame(sources, index=df.index, columns=ICA_COLS)

def create_ica_feats(df, cols):
    """Compute step-level ICA components for the sensor columns ``cols``.

    The selected columns are standardised within each subject, re-indexed by
    ``(sequence, step)`` and decomposed with ``_run_ica``.

    Args:
        df: step-level frame indexed by sequence, with ``subject`` and
            ``step`` columns.
        cols: list of sensor column names to decompose.
    """
    wanted = cols + ["subject", "step"]
    flat = df[wanted].reset_index()
    scaled = flat.groupby("subject").apply(_standard_scale)
    indexed = scaled.set_index(["sequence", "step"])
    return _run_ica(indexed)


def create_ica_agg_feats(df):
    """Aggregate the step-level ICA components into per-sequence features.

    For every column in ``ICA_COLS`` the following statistics are computed
    per sequence: MAD, min, max, std, mean, skew, kurtosis and coefficient
    of variation. Result columns are named ``<component>_<statistic>``.
    """
    aggregations = [
        agg_mad,
        "min", "max", "std", "mean",
        agg_skew,
        agg_kurt,
        coefficient_of_variation,
    ]
    stats = df.copy().groupby("sequence")[ICA_COLS].agg(aggregations)
    # flatten the (component, statistic) MultiIndex
    stats.columns = ["_".join(levels) for levels in stats.columns]
    return stats

In [None]:
def _simple_impute(df: pd.DataFrame, imputer, te=False):
    if te:
        data = imputer.transform(df)
    else:
        data = imputer.fit_transform(df)

    return pd.DataFrame(
        data,
        index=df.index,
        columns=df.columns,
    )

def create_sensor2_flatline_nf(df):
    """Flag sequences whose ``sensor_02`` signal is constant.

    Returns an int Series named ``sensor_02_flatline`` indexed by sequence:
    1 when the sequence's min equals its max, else 0.
    """
    per_seq = df.groupby("sequence").sensor_02
    is_flat = per_seq.min() == per_seq.max()
    return is_flat.astype(int).rename("sensor_02_flatline")

def create_more_agg_feats(df):
    """Per-sequence mean absolute deviation of every sensor column.

    Result columns are named ``<sensor>_agg_mad``.
    """
    mad_by_seq = df.groupby("sequence")[SENSOR_COLS].agg([agg_mad])
    # flatten the (sensor, statistic) MultiIndex
    mad_by_seq.columns = ["_".join(parts) for parts in mad_by_seq.columns]
    return mad_by_seq

def create_sensor4_10_mad(df):
    """Per-sequence mean absolute deviation of ``sensor_04 + sensor_10``.

    Computed explicitly (mean of absolute deviations from the per-sequence
    mean) because ``SeriesGroupBy.mad()`` was deprecated in pandas 1.5 and
    removed in pandas 2.0.

    Returns:
        Series named ``sensor4_10_mad`` indexed by sequence.
    """
    combined = df.sensor_04 + df.sensor_10
    seq_mean = combined.groupby("sequence").transform("mean")
    abs_dev = (combined - seq_mean).abs()
    return abs_dev.groupby("sequence").mean().rename("sensor4_10_mad")

def feat_eng_raw(df):
    """Build the per-sequence features derived directly from raw sensor data.

    Concatenates, column-wise: the sensor_02 flat-line flag, the per-sensor
    MAD features and the MAD of ``sensor_04 + sensor_10``.
    """
    parts = [
        create_sensor2_flatline_nf(df),
        create_more_agg_feats(df),
        create_sensor4_10_mad(df),
    ]
    return pd.concat(parts, axis=1)

def feat_eng_ica(df):
    """Build per-sequence ICA aggregate features for two sensor groups.

    Runs the ICA pipeline separately on ``SELECTED_SENSOR_COLS`` (prefix
    ``medcorr_``) and ``WEAKCORR_SENSOR_COLS`` (prefix ``wkcorr_``) and
    concatenates the aggregated results column-wise.
    """
    def _ica_block(cols, prefix):
        components = create_ica_feats(df, cols=cols)
        return create_ica_agg_feats(components).add_prefix(prefix)

    medcorr = _ica_block(SELECTED_SENSOR_COLS, "medcorr_")
    wkcorr = _ica_block(WEAKCORR_SENSOR_COLS, "wkcorr_")
    return pd.concat([medcorr, wkcorr], axis=1)

def drop_agg_feats(df):
    """Drop aggregate features that are unusable.

    Removes the columns listed in ``AGG_FEATS_TO_DROP_DUE_TO_NA`` plus every
    ``sensor*length`` and ``sensor*count`` column (constant across sequences).
    """
    constant_patterns = ("sensor.*length$", "sensor.*count$")  # constant
    constant_feats = [
        col
        for pattern in constant_patterns
        for col in df.filter(regex=pattern).columns.tolist()
    ]
    feats_to_drop = AGG_FEATS_TO_DROP_DUE_TO_NA + constant_feats

    # Further candidates (currently disabled) that could be dropped before
    # hierarchical clustering to reduce multicollinearity:
    #   "count_below_0$"            # -1 corr with above_0
    #   "absolute_sum_of_changes$"  # 1 corr with mean_abs_change
    #   "value_count_0$"            # 1 corr with count_near_0
    #   "realized_volatility$"      # 1 corr with root_mean_sq
    #   "root_mean_square$"         # .99 corr with std, due to all mean ~= 0
    #   "realized_vol_skew$"        # .99 corr with realized_quarticity
    #   "realized_quarticity$"      # .99 corr with abs_max_n
    #   "mean_n_absolute_max_2$"    # .99 corr with mean_n_absolute_max_5
    #   "mean_n_absolute_max_5$"    # .99 corr with std
    #   "mean_n_absolute_max_10$"   # .99 corr with mean_n_absolute_max_5
    #   "count_near_0_0$"           # does not make much sense
    #   "quantile_01$"              # .99 corr with min
    #   "quantile_09$"              # .99 corr with max
    #   "last_location_of_maximum$" # .99 corr with first_loc
    #   "last_location_of_minimum$" # .99 corr with first_loc
    #   "absolute_maximum$"
    #   "quantile_075$"
    return df.drop(columns=feats_to_drop)


def feat_eng(df):
    """Apply the feature-cleaning steps to the combined feature frame.

    Currently this only drops unusable aggregate features.
    """
    return drop_agg_feats(df)

In [None]:
%%time
imputer = skl_impute.SimpleImputer()

ica_feat_bo = (
    pd.concat([eda_tr, eda_te])
        .pipe(feat_eng_ica)
)

X_eng = (
    pd.concat([X_morin_aggs,
               eda_tr.pipe(feat_eng_raw),
               ica_feat_bo.loc[eda_tr.index.drop_duplicates()],
              ], axis=1)
        .pipe(feat_eng)
        .pipe(_simple_impute, imputer=imputer)
)
X_eng_te = (
    pd.concat([X_morin_aggs_te,
               eda_te.pipe(feat_eng_raw),
               ica_feat_bo.loc[eda_te.index.drop_duplicates()],
              ], axis=1)
        .pipe(feat_eng)
        .pipe(_simple_impute, imputer=imputer, te=True)
)

y_eng = y_aggs

X_eng.shape, X_eng_te.shape

In [None]:
%%time
# Seeded for reproducibility. Score against y_eng, the label series defined
# alongside X_eng, rather than the `y` set near the top of the notebook.
gbm_eng = lgb.LGBMClassifier(random_state=RANDOM_STATE)

scores = skl.model_selection.cross_val_score(
    gbm_eng, X_eng, y_eng,
    scoring=SCORING, cv=gkf_cv, groups=cv_groups)

print("GBM, feat eng AUC:")
print_mean_sem(scores)
# 0.9301 (SEM: 0.0018)
# 0.9305 (SEM: 0.0019), +ica med corr
# 0.9323 (SEM: 0.0022), +ica weak corr
# Wall time: 19.1 s

In [None]:
##### use hierarchical clustering to reduce multicollinearity before permutation importance
# Import the submodules explicitly: a bare `import scipy` does not guarantee
# that scipy.cluster.hierarchy / scipy.spatial.distance are loaded.
import scipy.cluster.hierarchy
import scipy.spatial.distance

corr = X_eng.corr().values

# Ensure the correlation matrix is symmetric
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# Strongly (anti-)correlated features are "close": distance = 1 - |corr|
distance_matrix = 1 - np.abs(corr)
dist_linkage = scipy.cluster.hierarchy.ward(scipy.spatial.distance.squareform(distance_matrix))

# __, ax = plt.subplots(figsize=(80, 12))
# scipy.cluster.hierarchy.dendrogram(dist_linkage, labels=X_eng.columns.tolist(), ax=ax, leaf_rotation=90)
# pass

In [None]:
import collections

# Clusters are cut at this linkage distance; features closer than this
# end up in the same cluster.
DIST_THRESH = 0.1
# replace negative entries of the linkage matrix with 0 before clustering
dist_linkage = np.where(dist_linkage < 0, 0, dist_linkage)
cluster_ids = scipy.cluster.hierarchy.fcluster(dist_linkage, DIST_THRESH, criterion="distance")

# cluster id -> positional indices of the features assigned to it
cluster_id_to_feature_ids = collections.defaultdict(list)
for feature_id, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(feature_id)

selected_features = []
for feature_ids in cluster_id_to_feature_ids.values():
    # keep one feature per cluster: the one with the fewest distinct values
    # (the first such feature on ties)
    picked_id = min(feature_ids, key=lambda i: X_eng.iloc[:, i].nunique())
#     print(X_eng.iloc[:, feature_ids].nunique(), "\n")
    selected_features.append(picked_id)

In [None]:
X_hier = X_eng.iloc[:,selected_features]
X_hier_te = X_eng_te.iloc[:,selected_features]
X_hier.shape, X_hier_te.shape

In [None]:
# gbm_eng = lgb.LGBMClassifier()

# scores = skl.model_selection.cross_val_score(
#     gbm_eng, X_hier, y,
#     scoring=SCORING, cv=gkf_cv, groups=cv_groups)

# print("GBM, feat eng hier-cluster AUC:")
# print_mean_sem(scores)
# # 0.9279 (SEM: 0.0021)
# # 0.9324 (SEM: 0.0022), +ica med/weak

In [None]:
import sklearn.inspection as skl_inspection

def calc_permu_imp(clf, X, y, cv, cv_groups, n_repeats=5, random_state=None):
    """Compute per-fold permutation importances on held-out rows.

    For every (train, test) split yielded by ``cv.split(X, y, cv_groups)``
    the classifier is fitted on the training rows and
    ``sklearn.inspection.permutation_importance`` is evaluated on the test
    rows, scored with the notebook-level ``SCORING`` metric.

    Parameters
    ----------
    clf : estimator
        Fitted in place on each fold; after the call it holds the fit of
        the last fold.
    X : pandas.DataFrame
        Feature matrix; its columns become the index of the result.
    y : pandas object supporting ``.iloc``
        Target aligned row-by-row with ``X``.
    cv : splitter
        Object exposing ``split(X, y, groups)``.
    cv_groups : array-like
        Groups forwarded to ``cv.split``.
    n_repeats : int, default 5
        Number of shuffles per feature.
    random_state : int or None, default None
        Seed for the shuffles. ``None`` falls back to the notebook-level
        ``RANDOM_STATE`` so repeated runs select the same features.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``X.columns`` with one ``mean{j}`` and one ``std{j}``
        column per fold ``j``.
    """
    # Without a seed the shuffles differ between runs, so any threshold
    # applied to the result would pick a different feature set each time.
    seed = RANDOM_STATE if random_state is None else random_state

    permu_imp = pd.DataFrame(index=X.columns)
    for j, (tr_i, te_i) in enumerate(cv.split(X, y, cv_groups)):
        clf.fit(X.iloc[tr_i], y.iloc[tr_i])
        permu = skl_inspection.permutation_importance(
            clf, X.iloc[te_i], y.iloc[te_i],
            scoring=SCORING, n_repeats=n_repeats,
            n_jobs=-1, random_state=seed,
        )

        permu_imp[f'mean{j}'] = permu.importances_mean
        permu_imp[f'std{j}'] = permu.importances_std
    return permu_imp

In [None]:
# %%time
# gbm_eng = lgb.LGBMClassifier()
# permu_imp = calc_permu_imp(
#     gbm_eng, X_hier, y,
#     gkf_cv, cv_groups
# )
# permu_selected_feats = permu_imp.loc[
#     (permu_imp.mean0 - 2*permu_imp.std0 > 0)
#     | (permu_imp.mean1 - 2*permu_imp.std1 > 0)
#     | (permu_imp.mean2 - 2*permu_imp.std2 > 0)
# ].index.to_list()

# X_permu = X_hier[permu_selected_feats]
# X_permu_te = X_hier_te[permu_selected_feats]
# X_permu.shape, X_permu_te.shape

In [None]:
# %%time
# gbm_eng = lgb.LGBMClassifier()
# permu_imp = calc_permu_imp(
#     gbm_eng, X_eng, y,
#     gkf_cv, cv_groups
# )
# permu_selected_feats = permu_imp.loc[
#     (permu_imp.mean0 - 2*permu_imp.std0 > 0)
#     | (permu_imp.mean1 - 2*permu_imp.std1 > 0)
#     | (permu_imp.mean2 - 2*permu_imp.std2 > 0)
# ].index.to_list()

# # 10 min +

In [None]:
# Hard-coded feature subset; the next cell uses it to build X_permu / X_permu_te.
# NOTE(review): presumably the saved output of the commented-out
# permutation-importance selection on X_eng above (that run is annotated as
# taking 10+ minutes) — confirm before regenerating or editing by hand.
permu_selected_feats = [
    'sensor_00_mean',
    'sensor_00_coefficient_of_variation',
    'sensor_00_kurtosis',
    'sensor_00_realized_vol_skew',
    'sensor_00_quantile_075',
    'sensor_00_quantile_09',
    'sensor_00_absolute_maximum',
    'sensor_00_max_over_min',
    'sensor_00_last_location_of_maximum',
    'sensor_00_last_location_of_minimum',
    'sensor_00_number_crossing_0',
    'sensor_00_ratio_beyond_01_sigma',
    'sensor_00_ratio_beyond_03_sigma',
    'sensor_01_mean',
    'sensor_01_realized_abs_skew',
    'sensor_01_quantile_01',
    'sensor_01_quantile_09',
    'sensor_01_max_over_min',
    'sensor_01_last_location_of_maximum',
    'sensor_01_mean_second_derivative_central',
    'sensor_01_number_crossing_0',
    'sensor_01_ratio_beyond_03_sigma',
    'sensor_02_standard_deviation',
    'sensor_02_skewness',
    'sensor_02_realized_skew',
    'sensor_02_minimum',
    'sensor_02_median',
    'sensor_02_quantile_01',
    'sensor_02_quantile_025',
    'sensor_02_quantile_075',
    'sensor_02_last_location_of_maximum',
    'sensor_02_last_location_of_minimum',
    'sensor_02_first_location_of_minimum',
    'sensor_02_number_peaks_2',
    'sensor_02_number_peaks_5',
    'sensor_02_mean_n_absolute_max_10',
    'sensor_02_count_unique',
    'sensor_02_mean_abs_change',
    'sensor_02_mean_second_derivative_central',
    'sensor_02_number_crossing_0',
    'sensor_02_ratio_beyond_01_sigma',
    'sensor_03_mean',
    'sensor_03_coefficient_of_variation',
    'sensor_03_kurtosis',
    'sensor_03_realized_vol_skew',
    'sensor_03_realized_quarticity',
    'sensor_03_maximum',
    'sensor_03_quantile_09',
    'sensor_03_max_over_min',
    'sensor_03_last_location_of_maximum',
    'sensor_03_last_location_of_minimum',
    'sensor_03_mean_n_absolute_max_5',
    'sensor_04_coefficient_of_variation',
    'sensor_04_kurtosis',
    'sensor_04_quantile_01',
    'sensor_04_quantile_025',
    'sensor_04_quantile_075',
    'sensor_04_max_over_min',
    'sensor_04_last_location_of_maximum',
    'sensor_04_first_location_of_maximum',
    'sensor_04_last_location_of_minimum',
    'sensor_04_first_location_of_minimum',
    'sensor_04_number_peaks_5',
    'sensor_04_number_peaks_10',
    'sensor_04_mean_change',
    'sensor_04_number_crossing_0',
    'sensor_04_ratio_beyond_01_sigma',
    'sensor_04_ratio_beyond_02_sigma',
    'sensor_04_ratio_beyond_03_sigma',
    'sensor_05_mean',
    'sensor_05_coefficient_of_variation',
    'sensor_05_skewness',
    'sensor_05_median',
    'sensor_05_number_peaks_2',
    'sensor_05_mean_n_absolute_max_2',
    'sensor_06_mean',
    'sensor_06_coefficient_of_variation',
    'sensor_06_skewness',
    'sensor_06_kurtosis',
    'sensor_06_quantile_01',
    'sensor_06_quantile_025',
    'sensor_06_max_over_min',
    'sensor_06_last_location_of_maximum',
    'sensor_06_count_above_0',
    'sensor_06_ratio_beyond_03_sigma',
    'sensor_07_mean',
    'sensor_07_coefficient_of_variation',
    'sensor_07_skewness',
    'sensor_07_realized_skew',
    'sensor_07_quantile_075',
    'sensor_07_absolute_maximum',
    'sensor_07_mean_n_absolute_max_2',
    'sensor_07_mean_n_absolute_max_10',
    'sensor_07_mean_change',
    'sensor_07_number_crossing_0',
    'sensor_07_ratio_beyond_01_sigma',
    'sensor_07_ratio_beyond_02_sigma',
    'sensor_07_ratio_beyond_03_sigma',
    'sensor_08_coefficient_of_variation',
    'sensor_08_skewness',
    'sensor_08_kurtosis',
    'sensor_08_root_mean_square',
    'sensor_08_realized_abs_skew',
    'sensor_08_realized_skew',
    'sensor_08_realized_quarticity',
    'sensor_08_quantile_09',
    'sensor_08_mean_abs_change',
    'sensor_09_mean',
    'sensor_09_coefficient_of_variation',
    'sensor_09_skewness',
    'sensor_09_kurtosis',
    'sensor_09_realized_skew',
    'sensor_09_realized_quarticity',
    'sensor_09_minimum',
    'sensor_09_maximum',
    'sensor_09_quantile_01',
    'sensor_09_quantile_025',
    'sensor_09_quantile_075',
    'sensor_09_quantile_09',
    'sensor_09_absolute_maximum',
    'sensor_09_max_over_min',
    'sensor_09_last_location_of_minimum',
    'sensor_09_mean_n_absolute_max_5',
    'sensor_09_mean_abs_change',
    'sensor_09_number_crossing_0',
    'sensor_10_mean',
    'sensor_10_coefficient_of_variation',
    'sensor_10_kurtosis',
    'sensor_10_realized_abs_skew',
    'sensor_10_median',
    'sensor_10_maximum',
    'sensor_10_quantile_075',
    'sensor_10_quantile_09',
    'sensor_10_absolute_maximum',
    'sensor_10_max_over_min',
    'sensor_10_last_location_of_maximum',
    'sensor_10_last_location_of_minimum',
    'sensor_10_first_location_of_minimum',
    'sensor_10_mean_n_absolute_max_2',
    'sensor_10_number_crossing_0',
    'sensor_11_mean',
    'sensor_11_coefficient_of_variation',
    'sensor_11_kurtosis',
    'sensor_11_realized_abs_skew',
    'sensor_11_realized_skew',
    'sensor_11_median',
    'sensor_11_quantile_01',
    'sensor_11_quantile_025',
    'sensor_11_quantile_075',
    'sensor_11_quantile_09',
    'sensor_11_count_unique',
    'sensor_11_mean_change',
    'sensor_12_mean',
    'sensor_12_coefficient_of_variation',
    'sensor_12_skewness',
    'sensor_12_kurtosis',
    'sensor_12_root_mean_square',
    'sensor_12_realized_abs_skew',
    'sensor_12_realized_skew',
    'sensor_12_realized_vol_skew',
    'sensor_12_minimum',
    'sensor_12_maximum',
    'sensor_12_quantile_025',
    'sensor_12_quantile_09',
    'sensor_12_absolute_maximum',
    'sensor_12_max_over_min',
    'sensor_12_last_location_of_maximum',
    'sensor_12_first_location_of_maximum',
    'sensor_12_number_peaks_2',
    'sensor_12_mean_n_absolute_max_2',
    'sensor_12_number_peaks_5',
    'sensor_12_mean_n_absolute_max_5',
    'sensor_12_mean_abs_change',
    'sensor_12_mean_change',
    'sensor_12_number_crossing_0',
    'sensor_12_ratio_beyond_03_sigma',
    'count_sequence',
    'sensor_02_agg_mad',
    'sensor4_10_mad',
    'medcorr_ica_0_agg_mad',
    'medcorr_ica_0_min',
    'medcorr_ica_0_max',
    'medcorr_ica_0_coefficient_of_variation',
    'medcorr_ica_1_agg_mad',
    'medcorr_ica_1_std',
    'medcorr_ica_1_mean',
    'medcorr_ica_1_agg_skew',
    'wkcorr_ica_0_agg_mad',
    'wkcorr_ica_0_min',
    'wkcorr_ica_0_max',
    'wkcorr_ica_0_std',
    'wkcorr_ica_0_mean',
    'wkcorr_ica_0_agg_skew',
    'wkcorr_ica_0_agg_kurt',
    'wkcorr_ica_1_agg_mad',
    'wkcorr_ica_1_min',
    'wkcorr_ica_1_max',
    'wkcorr_ica_1_std',
    'wkcorr_ica_1_mean',
    'wkcorr_ica_1_agg_skew',
    'wkcorr_ica_1_coefficient_of_variation'
]

In [None]:
# Train / test views limited to the hard-coded feature subset.
X_permu, X_permu_te = (
    frame[permu_selected_feats] for frame in (X_eng, X_eng_te)
)
X_permu.shape, X_permu_te.shape

In [None]:
# (
#     permu_imp.loc[
#         (permu_imp.mean0 - 2*permu_imp.std0 > 0)
#         | (permu_imp.mean1 - 2*permu_imp.std1 > 0)
#         | (permu_imp.mean2 - 2*permu_imp.std2 > 0)
#     ]
#         .filter(regex="^mean").mean(axis=1)
#         .sort_values()
#         .tail(20).round(4)
# )

# # wkcorr_ica_1_agg_mad                  0.0007
# # sensor_12_number_peaks_2              0.0007
# # sensor_12_number_crossing_0           0.0007
# # wkcorr_ica_1_min                      0.0007
# # wkcorr_ica_0_min                      0.0007
# # sensor_11_mean                        0.0008
# # sensor_04_number_peaks_5              0.0010
# # sensor_12_coefficient_of_variation    0.0010
# # sensor_04_ratio_beyond_03_sigma       0.0011
# # sensor_05_coefficient_of_variation    0.0013
# # sensor_04_coefficient_of_variation    0.0014
# # sensor_12_kurtosis                    0.0014
# # sensor_10_coefficient_of_variation    0.0015
# # wkcorr_ica_0_agg_mad                  0.0020
# # wkcorr_ica_0_agg_kurt                 0.0024
# # sensor_02_count_unique                0.0030
# # sensor_02_standard_deviation          0.0052
# # sensor_02_mean_abs_change             0.0150
# # sensor_04_kurtosis                    0.0176
# # count_sequence                        0.0472
# # dtype: float64

In [None]:
# # tree feat imp
# gbm_eng.fit(X_eng, y)
# (
#     pd.Series(gbm_eng.feature_importances_,
#               index=gbm_eng.feature_name_)
#         .sort_values().tail(20)
# )

# # sensor_11_coefficient_of_variation     25
# # wkcorr_ica_0_max                       26
# # wkcorr_ica_1_mean                      28
# # sensor_04_ratio_beyond_03_sigma        30
# # wkcorr_ica_0_mean                      32
# # sensor_12_coefficient_of_variation     33
# # sensor_10_kurtosis                     36
# # wkcorr_ica_1_min                       36
# # wkcorr_ica_0_agg_mad                   37
# # sensor_05_mean                         37
# # sensor_04_coefficient_of_variation     40
# # sensor_05_coefficient_of_variation     45
# # wkcorr_ica_0_agg_kurt                  47
# # sensor_12_kurtosis                     50
# # sensor_04_kurtosis                     52
# # sensor_10_coefficient_of_variation     52
# # sensor_02_count_unique                 53
# # sensor_02_standard_deviation           77
# # sensor_02_mean_abs_change             154
# # count_sequence                        163

In [None]:
# The selected feature set used by everything below is the permutation-based one.
X_sel, X_sel_te = X_permu, X_permu_te
X_sel.shape, X_sel_te.shape

In [None]:
# %%time
# import sklearn.feature_selection as skl_feature_selection
# N_FEATS = 50

# seq_feat_sel = skl_feature_selection.SequentialFeatureSelector(
#     gbm_eng, n_features_to_select=N_FEATS, direction="forward",
#     scoring=SCORING, cv=pgkf_cv,
# )
# seq_feat_sel.fit(X_permu, y)
# print(seq_feat_sel.get_feature_names_out())

# X_sel = seq_feat_sel.transform(X_permu)
# X_sel_te = seq_feat_sel.transform(X_permu_te)
# X_sel.shape, X_sel_te.shape

In [None]:
# Default-parameter LightGBM, cross-validated on the selected features.
gbm_sel = lgb.LGBMClassifier()

scores = skl.model_selection.cross_val_score(
    estimator=gbm_sel,
    X=X_sel,
    y=y,
    groups=cv_groups,
    scoring=SCORING,
    cv=gkf_cv,
)

print("GBM, feat eng sel AUC:")
print_mean_sem(scores)
# 10, sfs: 0.9156 (SEM: 0.0027)
# 20, sfs: 0.9249 (SEM: 0.0023)
# 226, permu: 0.9312 (SEM: 0.0018)
# 139, hier+permu: 0.9284 (SEM: 0.0019)
# 20, hier+permu+sfs: 0.9227 (SEM: 0.0022)
# 50, hier+permu+sfs: 0.9278 (SEM: 0.0022)
# 162, hier+permu, +mad: 0.9308 (SEM: 0.0016)
# 151, hier+permu, +ica: 0.9316 (SEM: 0.0018)
# 161, hier+permu, +ica med/weak: 0.9332 (SEM: 0.0021)
# 220, permu, +ica med/weak: 0.9341 (SEM: 0.0021)

#### ICA

In [None]:
# SELECTED_SENSOR_COLS = GROUPED_SENSOR_COLS
# SELECTED_SENSOR_COLS = GROUPED_SENSOR_COLS[4:6]
# Active choice for the ICA exploration below: every entry of
# GROUPED_SENSOR_COLS from position 6 onward (alternatives are kept commented out).
SELECTED_SENSOR_COLS = GROUPED_SENSOR_COLS[6:]
# SELECTED_SENSOR_COLS

In [None]:
# Sample data for the ICA exploration cells below.
# NOTE(review): `list_random_seqs` is defined elsewhere in the notebook;
# presumably (1, state=0) draws one sequence whose state is 0 — confirm.
one_seq = list_random_seqs(1, state=0)
one_seq.shape

In [None]:
# seq_single = (
#     one_seq
#         .melt(id_vars=["sequence", "state", "step"],
#               value_vars=SELECTED_SENSOR_COLS,
#               var_name="sensor", value_name="value")
# )

# seq_single.pipe(small_multiple_lineplots_h)

In [None]:
# N_COMPONENTS = 2
# ica = skl.decomposition.FastICA(
#     n_components=N_COMPONENTS,
#     max_iter=500, random_state=RANDOM_STATE)
# one_seq_ica = ica.fit_transform(one_seq[SELECTED_SENSOR_COLS])

# ICA_COLS = [f"ica_{i}" for i in range(N_COMPONENTS)]
# (
#     pd.concat([one_seq[one_seq.columns.difference(SENSOR_COLS)],
#                pd.DataFrame(one_seq_ica, columns=ICA_COLS)
#               ], axis=1)
#         .melt(id_vars=["sequence", "state", "step"],
#               value_vars=ICA_COLS,
#               var_name="sensor", value_name="value")
#         .pipe(small_multiple_lineplots_h, col_order=ICA_COLS)
# )

In [None]:
# med_ica = (
#     eda_tr[SELECTED_SENSOR_COLS+["subject", "step"]]
#         .reset_index()
#         .groupby("subject").apply(_standard_scale)
#         .set_index(["sequence", "step"])
#         .pipe(_run_ica)
              
# )
# med_ica.shape

In [None]:
# eda_ica_tr = pd.concat([eda_tr.set_index("step", append=True), med_ica], axis=1)
# eda_ica_tr.shape

In [None]:
# SELECTED_SEQ = eda_ica_tr.index.to_series().sample(1).iat[0]
# SELECTED_SEQ = one_seq.sequence.unique()[0]

# (
#     eda_ica_tr
#         .reset_index()
#         .loc[lambda df: df.sequence == SELECTED_SEQ,
#              med_ica.columns.union(SELECTED_SENSOR_COLS+["sequence", "step"])]
#         .assign(state=0)
#         .melt(id_vars=["sequence", "state", "step"],
#               value_vars=med_ica.columns.union(SELECTED_SENSOR_COLS),
#               var_name="sensor", value_name="value")
#         .pipe(small_multiple_lineplots_h, col_order=med_ica.columns.tolist()+SELECTED_SENSOR_COLS)
# )

## LGBM Tuning

In [None]:
# gkf_cv = skl.model_selection.GroupKFold(n_splits=3)
# tr_i, te_i = next(cv.split(X_pse, y_pse, cv_groups_pse))
# display(tr_i[:5])
# display(te_i[:5])
# # gives random ones between different init

In [None]:
import optuna

def objective(trial):
    """Optuna objective: hold-out ROC AUC of a 200-tree LightGBM classifier.

    Only the first (train, test) split yielded by
    ``cv.split(X_sel, y, cv_groups)`` is used: the model is fitted on the
    train part with the hyper-parameters suggested by ``trial`` and scored
    on the test part.

    Returns
    -------
    float
        ROC AUC on the held-out part (to be maximised by the study).
    """
    # NOTE(review): relies on a notebook-level splitter named `cv` that is
    # not defined in this cell — confirm it is the intended one (e.g. gkf_cv).
    train_pos, valid_pos = next(cv.split(X_sel, y, cv_groups))

    # Search space; the suggest_* calls run in this order on every trial.
    search_space = dict(
        boosting_type=trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        num_leaves=trial.suggest_int('num_leaves', 2, 256),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.4, 1.0),
        subsample=trial.suggest_float("subsample", 0.4, 1.0),
        subsample_freq=trial.suggest_int("subsample_freq", 1, 7),
    )

    model = lgb.LGBMClassifier(n_estimators=200, **search_space)
    model.fit(X_sel.iloc[train_pos], y.iloc[train_pos])

    # Probability of the positive class on the held-out rows.
    valid_proba = model.predict_proba(X_sel.iloc[valid_pos])[:, 1]
    return skl.metrics.roc_auc_score(y.iloc[valid_pos], valid_proba)

In [None]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

best_params = study.best_trial.params

In [None]:
# Model used from here on: 200 trees plus the best hyper-parameters found
# by the Optuna study above.
gbm = lgb.LGBMClassifier(n_estimators=200, **best_params)
gbm

## Pseudo-labelling
see https://www.kaggle.com/code/hasanbasriakcay/tpsapr22-fe-pseudo-labels-baseline

In [None]:
def pseudo_labeling(X_train, X_test, y_train, cv):
    """Score the test set with fold-averaged LightGBM probabilities.

    A default ``LGBMClassifier`` is trained once per split of
    ``cv.split(X_train, y_train)``. Out-of-fold probabilities on the
    training rows are used to print an overall ROC AUC; the test-set
    probabilities of all folds are averaged.

    Parameters
    ----------
    X_train, X_test : array-like or pandas.DataFrame
        Feature matrices (converted to DataFrames).
    y_train : pandas object supporting ``.iloc``
        Target aligned row-by-row with ``X_train``.
    cv : splitter
        Object exposing ``split(X, y)`` that yields positional index arrays.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X_test`` with an extra last column ``pseudo_proba``
        holding the fold-averaged positive-class probability.

    Raises
    ------
    ValueError
        If ``cv`` yields no split at all.
    """
    from lightgbm import LGBMClassifier
    from sklearn.metrics import roc_auc_score

    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    oof = np.zeros(len(X_train))
    preds = np.zeros(len(X_test))
    n_folds = 0

    # cv.split yields *positional* indices, so rows are taken with .iloc and
    # written to `oof` by position. The previous label-based .loc lookup was
    # only correct when the index happened to be 0..n-1.
    for train_pos, valid_pos in cv.split(X_train, y_train):
        X_fit, y_fit = X_train.iloc[train_pos], y_train.iloc[train_pos]
        X_val, y_val = X_train.iloc[valid_pos], y_train.iloc[valid_pos]

        clf = LGBMClassifier(force_col_wise=True)
        clf.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])

        oof[valid_pos] = clf.predict_proba(X_val)[:, 1]
        preds += clf.predict_proba(X_test)[:, 1]
        n_folds += 1

    if n_folds == 0:
        raise ValueError("cv did not yield any split")
    # Average over the folds actually produced, so the splitter does not
    # need to expose an `n_splits` attribute.
    preds /= n_folds

    pseudo_labeled_test = X_test.copy()
    pseudo_labeled_test["pseudo_proba"] = preds

    auc = roc_auc_score(y_train, oof)
    print('LGBM scores CV =',round(auc,5))

    return pseudo_labeled_test

In [None]:
# Test features plus a `pseudo_proba` column: the positive-class probability
# averaged over the CV folds trained on the selected training features.
Xy_pseudo = pseudo_labeling(X_sel, X_sel_te, y, pgkf_cv)
Xy_pseudo.shape

In [None]:
# Subject of each test sequence: distinct (sequence, subject) pairs from the
# raw test data, as a Series indexed by sequence.
subject_groups = raw_te[['sequence', 'subject']].drop_duplicates().set_index("sequence")['subject']

In [None]:
# For each candidate threshold, count the test rows whose subject has a mean
# pseudo-probability below 1-th or above th.
for th in [0.9, 0.95, 0.99]:
    def _subject_is_confident(sf, th=th):
        """True when the subject's mean pseudo-probability lies outside [1-th, th]."""
        mean_proba = sf.pseudo_proba.mean()
        return mean_proba < 1-th or mean_proba > th

    with_qualified_subject = (
        Xy_pseudo
            .groupby(subject_groups)
            .filter(_subject_is_confident)
    )
    print(f"{th:<8}", len(Xy_pseudo.loc[with_qualified_subject.index]))

In [None]:
PSEUDO_THRESH = 0.9

# Row-level alternative (kept for reference):
# _qualified_pseudo_row = ~Xy_pseudo.pseudo_proba.between(1-PSEUDO_THRESH, PSEUDO_THRESH)

# Keep the test rows of every subject whose mean pseudo-probability is below
# 1-PSEUDO_THRESH or above PSEUDO_THRESH.
_by_subject = Xy_pseudo.groupby(subject_groups)
_qualified_pseudo_row = _by_subject.filter(
    lambda sf:
        sf.pseudo_proba.mean() < 1-PSEUDO_THRESH
        or sf.pseudo_proba.mean() > PSEUDO_THRESH
).index
Xy_pseudo_good = Xy_pseudo.loc[_qualified_pseudo_row]

# Features: every column except the last one (`pseudo_proba`).
_pseudo_features = Xy_pseudo_good.iloc[:, :-1]
# Labels: each kept row's own probability rounded to 0 / 1.
_pseudo_labels = Xy_pseudo_good.iloc[:, -1].round(0)

# Training set extended with the pseudo-labelled test rows.
X_pse = pd.concat([X_sel, _pseudo_features])
y_pse = pd.concat([raw_y.state, _pseudo_labels])

X_pse_te = X_sel_te
X_pse.shape, y_pse.shape, X_pse_te.shape

In [None]:
# CV groups for the extended training set: training subjects followed by the
# subjects of the kept pseudo-labelled test rows (same row order as X_pse).
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
cv_groups_pse = pd.concat([
    cv_groups.subject,
    subject_groups.loc[Xy_pseudo_good.index],
])

scores = skl.model_selection.cross_val_score(
    gbm, X_pse, y_pse,
    scoring=SCORING, cv=gkf_cv, groups=cv_groups_pse)

print("GBM, pseudo label AUC:")
print_mean_sem(scores)
# 0.9370 (SEM: 0.0011)

## Submission

In [None]:
# Final training data, test data and model for the submission cells below.
# NOTE(review): this rebinds the notebook-level `y` to the pseudo-label-extended
# target, so earlier cells that pair `y` with X_sel cannot be re-run afterwards
# without a kernel restart.
X, X_te, y, clf = X_pse, X_pse_te, y_pse, gbm

In [None]:
# clf.fit(X, y)
# y_pred = clf.predict_proba(X_te)[:,1]
# (
#     pd.DataFrame({"sequence": X_mvp_te.index,
#                   "state": y_pred})
#         .to_csv("submission.csv", index=False)
# )

#### self-ensemble
e.g. averaging across 50 random runs

In [None]:
%%time
N_ITERS = 50

all_proba = []
for seed in range(N_ITERS):
    X_tr, X_va, y_tr, y_va = skl.model_selection.train_test_split(
        X, y, test_size=0.05, random_state=seed,
    )
    gbm.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        callbacks=[lgb.early_stopping(10)]
    )
    proba = gbm.predict_proba(X_te)[:,1]
    all_proba.append(proba)
    
# 2m6s for 100 estimators

In [None]:
# Average the per-run test probabilities (one row per run -> column-wise mean).
y_pred = pd.DataFrame(all_proba).mean()

# The predictions were made on X_te, so the sequence ids must come from that
# same frame to guarantee row alignment (previously taken from X_mvp_te).
assert len(y_pred) == len(X_te), "prediction / test row count mismatch"
(
    # Plain arrays avoid any implicit index alignment between the two columns.
    pd.DataFrame({"sequence": X_te.index.to_numpy(),
                  "state": y_pred.to_numpy()})
        .to_csv("submission-self-ens.csv", index=False)
)

In [None]:
# Distribution of the averaged test-set probabilities.
sns.displot(y_pred)

In [None]:
# Peek at the first lines of the written submission file.
!head submission-self-ens.csv