# Setup, Data Import, Functions

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' #'last_expr'

import math, time, datetime as dt, os, sys 
from pathlib import Path

import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 51

import matplotlib.pyplot as plt
import seaborn as sns
# sns.set_theme()

import numpy as np
np.set_printoptions(edgeitems=5,linewidth=250)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier

RS = 336699 # Random State

In [None]:
# Kaggle input directory holding the competition CSV files.
base_path = '/kaggle/input/tabular-playground-series-apr-2022/'

# Labels file; it is joined onto the readings below via the shared `sequence` column.
df_train_lbl = pd.read_csv(f'{base_path}/train_labels.csv')
('df_train_lbl.shape', df_train_lbl.shape)

# Raw training readings.
df_train = pd.read_csv(f'{base_path}/train.csv')
('df_train.shape', df_train.shape)

# Left join keeps every reading row and attaches the label columns to it.
df_train = pd.merge(df_train, df_train_lbl, how='left', on=['sequence'])
('df_train.shape', df_train.shape)

# Name of the column the models are trained to predict.
target_col = 'state'
('target_col', target_col)

# Sensor columns are selected by position: everything except the first three
# columns and the last one.
# NOTE(review): presumably the skipped columns are sequence/subject/step and the
# merged-in target - confirm against the CSV headers.
sensor_cols = list(df_train.columns[3:-1])
('sensor_cols', sensor_cols)

# Intro

<h3>Hi!</h3>

In this notebook I will present some feature engineering ideas and test them on 3 gradient boosting models using GroupKFold cross-validation.<br>
For now, I am using only 5 CV folds and the models are not tuned.<br>
When I run out of ideas for features, I will tune the models with Optuna and ensemble them with stacking and/or voting.

In [None]:
def score(models, X, y, groups, n_splits=5):
    """Cross-validate every model with GroupKFold and report its ROC AUC.

    Parameters
    ----------
    models : iterable of estimators
        Each estimator must provide ``fit`` and either ``predict_proba`` or
        ``predict``. Estimators are refitted in place on every fold.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Binary target, positionally aligned with ``X``.
    groups : array-like
        Group label of every row of ``X``; rows sharing a label are never
        split between the training and validation part of a fold.
    n_splits : int, default 5
        Number of GroupKFold folds.

    Returns
    -------
    list of str
        One ``'mean +/- std'`` summary of the fold AUCs per model, in the
        order of ``models``. The same summary is also printed per model.
    """
    # GroupKFold is deterministic, so the folds are computed once and shared by
    # all models instead of being rebuilt inside the model loop.
    folds = list(GroupKFold(n_splits=n_splits).split(X, groups=groups))

    all_scores = []
    for model in models:

        fold_aucs = []
        for idx_train, idx_test in folds:
            X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
            y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]

            model.fit(X_train, y_train)

            # ROC AUC ranks samples, so it needs a continuous score. Hard class
            # labels from predict() collapse the ranking to two levels and
            # understate the AUC; use the positive-class probability instead.
            if hasattr(model, 'predict_proba'):
                y_score = model.predict_proba(X_test)[:, 1]
            else:
                y_score = model.predict(X_test)

            fold_aucs.append(roc_auc_score(y_test, y_score))

        cv_mean_std = f'{np.mean(fold_aucs):.3f} +/- {np.std(fold_aucs):.4f}'
        all_scores.append(cv_mean_std)
        print(f'{model.__class__.__name__} - {cv_mean_std}')

    return all_scores

# Classifiers compared in every experiment; all share the same random seed.
models = [
    LGBMClassifier(random_state=RS),
    XGBClassifier(
        random_state=RS,
        use_label_encoder=False,
        verbosity=0,
        tree_method='gpu_hist',
    ),
    CatBoostClassifier(random_state=RS, silent=True, task_type='GPU'),
]

# Results table: one row per model, one column added per experiment.
df_scores = pd.DataFrame({'models': [type(m).__name__ for m in models]})


def highlight_cols(s):
    """Return the CSS rule used to highlight a cell; the cell value `s` is ignored."""
    return 'background-color: % s' % 'lightgreen'

# Fit: Raw Data - baseline

In [None]:
# Baseline inputs: the merged training frame as-is.
df = df_train

# Target vector.
y = df.loc[:, 'state']
y.shape

# Features: every sensor column plus the `step` column.
X = df[[*sensor_cols, 'step']]
X.shape

Let's train the GBMs on raw data to have a baseline to improve on.

In [None]:
%%time
df_scores['raw-data'] = score(models, X, y, df.subject)
df_scores.style.applymap(highlight_cols, subset=pd.IndexSlice[:, ['raw-data']])

The results of the latest training run are highlighted in green. I will add a column with the results of each new experiment to this dataframe as we go.

# Fit: Aggregate All Sensor Data per Sequence

This will drastically reduce the amount of data: the readings of each of the 13 sensors are replaced with 5 summary statistics (mean, std, skew, max, min), and aggregating by sequence reduces the number of rows 60-fold.

In [None]:
# One row per (sequence, subject) pair, with five summary statistics for every
# sensor column.
X2 = (
    df.groupby(['sequence', 'subject'])[sensor_cols]
      .agg(['mean', 'std', 'skew', 'max', 'min'])
      .reset_index()
)
# Flatten the two-level column labels into '<column>_<stat>'. The group keys have
# an empty second level, so they end up named 'sequence_' and 'subject_'.
X2.columns = X2.columns.map('_'.join)
X2.shape

# Target per sequence.
# NOTE(review): min() assumes `state` is constant within a sequence - confirm.
y2 = df.groupby(['sequence'])['state'].min()
y2.shape

# Subject per sequence, used as the GroupKFold grouping key.
# NOTE(review): min() assumes each sequence belongs to a single subject - confirm.
group2 = df.groupby(['sequence'])['subject'].min()
group2.shape

In [None]:
%%time
df_scores['aggregate-sensor'] = score(models, X2, y2, group2)
df_scores.style.applymap(highlight_cols, subset=pd.IndexSlice[:, ['aggregate-sensor']])

# Fit: Add Subject Count

In [None]:
# Number of rows in `df` that belong to each subject.
# value_counts() names its result differently across pandas versions ('subject'
# before 2.0, 'count' from 2.0 on), so the Series is renamed explicitly to give
# the merged feature a stable column name.
subject_count = df['subject'].value_counts().rename('subject_count')

# Attach the count to every aggregated row through its 'subject_' key column.
X3 = X2.merge(subject_count, left_on=['subject_'], right_index=True, how='left')

# The left merge against a unique index must neither add nor drop rows; y2 and
# group2 are reused with X3 and rely on that.
assert len(X3) == len(X2), 'subject_count merge changed the number of rows'

In [None]:
%%time
# we've just added a new column to X3, we can reuse y2 and group2
df_scores['agg+subject_count'] = score(models, X3, y2, group2)
df_scores.style.applymap(highlight_cols, subset=pd.IndexSlice[:, ['agg+subject_count']])

### What's next :
- explore other feature engineering ideas
- optimise training:
    - increase cv to 10+
    - optimise models with Optuna
    - stack multiple models
    - explore voting