In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

# 1) EDA

In [None]:
# Load the training data, its labels, and the test data.
train_df = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
label_df = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')
test_df = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')
# Attach the labels to the training rows (left join on the shared columns).
train_df = train_df.merge(label_df, how='left')
train_df.head()

## 1.1) Plot an example

In [None]:
# Sensor columns are those whose underscore-separated name contains the
# token 'sensor'.
sensor_columns = list(filter(lambda name: 'sensor' in name.split('_'),
                             train_df.columns))

In [None]:
def plot_sequence(sequence: int) -> None:
    """Plot all sensor columns of one training sequence.

    Selects the rows of the global ``train_df`` whose ``sequence`` value
    equals ``sequence`` and draws each column in ``sensor_columns`` on its
    own subplot, all sharing the x axis.
    """
    rows = train_df.loc[train_df.sequence == sequence, sensor_columns]
    axes = rows.plot(subplots=True, sharex=True, figsize=(18, 20))
    # Only the top subplot carries the title.
    axes[0].set_title(f'Sequence {sequence}', size=22)
    plt.show()


plot_sequence(1)

## 1.2) Histograms

In [None]:
# Distribution of the target; hist() returns the axes it drew on, so the
# title and axis labels are set directly on that object.
label_df['state'].hist().set(title="State histogram",
                             xlabel='Value',
                             ylabel='Counts')
plt.show()

# 2) Feature Engineering

For the feature engineering, we aggregate each sensor's readings per sequence into summary statistics (mean, standard deviation, maximum, minimum, skewness, median, and quantile-based statistics) and use these as the model inputs.

In [None]:
def create_features_df(df: pd.DataFrame, agg_dict) -> pd.DataFrame:
    """Aggregate ``df`` per sequence into one row of features.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``sequence`` column plus every column named in
        ``agg_dict``.
    agg_dict : dict
        Mapping of column name to the aggregation(s) to apply, in the form
        accepted by ``DataFrameGroupBy.agg``.

    Returns
    -------
    pd.DataFrame
        One row per distinct ``sequence`` value, with ``sequence`` as a
        regular column. When a column maps to a list of aggregations the
        resulting columns are named ``<column>_<aggregation>``; when it maps
        to a single aggregation the original column name is kept.
    """
    feature_df = df.groupby('sequence').agg(agg_dict)
    # Lists of aggregations yield two-level (column, aggregation) labels that
    # need flattening. A single aggregation per column yields plain string
    # labels, and joining those would insert "_" between every character.
    if isinstance(feature_df.columns, pd.MultiIndex):
        feature_df.columns = ["_".join(map(str, parts))
                              for parts in feature_df.columns]
    return feature_df.reset_index()

In [None]:
def q50(x):
    """Return the 0.5 quantile of ``x``."""
    return x.quantile(q=0.5)

def q90(x):
    """Return the 0.9 quantile of ``x``."""
    return x.quantile(q=0.9)

# Aggregations computed for every sensor column of every sequence.
# `q50` is deliberately not listed: the 0.5 quantile is the median, so it
# would only add an exact duplicate of each `<sensor>_median` column (and
# compute it through a slower Python-level callable).
dict_features = {
    col: ['mean', 'std', 'max', 'min', 'skew', 'median', q90]
    for col in sensor_columns
}

feature_df = create_features_df(train_df, dict_features)
feature_df.head()

# Build the same features for the test set; the sequence ids are kept aside
# and removed from the model inputs.
test_features = create_features_df(test_df, dict_features)
test_sequence = test_features['sequence']
test_features = test_features.drop('sequence', axis=1)
test_features.head()

In [None]:
# Attach the labels to the per-sequence features, then discard the
# `sequence` identifier so it is not used as a model input.
feature_df = feature_df.merge(label_df, how='left').drop(columns='sequence')
feature_df.head()

Constructing the train/test split.

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target from the inputs, then hold out 30% of the rows,
# stratified on the target, for evaluation.
X = feature_df.drop(columns='state')
y = feature_df['state']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42,
)

# 3) Models
First, we build helper functions that make the models easier to evaluate. We then try Logistic Regression, XGBoost, and CatBoost. After training these models, we blend them by training an XGBoost meta-model on their predictions.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

def make_pipeline(classifier):
    """Wrap ``classifier`` in a two-step pipeline: standard scaling, then the classifier.

    The steps are named "scaler" and "classifier", so the classifier's
    hyper-parameters are addressed as ``classifier__<name>`` in a search.
    """
    return Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("classifier", classifier),
    ])

def score_model(pipeline):
    """Return the ROC AUC of a fitted ``pipeline`` on the held-out data.

    Reads the module-level ``X_test`` and ``y_test`` as they are bound at
    call time, and scores the predicted probability of the second class.
    """
    positive_proba = pipeline.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_proba)

def create_submission(model, name, test_features):
    """Write ``<name>.csv`` holding the model's predictions for the test set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``.
    name : str
        Output file name without the ``.csv`` extension.
    test_features : pd.DataFrame
        Model inputs for the test set. Predictions are assigned by position,
        so the rows must be in the same order as the sample submission.
    """
    # Use the same relative data directory as the train/test CSVs read at the
    # top of the notebook, so the notebook does not depend on an absolute
    # /kaggle mount point for this one file.
    submission = pd.read_csv('../input/tabular-playground-series-apr-2022/sample_submission.csv', index_col=0)
    # Second column of predict_proba, as scored by score_model.
    submission['state'] = model.predict_proba(test_features)[:, 1]
    submission.to_csv(name + '.csv', index=True)


# Fitted searches, keyed by a short model name; filled in by the cells below.
models = {}

## 3.1) Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Logistic regression behind the shared scaling pipeline.
pipe_lr = make_pipeline(LogisticRegression(random_state=52))

# Only C takes more than one value; the other entries pin a single setting.
parameters = dict(
    classifier__C=[100., 10., 1.0, 0.1, 0.01],
    classifier__solver=['lbfgs'],
    classifier__penalty=['l2'],
    classifier__max_iter=[200],
)
gs_cv_lr = GridSearchCV(estimator=pipe_lr,
                        param_grid=parameters,
                        scoring='roc_auc',
                        cv=3)

gs_cv_lr.fit(X_train, y_train)
print(score_model(gs_cv_lr))
create_submission(gs_cv_lr, 'lr', test_features)
models['lr'] = gs_cv_lr

## 3.2) XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

xgb = XGBClassifier(n_jobs=-1,
                    use_label_encoder=False,
                    verbosity=0,
                    # tree_method='gpu_hist'
                   )

pipe_xgb = make_pipeline(xgb)

# Candidate values the randomized search samples from.
parameters = {'classifier__learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
              'classifier__max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
              'classifier__n_estimators': [100, 200, 300, 1000],
              'classifier__min_child_weight': [ 1, 3, 5, 7 ],
              'classifier__gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
              'classifier__colsample_bytree': [0.2, 0.5, 0.8, 1.0],
             }

# random_state fixes which candidates are sampled; without it the selected
# model, the printed score and the submission differ on every run.
gs_cv_xgb = RandomizedSearchCV(pipe_xgb, parameters,
                               scoring='roc_auc',
                               cv=3,
                               verbose=1,
                               random_state=42)

gs_cv_xgb.fit(X_train, y_train)
print('score', score_model(gs_cv_xgb))
create_submission(gs_cv_xgb, 'xgb', test_features)
models['xgb'] = gs_cv_xgb

## 3.3) Catboost

In [None]:
from catboost import CatBoostClassifier

cbc = CatBoostClassifier(verbose=False)

pipe_cbc = make_pipeline(cbc)

# Candidate values the randomized search samples from.
parameters = {'classifier__learning_rate': [0.05, 0.1, 0.2, 0.3],
              'classifier__depth': [4, 5, 6, 7, 8, 9, 10],
              'classifier__iterations': [10, 20, 50, 100],
             }

# random_state fixes which candidates are sampled; without it the selected
# model, the printed score and the submission differ on every run.
gs_cv_cbc = RandomizedSearchCV(pipe_cbc,
                               parameters,
                               scoring='roc_auc',
                               cv=3,
                               verbose=1,
                               random_state=42)
gs_cv_cbc.fit(X_train, y_train)
print('score', score_model(gs_cv_cbc))
create_submission(gs_cv_cbc, 'catboost', test_features)
models['catboost'] = gs_cv_cbc

## 3.4) Model blending

In [None]:
from sklearn.model_selection import cross_val_predict

col_pred_0 = []
# Skip prediction columns left over from a previous run of this cell, so it
# can be re-executed without feeding the base models their own outputs.
orig_cols = [col for col in X.columns if not col.endswith('_pred_0')]
# Rows the base models were fitted on. The meta-level split reuses the same
# split settings, so this index is the same before and after that split.
fit_idx = X_train.index
for name, mod in models.items():
    pred_col = name + '_pred_0'
    # Held-out rows were never seen by the fitted search, so its direct
    # predictions are valid meta-features for them.
    meta_feature = pd.Series(mod.predict_proba(X[orig_cols])[:, 1],
                             index=X.index, dtype='float64')
    # For the fitting rows, direct predictions are in-sample and would teach
    # the meta-model to over-trust the base models. Replace them with
    # out-of-fold predictions from clones of the selected pipeline.
    meta_feature.loc[fit_idx] = cross_val_predict(
        mod.best_estimator_,
        X.loc[fit_idx, orig_cols],
        y.loc[fit_idx],
        cv=3,
        method='predict_proba',
    )[:, 1]
    X[pred_col] = meta_feature
    test_features[pred_col] = mod.predict_proba(test_features[orig_cols])[:, 1]
    col_pred_0.append(pred_col)

In [None]:
# Meta-level inputs: only the base models' prediction columns.
X_pred_0 = X.loc[:, col_pred_0]
test_features_pred_0 = test_features.loc[:, col_pred_0]
# Same split settings as the base-level split. This rebinds the global
# X_test / y_test names that score_model reads.
X_train, X_test, y_train, y_test = train_test_split(
    X_pred_0, y, test_size=0.3, stratify=y, random_state=42,
)

In [None]:
# Meta-model: an XGBoost classifier trained on the base models' predictions.
xgb_meta = XGBClassifier(n_jobs=-1,
                    use_label_encoder=False,
                    verbosity=0,
                    # tree_method='gpu_hist'
                   )

pipe_meta_xgb = make_pipeline(xgb_meta)

# Candidate values the randomized search samples from.
parameters = {'classifier__learning_rate': [0.05, 0.1, 0.2, 0.3],
              'classifier__max_depth': [3, 6, 10],
              'classifier__n_estimators': [100, 200, 300, 1000],
              'classifier__colsample_bytree': [0.2, 0.5, 0.8, 1.0],
             }

# random_state fixes which candidates are sampled; without it the selected
# model, the printed score and the submission differ on every run.
meta_cv_xgb = RandomizedSearchCV(pipe_meta_xgb, parameters,
                               scoring='roc_auc',
                               cv=3,
                               verbose=1,
                               random_state=42)

meta_cv_xgb.fit(X_train, y_train)
print('score', score_model(meta_cv_xgb))
create_submission(meta_cv_xgb, 'meta', test_features_pred_0)