# TPS March 21 - Stacking Ensemble

Hello everyone. Let me share with you my approach to the March 21 competition. It's a simple ensemble of four models (XGBoost, LightGBM, CatBoost and RidgeClassifier), whose individual out-of-fold predictions are then used to train a meta-classifier.

The hyperparameters of all individual models were obtained using Optuna.

Huge thanks to all the participants who published awesome notebooks during the competition; these helped me to learn a lot about different topics such as stratified k-fold, and obviously stacking.

Special thanks to Craig Thomas for his notebook which I've been greatly inspired by:
https://www.kaggle.com/craigmthomas/tps-mar-2021-stacked-starter

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import os
# List every file shipped with the Kaggle input datasets.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the competition's training set, test set and sample submission.
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv')
sub = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')

## Data Preprocessing

In [None]:
# The row identifier is not a feature; remove it from both frames.
train, test = (frame.drop(columns='id') for frame in (train, test))

In [None]:
# Split column names by dtype. `train` still contains 'target' at this point,
# so a non-object target column would be listed in `numerical`.
numerical = train.select_dtypes(exclude='object').columns
categorical = train.select_dtypes(include='object').columns

### Smooth Target Encoding

From the different tests I've made, I've found that smooth target encoding works best with LightGBM. So I'll transform categorical data using this method, and then feed that into my LightGBM model.

If you want to learn more about smooth target encoding, I recommend this great link which explains clearly how it works:
https://maxhalford.github.io/blog/target-encoding/

In [None]:
def SmoothTarget(train, test, features, weight, target_col='target'):
    """Apply smoothed (additive) target encoding to categorical columns.

    Every category value is replaced by a blend of that category's target
    mean and the global target mean::

        (count * category_mean + weight * global_mean) / (count + weight)

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding ``target_col`` and every column in ``features``.
        Its feature columns are overwritten in place.
    test : pandas.DataFrame
        Frame encoded with the statistics learned from ``train``.
        Its feature columns are overwritten in place.
    features : iterable of str
        Names of the columns to encode.
    weight : float
        Smoothing strength; larger values pull categories towards the
        global mean.
    target_col : str, default 'target'
        Name of the target column in ``train``.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``train`` and ``test`` objects, now encoded.
    """
    mean_target = train[target_col].mean()

    for col in features:
        stats = train.groupby(col)[target_col].agg(['count', 'mean'])
        smooth = (
            (stats['count'] * stats['mean'] + weight * mean_target)
            / (stats['count'] + weight)
        )

        # Values with no entry in `smooth` (categories unseen in `train`,
        # or missing values) would map to NaN; fall back to the global mean,
        # which is what the formula yields for a category with count == 0.
        train[col] = train[col].map(smooth).fillna(mean_target)
        test[col] = test[col].map(smooth).fillna(mean_target)

    return train, test

In [None]:
# Work on copies: SmoothTarget overwrites the categorical columns in place.
train_se, test_se = train.copy(), test.copy()

In [None]:
# Encode every categorical column with a smoothing weight of 10.
train_se, test_se = SmoothTarget(train_se, test_se, categorical, 10)

### Leave-One-Out Encoding

Again, after many tests, categorical data transformed using LeaveOneOut encoding gave me better results with XGBoost, so I'll use this method for my XGBoost model.

In [None]:
from category_encoders import LeaveOneOutEncoder

In [None]:
# Separate copies for the leave-one-out encoded variant of the data.
train_loo, test_loo = train.copy(), test.copy()

In [None]:
# Encode each categorical column with its own LeaveOneOutEncoder.
for col in categorical:
    loo = LeaveOneOutEncoder()
    loo.fit(train_loo[col], train_loo['target'])
    # Pass the target when transforming the training rows: without it the
    # encoder returns the plain category mean (which includes the row's own
    # label) rather than the leave-one-out value.
    train_loo[col] = loo.transform(train_loo[col], train_loo['target'])
    # Test rows have no label, so they get the fitted category statistics.
    test_loo[col] = loo.transform(test_loo[col])

## Creating Level 1 models

In [None]:
# Keep the labels aside, then strip them from every feature frame.
target = train['target']

for dataframe in (train, train_se, train_loo):
    # In-place drop returns None, so the result must not be assigned back.
    dataframe.drop(columns='target', inplace=True)

### Cross Validation Function

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, roc_auc_score, roc_curve, plot_roc_curve

In [None]:
def KFoldROC(X, y, test_set, model, params, folds, eval_set_bool):
    """Cross-validate a classifier and collect predictions for stacking.

    A fresh ``model(**params)`` is fitted on each stratified fold. The
    held-out rows receive out-of-fold probabilities, and the probabilities
    predicted for ``test_set`` are averaged over the folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    y : pandas.Series
        Binary labels aligned with ``X``.
    test_set : pandas.DataFrame
        Features to predict with every fold's model.
    model : callable
        Classifier class (or factory) accepting ``**params``; instances must
        provide ``fit`` and ``predict_proba``.
    params : dict
        Keyword arguments passed to ``model``.
    folds : int
        Number of stratified folds.
    eval_set_bool : bool
        When truthy, ``fit`` is called with ``eval_set``,
        ``early_stopping_rounds=200`` and ``verbose=False``; otherwise with
        the training data only.

    Returns
    -------
    tuple
        ``(clf, train_pred, test_pred, mean_fold_auc, overall_auc)`` where
        ``clf`` is the model fitted on the last fold, ``train_pred`` holds
        the out-of-fold probabilities, ``test_pred`` the fold-averaged test
        probabilities, ``mean_fold_auc`` the mean of the per-fold AUCs and
        ``overall_auc`` the AUC of ``train_pred`` against ``y``.
    """
    # Size the buffers from the arguments rather than from notebook globals.
    train_pred = np.zeros(len(X))
    test_pred = np.zeros(len(test_set))

    roc_score = []

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)

    for train_idx, valid_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        clf = model(**params)

        if eval_set_bool:
            # NOTE: the fold used for early stopping is the same one that is
            # scored below, so the reported AUCs are slightly optimistic.
            clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                    early_stopping_rounds=200, verbose=False)
        else:
            clf.fit(X_train, y_train)

        # Predict the held-out fold once and reuse it for storage and scoring.
        valid_pred = clf.predict_proba(X_valid)[:, 1]
        train_pred[valid_idx] = valid_pred

        test_pred += clf.predict_proba(test_set)[:, 1] / folds

        roc_score.append(roc_auc_score(y_valid, valid_pred))

    overall_roc = roc_auc_score(y, train_pred)

    return clf, train_pred, test_pred, np.mean(roc_score), overall_roc

I will be training a calibrated version of RidgeClassifier, which doesn't accept fit parameters such as eval_set. This is why I've added the eval_set_bool parameter to this cross-validation function. Depending on the model I'm training, it just allows me to specify whether the model accepts the eval_set parameter or not.

### XGB Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost settings: fixed run options first, then the tuned hyperparameters.
xgb_params = dict(
    tree_method='gpu_hist',
    eval_metric='auc',
    verbosity=0,
    learning_rate=0.011,
    n_estimators=13278,
    max_depth=21,
    reg_alpha=7.369502726375538,
    gamma=0.6911623139352171,
    reg_lambda=4.4405272244246765,
    subsample=0.8558774777122383,
    colsample_bytree=0.17259675946606295,
    min_child_weight=2.1918267231776003,
)

In [None]:
# 5-fold XGBoost on the leave-one-out encoded data, with early stopping.
xgb, train_pred_xgb, test_pred_xgb, roc_xgb, overall_roc_xgb = KFoldROC(
    X=train_loo,
    y=target,
    test_set=test_loo,
    model=XGBClassifier,
    params=xgb_params,
    folds=5,
    eval_set_bool=True,
)

In [None]:
# Mean of the per-fold validation AUCs.
print(roc_xgb)

In [None]:
# AUC computed once over all out-of-fold predictions.
print(overall_roc_xgb)

### LGBM Classifier

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# LightGBM settings: learning rate and metric, then the tuned hyperparameters.
# NOTE(review): LightGBM documents `subsample` as taking effect only when
# `subsample_freq` > 0 - confirm whether bagging is meant to be active here.
lgb_params = dict(
    learning_rate=0.03,
    metric='auc',
    n_estimators=8511,
    num_leaves=205,
    max_depth=10,
    reg_alpha=8.337753037902587,
    reg_lambda=2.778797190184823,
    subsample=0.593175849495612,
    colsample_bytree=0.4228037476166183,
    min_child_samples=1592,
)

In [None]:
# 5-fold LightGBM on the smooth-target encoded data, with early stopping.
light, train_pred_light, test_pred_light, roc_light, overall_roc_light = KFoldROC(
    X=train_se,
    y=target,
    test_set=test_se,
    model=LGBMClassifier,
    params=lgb_params,
    folds=5,
    eval_set_bool=True,
)

In [None]:
# Mean of the per-fold validation AUCs.
print(roc_light)

In [None]:
# AUC computed once over all out-of-fold predictions.
print(overall_roc_light)

### Calibrated Ridge Classifier

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import RidgeClassifier

In [None]:
# Constructor arguments for CalibratedClassifierCV: a RidgeClassifier
# calibrated with 5 internal folds, so the wrapped model can be used through
# the predict_proba calls made by KFoldROC.
# NOTE(review): recent scikit-learn releases name this argument `estimator`
# instead of `base_estimator` - confirm against the installed version.
ridge_params = {
    'base_estimator' : RidgeClassifier(),
    'cv' : 5   
}

In [None]:
# 5-fold calibrated ridge on the leave-one-out encoded data; plain fit only.
ridge, train_pred_ridge, test_pred_ridge, roc_ridge, overall_roc_ridge = KFoldROC(
    X=train_loo,
    y=target,
    test_set=test_loo,
    model=CalibratedClassifierCV,
    params=ridge_params,
    folds=5,
    eval_set_bool=False,
)

In [None]:
# Mean of the per-fold validation AUCs.
print(roc_ridge)

In [None]:
# AUC computed once over all out-of-fold predictions.
print(overall_roc_ridge)

### CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost settings. `cat_features` names the object-dtype columns; this model
# is trained on the un-encoded `train` frame, so CatBoost receives the raw
# category values.
# NOTE(review): `categorical` is a pandas Index - confirm CatBoost accepts it
# as-is or wrap it in list().
cat_params = {
    'cat_features' : categorical,
    'task_type' : 'GPU',
    'grow_policy' : 'Depthwise',
    'loss_function' : 'Logloss',
    'eval_metric' : 'AUC',
    'metric_period' : 500,
    'learning_rate': 0.01,
    'max_depth': 15,
    'l2_leaf_reg': 2.998072993047546,
    'num_boost_round': 5535,
    'min_data_in_leaf': 296,
    'bagging_temperature': 1.8002809995267188,
    'penalties_coefficient': 3.2585922042596422
}

In [None]:
# 5-fold CatBoost on the raw (un-encoded) data, with early stopping.
cb, train_pred_cb, test_pred_cb, roc_cb, overall_roc_cb = KFoldROC(
    X=train,
    y=target,
    test_set=test,
    model=CatBoostClassifier,
    params=cat_params,
    folds=5,
    eval_set_bool=True,
)

In [None]:
# Mean of the per-fold validation AUCs.
print(roc_cb)

In [None]:
# AUC computed once over all out-of-fold predictions.
print(overall_roc_cb)

## Ensemble Predictions

### Getting predictions

In [None]:
# Level-1 out-of-fold predictions plus the label, one column per model.
# The columns are stacked directly instead of building a frame with one
# column per training row and transposing it. All values are float, as before.
train_predictions = pd.DataFrame(
    np.column_stack(
        [train_pred_xgb, train_pred_light, train_pred_ridge, train_pred_cb, target]),
    columns=['XGB', 'LightGBM', 'Ridge', 'CatBoost', 'target'])

In [None]:
# Fold-averaged level-1 test predictions, one column per model, stacked
# column-wise rather than built row-wise and transposed.
test_predictions = pd.DataFrame(
    np.column_stack(
        [test_pred_xgb, test_pred_light, test_pred_ridge, test_pred_cb]),
    columns=['XGB', 'LightGBM', 'Ridge', 'CatBoost'])

Since the code above takes a while to run, I save the train and test predictions to csv files, so that I can work with them in a new notebook specifically dedicated to ensembling methods (stacking of course, but also averaging or weighted averaging).

In [None]:
# Persist the out-of-fold predictions (with the label) without the index.
train_predictions.to_csv('train_predictions.csv', index=False)

In [None]:
# Persist the level-1 test predictions without the index.
test_predictions.to_csv('test_predictions.csv', index=False)

### Averaging

In [None]:
# Unweighted mean of the four models' out-of-fold probabilities.
average_pred = (train_pred_xgb + train_pred_light + train_pred_ridge + train_pred_cb) / 4

In [None]:
# AUC of the simple average on the training labels.
print(roc_auc_score(target, average_pred))

In [None]:
# Same unweighted mean, applied to the test predictions.
average_pred_test = (test_pred_xgb + test_pred_light + test_pred_ridge + test_pred_cb) / 4

### Weighted Average

Weights were obtained using scipy.optimize in a different notebook. In the many tests I ran, including the RidgeClassifier model only decreased the overall score, so I left it out of this prediction.

In [None]:
# Blend weights for XGBoost (x), LightGBM (y) and CatBoost (z).
x, y, z = 0.33930655, 0.34311931, 0.31757414

In [None]:
# Weighted blend of the XGBoost, LightGBM and CatBoost out-of-fold predictions.
w_avg_pred = train_pred_xgb*x + train_pred_light*y + train_pred_cb*z

In [None]:
# AUC of the weighted blend on the training labels.
print(roc_auc_score(target, w_avg_pred))

In [None]:
# Same weighted blend, applied to the test predictions.
w_avg_pred_test = test_pred_xgb*x + test_pred_light*y + test_pred_cb*z

## Level 2 Classifier

Finally, let's create a meta-classifier into which we'll feed our previous models' predictions.

In [None]:
# Level-2 inputs: the label, and the level-1 predictions as features.
y_2 = train_predictions['target']
X_2 = train_predictions.drop(columns='target')

In [None]:
def KFoldROC_L2(X, y, model, folds, test_set=None):
    """Cross-validate the level-2 (meta) classifier on level-1 predictions.

    On each stratified fold ``model`` is wrapped in
    ``CalibratedClassifierCV(model, cv=5)`` and fitted. Held-out rows receive
    out-of-fold probabilities, and the probabilities predicted for
    ``test_set`` are averaged over the folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Level-1 predictions used as features (indexed with ``.iloc``).
    y : pandas.Series
        Binary labels aligned with ``X``.
    model : estimator
        Base estimator passed to ``CalibratedClassifierCV``.
    folds : int
        Number of stratified folds.
    test_set : pandas.DataFrame, optional
        Level-1 test predictions to score. Defaults to the notebook-level
        ``test_predictions`` frame, which is what this function always used.

    Returns
    -------
    tuple
        ``(clf, train_pred, test_pred, mean_fold_auc, overall_auc)`` where
        ``clf`` is the calibrated model fitted on the last fold.
    """
    if test_set is None:
        test_set = test_predictions

    # Size the buffers from the arguments rather than from notebook globals.
    train_pred = np.zeros(len(X))
    test_pred = np.zeros(len(test_set))

    roc_score = []

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)

    for train_idx, valid_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        clf = CalibratedClassifierCV(model, cv=5)
        clf.fit(X_train, y_train)

        # Predict the held-out fold once and reuse it for storage and scoring.
        valid_pred = clf.predict_proba(X_valid)[:, 1]
        train_pred[valid_idx] = valid_pred

        test_pred += clf.predict_proba(test_set)[:, 1] / folds

        roc_score.append(roc_auc_score(y_valid, valid_pred))

    overall_roc = roc_auc_score(y, train_pred)

    return clf, train_pred, test_pred, np.mean(roc_score), overall_roc

In [None]:
# 10-fold calibrated ridge meta-classifier on the level-1 predictions.
L2_clf, train_pred_L2, test_pred_L2, roc_L2, overall_roc_L2 = KFoldROC_L2(
    X=X_2,
    y=y_2,
    model=RidgeClassifier(),
    folds=10,
)

In [None]:
# Mean of the per-fold validation AUCs of the meta-classifier.
print(roc_L2)

In [None]:
# AUC computed once over all out-of-fold meta-classifier predictions.
print(overall_roc_L2)

## Submissions

Now let's produce submissions for our Level 2 Model, Average and Weighted Average models.

In [None]:
# Row identifiers taken from the sample submission file.
sub_id = sub['id']

In [None]:
# Submission for the simple average. Built column-wise so `id` keeps an
# integer dtype without a row-wise build, transpose and re-cast.
sub_avg = pd.DataFrame({
    'id': sub['id'].astype('int64'),
    'target': average_pred_test,
})

In [None]:
# Submission for the weighted average. Built column-wise so `id` keeps an
# integer dtype without a row-wise build, transpose and re-cast.
sub_wavg = pd.DataFrame({
    'id': sub['id'].astype('int64'),
    'target': w_avg_pred_test,
})

In [None]:
# Submission for the level-2 model. Built column-wise so `id` keeps an
# integer dtype without a row-wise build, transpose and re-cast.
sub_l2 = pd.DataFrame({
    'id': sub['id'].astype('int64'),
    'target': test_pred_L2,
})

In [None]:
# Write one submission file per ensembling strategy, without the index column.
sub_avg.to_csv('Submission Average.csv', index=False)
sub_wavg.to_csv('Submission Weighted Average.csv', index=False)
sub_l2.to_csv('Submission Level 2.csv', index=False)