# Introduction

This notebook demonstrates simple model stacking: XGBoost, CatBoost, and Hist Gradient Boosting classifiers form level 1, and a Ridge classifier forms level 2. Each level 1 model is built with 3-fold cross-validation; training predictions are made out-of-fold, and test predictions are averaged across the folds. Those results are then fed into the level 2 Ridge model, where 3-fold cross-validation is used again to produce the predictions for the submission result. No feature engineering is employed. Basic tuning is implemented. If you like the model, please consider upvoting!

In [None]:
import pandas as pd
import numpy as np
import gc

# All competition files live in one input directory; keeping the path in a
# single constant avoids repeating it and makes it easy to repoint the notebook.
DATA_DIR = "../input/tabular-playground-series-oct-2021"

train = pd.read_csv("{}/train.csv".format(DATA_DIR))
test = pd.read_csv("{}/test.csv".format(DATA_DIR))
submission = pd.read_csv("{}/sample_submission.csv".format(DATA_DIR))

# Preview only the first rows instead of rendering the whole frame.
train.head()

# Define Features

There are two types of features in the dataset: `cat_features`, which are categorical, and `cont_features`, which are continuous.

In [None]:
# Categorical columns: f22 and f43, followed by the block f242..f284.
cat_features = ["f22", "f43"] + ["f{}".format(i) for i in range(242, 285)]

# Continuous columns: everything in f0..f241 except the two categorical ones.
cont_features = ["f{}".format(i) for i in range(242) if i not in (22, 43)]

# Labels used to fit and score every model below.
target = train["target"]

In [None]:
# Shrink the frames in place: continuous columns become float16 and
# categorical columns become int8. Casting one column at a time avoids
# holding a second full copy of either frame.
# NOTE(review): float16 keeps roughly three significant decimal digits --
# confirm that precision is acceptable for the continuous features.
downcast_plan = (
    (cont_features, np.float16),
    (cat_features, np.int8),
)

for columns, dtype in downcast_plan:
    for frame in (train, test):
        for feature in columns:
            frame[feature] = frame[feature].astype(dtype)

# Release the memory held by the replaced wide columns right away.
_ = gc.collect()

# Build Level 1 Models

In [None]:
!pip install --force-reinstall xgboost==1.3.1

In [None]:
import numpy

import gc
import warnings
warnings.filterwarnings("ignore")

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Level 1: fit XGBoost, CatBoost and HistGradientBoosting on each stratified
# fold. For every model we keep
#   * out-of-fold probabilities for the training rows (level 2 training input)
#   * test probabilities averaged over the folds (level 2 test input).
random_state = 2021
n_folds = 3
# No shuffle is requested, so the split is deterministic without a seed.
k_fold = StratifiedKFold(n_splits=n_folds)

xgb_train_probas = numpy.zeros(len(train.index), )
xgb_test_probas = numpy.zeros(len(test.index), )

cb_train_probas = numpy.zeros(len(train.index), )
cb_test_probas = numpy.zeros(len(test.index), )

hgbc_train_probas = numpy.zeros(len(train.index), )
hgbc_test_probas = numpy.zeros(len(test.index), )

# Categorical columns come first; the CatBoost positional `cat_features`
# argument below relies on this ordering.
features = cat_features + cont_features

# The test feature matrix is the same for every model and every fold, so it is
# selected once here instead of being re-materialised before each prediction.
x_test = test[features]

for fold, (train_index, test_index) in enumerate(k_fold.split(train, target)):
    print("--> Fold {}".format(fold + 1))
    y_train = target.iloc[train_index]
    y_valid = target.iloc[test_index]

    x_train = train[features].iloc[train_index]
    x_valid = train[features].iloc[test_index]

    # --- XGBoost -----------------------------------------------------------
    xgb_model = XGBClassifier(
        seed=random_state,
        n_estimators=10000,
        verbosity=1,
        eval_metric="auc",
        tree_method="gpu_hist",
        gpu_id=0,
        alpha=5.089629324639061,
        colsample_bytree=0.9908475800809204,
        gamma=2.7408840774631726,
        reg_lambda=7.653094261603253,
        learning_rate=0.07318975820906748,
        max_bin=750,
        max_depth=9,
        min_child_weight=3.0472862580065305,
        subsample=0.5607802273775566,
        use_label_encoder=False,
    )
    # n_estimators is only an upper bound: training stops early once the
    # validation AUC has not improved for 50 rounds.
    xgb_model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=0,
        early_stopping_rounds=50
    )

    # Last column of predict_proba is the positive-class probability.
    train_oof_probas = xgb_model.predict_proba(x_valid)[:, -1]
    test_oof_probas = xgb_model.predict_proba(x_test)[:, -1]

    xgb_train_probas[test_index] = train_oof_probas
    xgb_test_probas += test_oof_probas / n_folds

    print(": XGB - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_probas)))

    # Free the model before the next one is trained.
    del(xgb_model)
    _ = gc.collect()

    # --- CatBoost ----------------------------------------------------------
    cb_model = CatBoostClassifier(
        verbose=0,
        eval_metric="AUC",
        random_state=random_state,
        num_boost_round=20000,
        od_type="Iter",
        od_wait=200,
        task_type="GPU",
        devices="0",
        bootstrap_type="Bernoulli",
        grow_policy="Depthwise",
        l2_leaf_reg=6.177060577081939,
        learning_rate=0.015198885894797058,
        loss_function="CrossEntropy",
        max_depth=6,
        min_data_in_leaf=2,
        penalties_coefficient=0.8746786262649054,
        # Positional indices 0..len(cat_features)-1 are the categorical columns
        # because `features` lists the categorical columns first.
        cat_features=list(range(len(cat_features))),
    )
    cb_model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=0,
    )

    train_oof_probas = cb_model.predict_proba(x_valid)[:, -1]
    test_oof_probas = cb_model.predict_proba(x_test)[:, -1]

    cb_train_probas[test_index] = train_oof_probas
    cb_test_probas += test_oof_probas / n_folds
    print(": CB - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_probas)))

    del(cb_model)
    _ = gc.collect()

    # --- HistGradientBoosting ---------------------------------------------
    # Default hyper-parameters; seeded from the shared `random_state`.
    hgbc_model = HistGradientBoostingClassifier(
        random_state=random_state,
    )
    hgbc_model.fit(
        x_train,
        y_train,
    )

    train_oof_probas = hgbc_model.predict_proba(x_valid)[:, -1]
    test_oof_probas = hgbc_model.predict_proba(x_test)[:, -1]

    hgbc_train_probas[test_index] = train_oof_probas
    hgbc_test_probas += test_oof_probas / n_folds
    print(": HGBC - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_probas)))

    # Drop the fold's model and both feature slices before the next fold
    # allocates its own copies.
    del(hgbc_model)
    del(x_train)
    del(x_valid)
    _ = gc.collect()

    print("")

# The shared test matrix is no longer needed once all folds are done.
del(x_test)
_ = gc.collect()

# Score each model on its complete set of out-of-fold training predictions.
print("--> Overall metrics")
print(": XGB - ROC AUC Score = {}".format(roc_auc_score(target, xgb_train_probas)))
print(": CB - ROC AUC Score = {}".format(roc_auc_score(target, cb_train_probas)))
print(": HGBC - ROC AUC Score = {}".format(roc_auc_score(target, hgbc_train_probas)))

# Build Level 2 Models

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import RidgeClassifier

# Level 2: a calibrated Ridge classifier stacked on the three level 1
# probability columns, again evaluated with stratified 3-fold CV.
random_state = 2021
n_folds = 3
k_fold = StratifiedKFold(n_splits=n_folds, random_state=random_state, shuffle=True)

# Build the level 2 inputs straight from the NumPy arrays; converting them to
# Python lists first only costs time and memory.
l1_train = pd.DataFrame(data={
    "xgb": xgb_train_probas,
    "cb": cb_train_probas,
    "hgbc": hgbc_train_probas,
})
l1_test = pd.DataFrame(data={
    "xgb": xgb_test_probas,
    "cb": cb_test_probas,
    "hgbc": hgbc_test_probas,
})

train_probas = numpy.zeros(len(l1_train.index), )
test_probas = numpy.zeros(len(l1_test.index), )

features = ["xgb", "cb", "hgbc"]

# The level 2 test matrix does not change between folds, so select it once.
l1_test_inputs = l1_test[features]

for fold, (train_index, test_index) in enumerate(k_fold.split(l1_train, target)):
    print("--> Fold {}".format(fold + 1))
    y_train = target.iloc[train_index]
    y_valid = target.iloc[test_index]

    x_train = l1_train[features].iloc[train_index]
    x_valid = l1_train[features].iloc[test_index]

    # RidgeClassifier has no predict_proba; CalibratedClassifierCV wraps it
    # (with its own internal n_folds-way CV) so probabilities are available.
    model = CalibratedClassifierCV(RidgeClassifier(random_state=random_state), cv=n_folds)
    model.fit(
        x_train,
        y_train,
    )

    # Last column of predict_proba is the positive-class probability.
    train_oof_probas = model.predict_proba(x_valid)[:, -1]
    test_oof_probas = model.predict_proba(l1_test_inputs)[:, -1]

    # Out-of-fold predictions fill the held-out rows; test predictions are
    # averaged over the folds.
    train_probas[test_index] = train_oof_probas
    test_probas += test_oof_probas / n_folds

    print(": ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_probas)))
    print("")

print("--> Overall metrics")
print(": ROC AUC Score = {}".format(roc_auc_score(target, train_probas)))

# Make Submission

In [None]:
# Replace the sample submission's "target" column with the level 2 test
# probabilities, then write the file without the index column.
submission = submission.assign(target=test_probas.tolist())
submission.to_csv("submission.csv", index=False)