# Stacking

In this notebook we look at the best parameters found for the following models:

1. XGBoost
2. LightGBM
3. CatBoost
4. HistGradientBoosting (scikit-learn)

We then use stacking to ensemble these 4 models.

**Note:** I leave the models on their verbose settings so I can monitor their training since it will take a long time to finish

In [1]:
# Global variables for testing changes to this notebook quickly
RANDOM_SEED = 0
NUM_TREES = 15000
EARLY_STOP = 200
NUM_FOLDS = 3
TEST = False
SUBMIT = True

In [2]:
# General imports
import numpy as np
import pandas as pd
import scipy.stats as stats
import pyarrow
import time
import gc

# Evaluation and model selection
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Models
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# Hide warnings (makes optuna output easier to parse)
import warnings
warnings.filterwarnings('ignore')

# Preparing the Data

We define our cross-validation scheme at the start to ensure that it is the same across all the models we consider

In [3]:
%%time

# Load Data
train = pd.read_feather("../data/train.feather")
test = pd.read_feather("../data/test.feather")
submission = pd.read_csv('../data/sample_submission.csv')

if TEST:
    train, junk = train_test_split(
        train, 
        train_size = 0.1,
        shuffle = True,
        stratify = train['target'],
    )
    train.reset_index(drop = True, inplace = True)
    
    del junk
    gc.collect()

# Relevant features
features = [x for x in train.columns if x not in ['id','target']]

Wall time: 2.23 s


In [4]:
# Stratified k-fold cross-validation
train['kfold'] = -1
skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['target'])):
    train['kfold'].iloc[valid_idx] = fold
    
oof_preds = pd.DataFrame(
    data = dict(kfold = train['kfold'])
)

test_preds = pd.DataFrame(
    data = dict(id = test['id'])
)

# Feature Engineering

We experiment with feature engineering using row statistics, primarily to add variance to our predictions.

In [5]:
def create_row_stats(data):
    cont_cols, cat_cols = list(), list()
    for col in features:
        if data[col].dtype.name.startswith("int"):
            cat_cols.append(col)
        else:
            cont_cols.append(col)
    new_data = data.copy()
    new_data['binary_count'] = data[cat_cols].sum(axis=1)
    new_data['binary_std'] = data[cat_cols].std(axis=1)
    new_data['min'] = data[cont_cols].min(axis=1)
    new_data['std'] = data[cont_cols].std(axis=1)
    new_data['max'] = data[cont_cols].max(axis=1)
    new_data['median'] = data[cont_cols].median(axis=1)
    new_data['mean'] = data[cont_cols].mean(axis=1)
    #new_data['var'] = data[cont_cols].var(axis=1)
    #new_data['sum'] = data[cont_cols].sum(axis=1)
    #new_data['sem'] = data[cont_cols].sem(axis=1)
    new_data['skew'] = data[cont_cols].skew(axis=1)
    new_data['median_abs_dev'] = stats.median_abs_deviation(data[cont_cols], axis=1)
    new_data['zscore'] = (np.abs(stats.zscore(data[cont_cols]))).sum(axis=1)
    return new_data

In [6]:
%%time

train = create_row_stats(train)
test = create_row_stats(test)

# New features
all_features = [x for x in train.columns if x not in ['id','target','kfold']]
assert features != all_features

Wall time: 53 s


# 1. XGBoost

We use the best parameters from [this Kaggle notebook](https://www.kaggle.com/rsizem2/tps-10-21-optuna-w-pruning-callbacks-xgboost). Except for using CPU rather than GPU, which in a lot of cases results in more accurate results

In [7]:
# Best Parameters
xgboost_params = {
    'random_state': RANDOM_SEED,
    'n_estimators': NUM_TREES,
    #'tree_method': 'hist',
    'max_depth': 5, 
    'learning_rate': 0.02261104274598307, 
    'min_child_weight': 74.7573299373233, 
    'subsample': 0.766, 
    'colsample_bytree': 0.268, 
    'colsample_bylevel': 0.591, 
    'reg_lambda': 75.35694292360638
}

In [8]:
def train_xgboost(model_params = {}, fit_params = {}, new_features = False):
    
    # Store the  predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    print('')
    
    # Stratified k-fold cross-validation
    for fold in range(NUM_FOLDS):
        
        # Training and Validation Sets
        if new_features:
            X_train, y_train = train[train.kfold != fold][features], train[train.kfold != fold]['target']
            X_valid, y_valid = train[train.kfold == fold][features], train[train.kfold == fold]['target']
            X_test = test[features]
        else:
            X_train, y_train = train[train.kfold != fold][all_features], train[train.kfold != fold]['target']
            X_valid, y_valid = train[train.kfold == fold][all_features], train[train.kfold == fold]['target']
            X_test = test[all_features]
        
        # Define Model
        model = XGBClassifier(**{**xgboost_params, **model_params})
        gc.collect()
        
        start = time.time()
        
        model.fit(
            X_train, y_train,
            verbose = False,
            eval_set = [(X_valid, y_valid)],
            eval_metric = "auc",
            early_stopping_rounds = EARLY_STOP,
            **fit_params
        )
        
        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[train.kfold == fold] = valid_preds
        
        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} (AUC): {round(fold_auc, 5)} in {round(end - start, 2)}s.')
        
    return test_preds, oof_preds

In [9]:
# Train 3 models 
test_preds['XGBoost'], oof_preds['XGBoost'] = train_xgboost()
test_preds['XGB_Hist'], oof_preds['XGB_Hist'] = train_xgboost(
    model_params = dict(tree_method = 'hist')
)
test_preds['XGB_Stats'], oof_preds['XGB_Stats'] = train_xgboost(new_features = True)


Fold 0 (AUC): 0.85685 in 2180.56s.
Fold 1 (AUC): 0.85702 in 2497.38s.
Fold 2 (AUC): 0.85683 in 2284.06s.

Fold 0 (AUC): 0.85688 in 1511.14s.
Fold 1 (AUC): 0.85698 in 1625.21s.
Fold 2 (AUC): 0.8568 in 1430.2s.

Fold 0 (AUC): 0.85691 in 1962.9s.
Fold 1 (AUC): 0.85697 in 2149.46s.
Fold 2 (AUC): 0.85674 in 2147.12s.


# 2. LightGBM

In [10]:
# Best Parameters
lightgbm_params = {
    'random_state': RANDOM_SEED,
    'n_estimators': NUM_TREES,
    'max_depth': 6, 
    'learning_rate': 0.009099999999999999, 
    'min_child_samples': 4260, 
    'subsample': 0.87, 
    'subsample_freq': 3, 
    'colsample_bytree': 0.27, 
    'reg_lambda': 0.0003694272556917343, 
    'num_leaves': 26,
}

In [11]:
def train_lightgbm(model_params = {}, fit_params = {}, new_features = False):
    
    # Store the holdout predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    print('')
    
    # Stratified k-fold cross-validation
    for fold in range(NUM_FOLDS):
        
        # Training and Validation Sets
        if new_features:
            X_train, y_train = train[train.kfold != fold][features], train[train.kfold != fold]['target']
            X_valid, y_valid = train[train.kfold == fold][features], train[train.kfold == fold]['target']
            X_test = test[features]
        else:
            X_train, y_train = train[train.kfold != fold][all_features], train[train.kfold != fold]['target']
            X_valid, y_valid = train[train.kfold == fold][all_features], train[train.kfold == fold]['target']
            X_test = test[all_features]
        
        # Define Model
        model = LGBMClassifier(**{**lightgbm_params, **model_params})
        gc.collect()
        
        start = time.time()
        
        model.fit(
            X_train, y_train,
            verbose = 0,
            eval_set = [(X_valid, y_valid)],
            eval_metric = "auc",
            early_stopping_rounds = EARLY_STOP,
            **fit_params
        )
        
        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[train.kfold == fold] = valid_preds
        
        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} (AUC): {round(fold_auc, 5)} in {round(end - start, 2)}s.')
        
    return test_preds, oof_preds

In [12]:
# Train 2 models
test_preds['LightGBM'], oof_preds['LightGBM'] = train_lightgbm()
test_preds['LGBM_Stats'], oof_preds['LGBM_Stats'] = train_lightgbm(new_features = True)


Fold 0 (AUC): 0.85706 in 584.37s.
Fold 1 (AUC): 0.8571 in 682.35s.
Fold 2 (AUC): 0.85693 in 645.17s.

Fold 0 (AUC): 0.85706 in 621.14s.
Fold 1 (AUC): 0.85713 in 650.93s.
Fold 2 (AUC): 0.85689 in 629.13s.


# 3. CatBoost

In [13]:
# Best Parameters
catboost_params = {
    'random_state': RANDOM_SEED,
    'n_estimators': NUM_TREES,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'early_stopping_rounds': EARLY_STOP,
    'eval_metric': 'AUC',
    'max_depth': 7, 
    'learning_rate': 0.01, 
    'min_child_samples': 12710, 
    'random_strength': 33.21156029537479, 
    'leaf_estimation_iterations': 1, 
    'subsample': 0.6990000000000001, 
    'reg_lambda': 60.52806724303393
}

In [14]:
def train_catboost(model_params = {}, fit_params = {}, new_features = False):
    
    # Store the predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    print('')

    # Stratified k-fold cross-validation
    for fold in range(NUM_FOLDS):
        
        # Training and Validation Sets
        if new_features:
            X_train, y_train = train[train.kfold != fold][features], train[train.kfold != fold]['target']
            X_valid, y_valid = train[train.kfold == fold][features], train[train.kfold == fold]['target']
            X_test = test[features]
        else:
            X_train, y_train = train[train.kfold != fold][all_features], train[train.kfold != fold]['target']
            X_valid, y_valid = train[train.kfold == fold][all_features], train[train.kfold == fold]['target']
            X_test = test[all_features]
        
        start = time.time()
        
        # Define Model
        model = CatBoostClassifier(**{**catboost_params, **model_params})
        gc.collect()
        
        model.fit(
            X_train, y_train,
            verbose = False,
            eval_set = [(X_valid, y_valid)],
            use_best_model = True,
            **fit_params
        )
        
        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[train.kfold == fold] = valid_preds
        
        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} (AUC): {round(fold_auc, 5)} in {round(end - start, 2)}s.')
        
    return test_preds, oof_preds

In [15]:
# Train CatBoost
test_preds['CatBoost'], oof_preds['CatBoost'] = train_catboost()
test_preds['Cat_Stats'], oof_preds['Cat_Stats'] = train_catboost(new_features = True)


Fold 0 (AUC): 0.85667 in 2564.81s.
Fold 1 (AUC): 0.85683 in 2898.82s.
Fold 2 (AUC): 0.85663 in 2881.62s.

Fold 0 (AUC): 0.85667 in 2802.61s.
Fold 1 (AUC): 0.85675 in 2816.5s.
Fold 2 (AUC): 0.85659 in 2820.11s.


# 4. Scikit-Learn

In [16]:
# Best Parameters
histgbc_params = {
    'random_state': RANDOM_SEED,
    'max_iter': NUM_TREES,
    'validation_fraction': 0.33,
    'early_stopping': True,
    'n_iter_no_change': EARLY_STOP,
    'verbose': 0,
}

In [17]:
def train_histgbm(model_params = {}, fit_params = {}, new_features = False):
    
    # Store the predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    print('')
    
    # Stratified k-fold cross-validation
    for fold in range(NUM_FOLDS):
        
        # Training and Validation Sets
        if new_features:
            X_train, y_train = train[train.kfold != fold][features], train[train.kfold != fold]['target']
            X_valid, y_valid = train[train.kfold == fold][features], train[train.kfold == fold]['target']
            X_test = test[features]
        else:
            X_train, y_train = train[train.kfold != fold][all_features], train[train.kfold != fold]['target']
            X_valid, y_valid = train[train.kfold == fold][all_features], train[train.kfold == fold]['target']
            X_test = test[all_features]
        
        # Define Model
        model = HistGradientBoostingClassifier(**{**histgbc_params, **model_params})
        gc.collect()
        
        start = time.time()
        
        model.fit(
            X_train, y_train,
            **fit_params
        )
        
        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[train.kfold == fold] = valid_preds
        
        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} (AUC): {round(fold_auc, 5)} in {round(end - start, 2)}s.')
        
    return test_preds, oof_preds

In [18]:
# Train 2 models with different random seets
test_preds['HistGBM'], oof_preds['HistGBM'] = train_histgbm()
test_preds['Hist_Stats'], oof_preds['Hist_Stats'] = train_histgbm(new_features = True)


Fold 0 (AUC): 0.8542 in 156.65s.
Fold 1 (AUC): 0.85421 in 159.53s.
Fold 2 (AUC): 0.85438 in 161.03s.

Fold 0 (AUC): 0.85407 in 173.18s.
Fold 1 (AUC): 0.85388 in 159.88s.
Fold 2 (AUC): 0.85394 in 162.2s.


# Predictions

In [19]:
oof_preds.head()

Unnamed: 0,kfold,XGBoost,XGB_Hist,XGB_Stats,LightGBM,LGBM_Stats,CatBoost,Cat_Stats,HistGBM,Hist_Stats
0,1,0.695719,0.67458,0.66308,0.677566,0.688227,0.642127,0.647414,0.695878,0.633604
1,1,0.197818,0.209527,0.198422,0.195066,0.185122,0.228275,0.224138,0.156371,0.24644
2,2,0.865054,0.844144,0.858808,0.860939,0.850658,0.866861,0.858791,0.86133,0.851976
3,0,0.465767,0.487962,0.477282,0.47484,0.492786,0.495233,0.499936,0.3204,0.526619
4,0,0.846476,0.846924,0.86786,0.861961,0.857914,0.847776,0.85161,0.819631,0.845977


In [20]:
test_preds.head()

Unnamed: 0,id,XGBoost,XGB_Hist,XGB_Stats,LightGBM,LGBM_Stats,CatBoost,Cat_Stats,HistGBM,Hist_Stats
0,1000000,0.742483,0.738944,0.74303,0.740738,0.733244,0.732316,0.737226,0.691237,0.650669
1,1000001,0.245132,0.250275,0.240614,0.251458,0.24521,0.262507,0.236537,0.267694,0.264484
2,1000002,0.906799,0.908351,0.908237,0.907186,0.906372,0.905988,0.91013,0.877557,0.891221
3,1000003,0.818379,0.820664,0.851904,0.834636,0.850805,0.842568,0.854714,0.841316,0.832215
4,1000004,0.255338,0.251446,0.258755,0.265646,0.263108,0.279009,0.274908,0.243637,0.254913


# Generate Submissions

We create submissions for the CPU generated predictions to see if they are better than the GPU generated models we created with Kaggle notebooks.

In [21]:
# Make submission
submission['target'] = test_preds['XGBoost']
if SUBMIT: submission.to_csv(f'../output/xgboost_cpu_{NUM_FOLDS}fold_submission.csv', index=False)

In [22]:
# Make submission
submission['target'] = test_preds['CatBoost']
if SUBMIT: submission.to_csv(f'../output/catboost_cpu_{NUM_FOLDS}fold_submission.csv', index=False)

# Stacking

We use XGBoost and LightGBM as meta models for stacking:

## 1. LightGBM Classifier

In [23]:
def stack_lightgbm():
    preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    
    for j in range(NUM_FOLDS):
        X_train = oof_preds[oof_preds.kfold != j].drop('kfold', axis = 1)
        X_valid = oof_preds[oof_preds.kfold == j].drop('kfold', axis = 1)
        y_train = train['target'][train.kfold != j]
        y_valid = train['target'][train.kfold == j]
        X_test = test_preds.drop('id', axis = 1)

        model = LGBMClassifier(random_state = RANDOM_SEED, n_estimators = 200)
        model.fit(
            X_train, y_train,
            verbose = 0,
            eval_set = [(X_valid, y_valid)],
            eval_metric = "auc",
            early_stopping_rounds = 25,
        )

        preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS 
        preds_valid = model.predict_proba(X_valid)[:, 1]
        scores[j] = roc_auc_score(y_valid, preds_valid)
        print("Fold", j ,"(AUC):", scores[j])

    print("Avg (AUC):", round(scores.mean(),6))
    print("Min (AUC):", round(scores.min(),6))

    return preds

In [24]:
# LGBMClassifier meta model
submission['target'] = stack_lightgbm()
if SUBMIT: submission.to_csv(f'../output/stack_lgbm_{NUM_FOLDS}fold_submission.csv', index=False)

Fold 0 (AUC): 0.8571198053744282
Fold 1 (AUC): 0.8572540298873037
Fold 2 (AUC): 0.8570818167391762
Avg (AUC): 0.857152
Min (AUC): 0.857082


## 2. XGBoost Classifier

In [25]:
def stack_xgboost():
    preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    
    for j in range(NUM_FOLDS):
        X_train = oof_preds[oof_preds.kfold != j].drop('kfold', axis = 1)
        X_valid = oof_preds[oof_preds.kfold == j].drop('kfold', axis = 1)
        y_train = train['target'][train.kfold != j]
        y_valid = train['target'][train.kfold == j]
        X_test = test_preds.drop('id', axis = 1)

        model = XGBClassifier(random_state = RANDOM_SEED, n_estimators = 200)
        model.fit(
            X_train, y_train,
            verbose = False,
            eval_set = [(X_valid, y_valid)],
            eval_metric = "auc",
            early_stopping_rounds = 25,
        )

        preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS 
        preds_valid = model.predict_proba(X_valid)[:, 1]
        scores[j] = roc_auc_score(y_valid, preds_valid)
        print("Fold", j ,"(AUC):", scores[j])

    print("Avg (AUC):", round(scores.mean(),6))
    print("Min (AUC):", round(scores.min(),6))

    return preds

In [26]:
# XGBClassifier meta model
submission['target'] = stack_xgboost()
if SUBMIT: submission.to_csv(f'../output/stack_xgb_{NUM_FOLDS}fold_submission.csv', index=False)

Fold 0 (AUC): 0.8570661991082071
Fold 1 (AUC): 0.8571657333818037
Fold 2 (AUC): 0.8570078618118256
Avg (AUC): 0.85708
Min (AUC): 0.857008
