<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Data-overview" data-toc-modified-id="Data-overview-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data overview</a></span></li><li><span><a href="#Baseline" data-toc-modified-id="Baseline-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Baseline</a></span></li><li><span><a href="#Hyperparameter-tuning" data-toc-modified-id="Hyperparameter-tuning-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Hyperparameter tuning</a></span></li><li><span><a href="#Cross-validation" data-toc-modified-id="Cross-validation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Cross-validation</a></span></li><li><span><a href="#Submission" data-toc-modified-id="Submission-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Submission</a></span></li></ul></div>

# Setup

In [1]:
import os
import gc
import time
import warnings
import subprocess

# Silence every Python warning for the rest of the session so cell outputs stay readable.
warnings.filterwarnings('ignore')
# Make sure the cyclic garbage collector is switched on.
gc.enable()

In [2]:
import numpy as np
from scipy.stats import mode
import pandas as pd
# Use the fully-qualified option name: the bare 'precision' pattern is ambiguous
# on pandas >= 1.4 (it also matches 'styler.format.precision') and raises
# OptionError there. 'display.precision' works on old and new pandas alike.
pd.set_option('display.precision', 4)
# Never truncate wide frames when displaying them.
pd.set_option('display.max_columns', None)

import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

In [3]:
xgb.__version__, optuna.__version__

('1.6.0', '2.10.0')

In [4]:
# Single seed reused by NumPy, the hold-out split, the Optuna sampler and XGBoost.
SEED = 2311

# Seed NumPy's global RNG and export the seed for Python's hash randomisation.
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [5]:
# Probe for a GPU by running `nvidia-smi`: any failure (binary missing,
# non-zero exit status, ...) is treated as "no GPU available".
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    GPU = False
else:
    GPU = True

print(f'GPU available: {GPU}')

GPU available: True


# Data overview

**Data source: [Data preprocessing notebook](https://www.kaggle.com/code/stiwar1/tps-may22-data-preprocessing)**

In [6]:
# Pre-processed train/test tables produced by the preprocessing notebook linked above.
train_path = '../input/tps-may22-data-preprocessing/train_processed.csv'
test_path = '../input/tps-may22-data-preprocessing/test_processed.csv'

train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [7]:
train.head()

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_0,f_27_1,f_27_2,f_27_3,f_27_4,f_27_5,f_27_6,f_27_7,f_27_8,f_27_9,f_27_unique,fold
0,0,-1.3732,0.2389,-0.2434,0.5674,-0.6477,0.8393,0.1131,1,5,1,3,3,3,1,6,1,0,7,4,0.2982,-0.9197,3.0585,-2.5407,0.767,-2.7306,-0.2082,1.3634,67.6092,0,0,0,0,1,0,1,3,0,3,1,0,1,3,2
1,1,1.697,-1.7103,-2.2303,-0.5457,1.1132,-1.5522,0.4478,1,3,4,0,2,3,0,1,0,4,6,0,-3.1477,-1.0754,2.1791,2.2783,-0.6337,-1.2171,-3.7822,-0.0583,377.0964,0,0,1,0,2,0,2,2,0,3,2,4,1,5,4
2,2,1.6817,0.6167,-1.0277,0.8105,-0.6091,0.114,-0.7087,1,0,2,6,6,4,3,1,2,2,1,4,2.8207,-3.4853,-0.7842,-1.3858,-0.5206,-0.0091,2.7885,-3.7035,-195.5997,0,2,1,0,0,0,4,0,1,2,10,0,3,6,2
3,3,-0.1182,-0.5878,-0.8046,2.0868,0.371,-0.1288,-0.2826,3,2,1,0,1,6,4,2,3,3,0,3,1.0811,-2.1002,-2.3438,0.5726,-1.6532,1.686,-2.5331,-0.6086,210.8262,0,0,1,1,3,1,1,0,0,2,1,2,1,4,0
4,4,1.1485,-0.1766,-0.6649,-1.1013,0.4679,0.5001,0.4075,3,3,0,4,3,0,6,0,3,3,1,0,-0.1262,0.605,1.1337,-3.9129,-1.4304,2.1276,-3.3068,4.3714,-217.2118,0,1,1,1,3,1,2,1,1,2,7,5,4,6,4


In [8]:
test.head()

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,f_27_0,f_27_1,f_27_2,f_27_3,f_27_4,f_27_5,f_27_6,f_27_7,f_27_8,f_27_9,f_27_unique
0,900000,0.4425,0.1744,-0.9998,0.7627,0.1868,-1.0748,0.5019,6,6,0,0,0,5,1,1,2,2,0,1,-1.1284,2.7493,-1.0064,-1.1939,-2.4357,-2.4274,-1.9669,5.7342,99.4784,0,0,1,0,0,0,1,0,3,11,0,2,5
1,900001,-0.6056,-0.3057,0.6277,-0.5789,-1.7509,1.3556,-0.1909,1,3,4,0,2,1,3,0,0,0,2,0,-4.4241,1.0808,2.3824,0.1494,1.8833,-2.8487,-0.7252,3.1942,-65.9938,1,0,0,5,0,1,1,0,4,6,2,1,6
2,900002,0.304,2.4451,0.2465,0.8182,0.3597,-1.3318,1.3586,3,3,4,1,1,3,2,2,4,1,0,3,-1.5239,-1.4067,-7.0261,1.3123,-5.1572,1.714,0.585,0.0669,-87.4056,0,1,1,1,0,2,0,1,1,10,4,4,5
3,900003,0.1541,0.2601,-1.3671,-0.0932,-1.111,-0.9485,1.1192,0,0,4,2,1,5,0,3,3,1,4,2,-1.4046,3.0111,-0.5945,-3.9395,1.7546,-2.364,-1.0033,3.8931,-281.2935,0,0,0,4,1,4,0,0,2,16,2,2,5
4,900004,-1.6519,-0.4243,-0.6674,-0.3221,-0.0895,0.1817,1.785,2,2,2,0,0,3,0,1,2,0,2,2,-1.9685,0.1006,0.0849,-0.9857,-0.1305,-3.5579,1.2107,1.8619,25.6294,0,2,0,4,1,1,1,1,3,0,1,5,5


In [9]:
# Name of the label column in `train`.
TARGET = 'target'

# Columns that identify a row rather than describe it. 'Unnamed: 0' is the
# row-number column that came along with the CSV (see the head() output above);
# like 'id' it must not be fed to the model as a feature.
NON_FEATURE_COLUMNS = ('id', 'Unnamed: 0')

# Features are taken from `test` so that train-only columns are excluded automatically.
features = [f for f in test.columns if f not in NON_FEATURE_COLUMNS]

# Baseline

In [10]:
# Stratified, shuffled 80/20 hold-out split used by the baseline and by tuning.
holdout_split = train_test_split(
    train[features],
    train[TARGET],
    test_size=0.2,
    shuffle=True,
    stratify=train[TARGET],
    random_state=SEED,
)
xtrain, xval, ytrain, yval = holdout_split

In [11]:
# Pick the GPU histogram tree builder when a GPU was detected, the CPU one otherwise.
if GPU:
    TREE_METHOD = 'gpu_hist'
else:
    TREE_METHOD = 'hist'

# Untuned reference model: 200 trees, everything else left at XGBoost defaults.
baseline_params = dict(
    n_estimators=200,
    tree_method=TREE_METHOD,
    objective='binary:logistic',
    eval_metric='auc',
    random_state=SEED,
)
baseline = xgb.XGBClassifier(**baseline_params)

In [12]:
# Fit the baseline on the training part of the hold-out split, without training logs.
baseline.fit(xtrain, ytrain, verbose=0)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=2311, reg_alpha=0, reg_lambda=1, ...)

In [13]:
# Score the baseline on the hold-out part using the predicted probability of class 1.
baseline_val_proba = baseline.predict_proba(xval[features])[:, 1]
baseline_auc = roc_auc_score(yval, baseline_val_proba)
print(f'Baseline AUC: {baseline_auc:.5f}')

Baseline AUC: 0.98226


# Hyperparameter tuning

In [14]:
# Boosting budget: an upper bound on the number of trees, plus the early-stopping patience.
N_ESTIMATORS = 10000
EARLY_STOPPING_ROUNDS = 200

# Learning task and the metric to monitor.
OBJECTIVE = 'binary:logistic'
EVAL_METRIC = 'auc'

# GPU histogram tree builder when a GPU was detected, CPU histogram otherwise.
TREE_METHOD = 'gpu_hist' if GPU else 'hist'

In [15]:
# XGBClassifier arguments that stay fixed for every tuning trial and for the final CV run.
base_params = dict(
    n_estimators=N_ESTIMATORS,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    tree_method=TREE_METHOD,
    enable_categorical=GPU,  # switched on only when training with gpu_hist
    max_cat_to_onehot=5,
    eval_metric=EVAL_METRIC,
    random_state=SEED,
    verbosity=0,
)

In [16]:
def objective(trial, base_params, data):
    """Optuna objective: fit one XGBoost candidate and return its validation AUC.

    Parameters
    ----------
    trial
        Optuna trial used to sample hyperparameters; it is also handed to the
        pruning callback.
    base_params : dict
        Fixed ``XGBClassifier`` keyword arguments shared by all trials.
    data : tuple
        ``(xtrain, xval, ytrain, yval)``.

    Returns
    -------
    ROC AUC of the predicted positive-class probabilities on ``(xval, yval)``.
    """
    # The suggest_* calls are evaluated top to bottom; keep this order stable,
    # because a seeded sampler yields different values if it changes.
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'max_bin': trial.suggest_int('max_bin', 4, 512),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'gamma': trial.suggest_float('gamma', 0.1, 20.0, step=0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 100),
        'max_delta_step': trial.suggest_float('max_delta_step', 1, 10, step=0.5),
        'subsample': trial.suggest_float('subsample', 0.5, 0.95, step=0.05),
        'colsample_bytree': trial.suggest_float(
            'colsample_bytree', 0.5, 0.95, step=0.05),
        'colsample_bylevel': trial.suggest_float(
            'colsample_bylevel', 0.5, 0.95, step=0.05),
        'colsample_bynode': trial.suggest_float(
            'colsample_bynode', 0.5, 0.95, step=0.05),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1e3, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1e3, log=True),
    }

    # DART-only settings are sampled only when the DART booster was drawn.
    if sampled['booster'] == 'dart':
        sampled.update({
            'sample_type': 'weighted',
            'normalize_type': trial.suggest_categorical(
                'normalize_type', ['tree', 'forest']),
            'rate_drop': trial.suggest_float('rate_drop', 0.1, 0.3),
            'skip_drop': trial.suggest_float('skip_drop', 0.33, 0.67),
        })

    # The callback watches the AUC reported for the single eval_set entry below.
    pruning_callback = XGBoostPruningCallback(trial, 'validation_0-auc')
    candidate = xgb.XGBClassifier(
        **base_params,
        **sampled,
        callbacks=[pruning_callback])

    xtrain, xval, ytrain, yval = data
    candidate.fit(xtrain, ytrain, eval_set=[(xval, yval)], verbose=False)

    val_proba = candidate.predict_proba(xval)[:, 1]
    return roc_auc_score(yval, val_proba)

In [17]:
def tune_hyperparameters(
        base_params,
        data,
        direction='maximize',
        n_trials=5):
    """Run an Optuna study over ``objective`` and report the best trial.

    Parameters
    ----------
    base_params : dict
        Fixed model arguments forwarded to ``objective``.
    data : tuple
        Forwarded unchanged to ``objective``.
    direction : str, default 'maximize'
        Optimisation direction given to ``optuna.create_study``.
    n_trials : int, default 5
        Number of trials to run.

    Returns
    -------
    tuple
        ``(study.best_params, study.best_value)``.
    """
    def _run_trial(trial):
        # Bind the fixed arguments so Optuna only has to supply the trial.
        return objective(trial, base_params, data)

    # Seed the sampler with the notebook-wide SEED.
    sampler = TPESampler(seed=SEED)
    study = optuna.create_study(sampler=sampler, direction=direction)
    study.optimize(_run_trial, n_trials=n_trials, gc_after_trial=True)

    return study.best_params, study.best_value

In [18]:
# Search 100 trials on the hold-out split created for the baseline.
tuning_data = (xtrain, xval, ytrain, yval)
best_params, best_value = tune_hyperparameters(
    base_params=base_params, data=tuning_data, n_trials=100)

[32m[I 2022-05-10 12:44:13,224][0m A new study created in memory with name: no-name-471b8e82-90cb-4b2f-86b1-b2304f503113[0m
[32m[I 2022-05-10 12:44:25,668][0m Trial 0 finished with value: 0.9604821680101833 and parameters: {'max_depth': 6, 'max_bin': 434, 'learning_rate': 0.10469290709525571, 'booster': 'gbtree', 'gamma': 14.200000000000001, 'min_child_weight': 3, 'max_delta_step': 2.0, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.75, 'colsample_bylevel': 0.5, 'colsample_bynode': 0.55, 'reg_alpha': 0.00015649840907716483, 'reg_lambda': 41.70581893268244}. Best is trial 0 with value: 0.9604821680101833.[0m
[32m[I 2022-05-10 12:44:43,795][0m Trial 1 finished with value: 0.9723580575683811 and parameters: {'max_depth': 12, 'max_bin': 295, 'learning_rate': 0.07961566681396946, 'booster': 'gbtree', 'gamma': 12.6, 'min_child_weight': 56, 'max_delta_step': 8.0, 'subsample': 0.7, 'colsample_bytree': 0.6, 'colsample_bylevel': 0.5, 'colsample_bynode': 0.8500000000000001, 'reg_a

In [19]:
# Summarise the winning trial: its validation AUC, then one line per sampled hyperparameter.
print(f'Best AUC: {best_value:.5f}')
print('Best params:')
for name in best_params:
    print(f'\t{name}: {best_params[name]}')

Best AUC: 0.98902
Best params:
	max_depth: 11
	max_bin: 497
	learning_rate: 0.15850193899392595
	booster: gbtree
	gamma: 0.2
	min_child_weight: 49
	max_delta_step: 7.0
	subsample: 0.9
	colsample_bytree: 0.95
	colsample_bylevel: 0.7
	colsample_bynode: 0.9
	reg_alpha: 0.0015901912590753843
	reg_lambda: 21.02972568645695


In [20]:
# Final model configuration: fixed settings overlaid with the tuned hyperparameters.
model_params = {**base_params, **best_params}

# Cross-validation

In [21]:
def evaluate_model(train, test, features, model_params, n_splits=5):
    """Cross-validate an XGBoost classifier on the folds stored in ``train['fold']``.

    For every fold id in ``range(n_splits)`` a fresh
    ``xgb.XGBClassifier(**model_params)`` is fit on the rows of the other
    folds, with the held-out fold passed as ``eval_set``. The per-fold ROC AUC
    and fit/score time are printed, followed by the mean and standard
    deviation of the fold scores and the total time.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``features``, the ``TARGET`` column and a ``fold`` column.
        Its index labels are assumed to be unique (they key the OOF results).
    test : pandas.DataFrame
        Must contain ``features``.
    features : list of str
        Columns passed to the model.
    model_params : dict
        Keyword arguments for ``xgb.XGBClassifier``.
    n_splits : int, default 5
        Number of fold ids to iterate over (``0 .. n_splits - 1``).

    Returns
    -------
    oof_pred : pandas.DataFrame
        Columns ``'index'`` (row label in ``train``) and ``0`` (out-of-fold
        class prediction), ordered by ``'index'``.
    oof_proba : pandas.DataFrame
        Same layout, holding the out-of-fold probability of class 1.
    test_pred : array
        Per-row mode of the fold models' class predictions on ``test``.
    test_proba : numpy.ndarray
        Per-row mean of the fold models' class-1 probabilities on ``test``.
    """
    oof_proba = {}  # train row label -> out-of-fold P(class 1)
    test_proba = []
    oof_pred = {}   # train row label -> out-of-fold class prediction
    test_pred = []
    cv_scores = []

    cv_start = time.time()
    for fold in range(n_splits):
        val_mask = train['fold'] == fold

        # Capture the validation rows' ORIGINAL labels before the index is
        # reset. Taking them after reset_index(drop=True) would give 0..k-1
        # for every fold, so later folds would overwrite earlier ones in the
        # OOF dictionaries.
        val_idx = train.index[val_mask.to_numpy()].to_list()

        xtrain = train[~val_mask].reset_index(drop=True)
        ytrain = xtrain[TARGET]

        xval = train[val_mask].reset_index(drop=True)
        yval = xval[TARGET]

        fold_start = time.time()

        model = xgb.XGBClassifier(**model_params)

        model.fit(
            xtrain[features], ytrain,
            eval_set=[(xval[features], yval)],
            verbose=False)

        val_pred = model.predict(xval[features])
        oof_pred.update(dict(zip(val_idx, val_pred)))
        val_proba = model.predict_proba(xval[features])[:, 1]
        oof_proba.update(dict(zip(val_idx, val_proba)))

        score = roc_auc_score(yval, val_proba)
        cv_scores.append(score)

        fold_end = time.time()

        # Spacing inside the messages is kept exactly as previously printed.
        print(f'Fold #{fold}: AUC = {score:.5f} '
              f'        [Time: {fold_end - fold_start:.2f}s]')

        test_pred.append(model.predict(test[features]))
        test_proba.append(model.predict_proba(test[features])[:, 1])

    cv_end = time.time()

    print(f'Average AUC = {np.mean(cv_scores):.5f} '
          f'    with std. dev. = {np.std(cv_scores):.5f}')
    print(f'[Total time: {cv_end - cv_start:.2f}s]')

    # One row per training row, sorted back into training-row order.
    oof_pred = (pd.DataFrame.from_dict(oof_pred, orient='index')
                .sort_index()
                .reset_index())
    oof_proba = (pd.DataFrame.from_dict(oof_proba, orient='index')
                 .sort_index()
                 .reset_index())

    # Ensemble the fold models: majority vote for classes, mean for probabilities.
    test_pred = mode(np.column_stack(test_pred), axis=1).mode
    test_proba = np.mean(np.column_stack(test_proba), axis=1)

    return oof_pred, oof_proba, test_pred, test_proba

In [22]:
# Cross-validate the tuned configuration and collect OOF and test predictions.
cv_outputs = evaluate_model(train, test, features, model_params)
oof_pred, oof_proba, test_pred, test_proba = cv_outputs

Fold #0: AUC = 0.98905         [Time: 81.06s]
Fold #1: AUC = 0.98941         [Time: 78.89s]
Fold #2: AUC = 0.98919         [Time: 58.82s]
Fold #3: AUC = 0.98915         [Time: 61.97s]
Fold #4: AUC = 0.98888         [Time: 60.34s]
Average AUC = 0.98914     with std. dev. = 0.00017
[Total time: 769.30s]


# Submission

In [23]:
# Submission frame with hard class labels (ensembled test predictions, flattened to 1-D).
xgb_pred = pd.DataFrame(dict(id=test['id'], target=test_pred.ravel()))

In [24]:
xgb_pred.to_csv('xgb_pred.csv', index=False)
!head xgb_pred.csv

id,target
900000,1
900001,1
900002,0
900003,0
900004,1
900005,0
900006,0
900007,1
900008,0


In [25]:
# Submission frame with the fold-averaged probability of the positive class.
xgb_proba = pd.DataFrame(dict(id=test['id'], target=test_proba))

In [26]:
xgb_proba.to_csv('xgb_proba.csv', index=False)
!head xgb_proba.csv

id,target
900000,0.9988767
900001,0.9995184
900002,4.094732e-06
900003,0.022790318
900004,0.9985714
900005,0.082466796
900006,0.29355544
900007,0.9999982
900008,0.19217673


**Time to submit!**