# Setup

In [1]:
import gc
import time
import warnings
import subprocess

# Session-wide switches: garbage collection is explicitly enabled and *every*
# warning is silenced from here on (deprecation notices included).
gc.enable()
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

import xgboost as xgb
import optuna

# One seed shared by the CV splitters, the XGBoost models and the Optuna sampler.
SEED = 55

In [2]:
# Stop early when the installed XGBoost is not the release the notebook was written against.
EXPECTED_XGB_VERSION = '2.0.2'
assert xgb.__version__ == EXPECTED_XGB_VERSION, 'XGBoost version differs from original notebook.'

In [3]:
# Check GPU availability: a successful `nvidia-smi` call is taken as the signal
# that training can run on CUDA; any failure falls back to the CPU.
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    DEVICE = 'cpu'
else:
    DEVICE = 'cuda'

print(f'Available device: {DEVICE}')

Available device: cpu


# Data preparation

In [4]:
# Competition files (train, test, sample submission) all live in one directory.
DATA_DIR = '/kaggle/input/playground-series-s4e1'
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# Additional external dataset, referred to as "original" throughout the notebook.
original = pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')

In [5]:
# Remove the row-id / customer-id / surname columns from all three frames.
train = train.drop(columns=['id', 'CustomerId', 'Surname'])
test = test.drop(columns=['id', 'CustomerId', 'Surname'])
original = original.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [6]:
TARGET = 'Exited'
features = test.columns.tolist()  # every column left in the test frame is a model input

In [7]:
cat_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

# Give the categorical columns the pandas 'category' dtype in every frame.
for frame in (train, test, original):
    frame[cat_features] = frame[cat_features].astype('category')

In [8]:
# Two passes over the competition data:
#   1. repeating rows      -> exact duplicates are collapsed to their first occurrence;
#   2. contradictory rows  -> every row whose feature values still occur more than once
#                             is removed entirely (keep=False).
train = (
    train
    .drop_duplicates(keep='first', ignore_index=True)
    .drop_duplicates(subset=features, keep=False, ignore_index=True)
)

# The external dataset only gets the exact-duplicate pass.
original = original.drop_duplicates(keep='first', ignore_index=True)

# Baseline

In [9]:
def comp_metric(y_true, y_pred):
    """Competition metric: ROC-AUC of `y_pred` scores against `y_true` labels.

    Both arguments are forwarded unchanged to `roc_auc_score`.
    """
    auc = roc_auc_score(y_true, y_pred)
    return auc

In [10]:
X, y = train[features], train[TARGET]
oof_preds = {}

# Untuned baseline configuration; a fresh classifier is built from it for every fold.
baseline_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    early_stopping_rounds=100,
    objective='binary:logistic',
    eval_metric='auc',
    booster='gbtree',
    tree_method='hist',
    device=DEVICE,
    enable_categorical=True,
    verbosity=0,
    n_jobs=-1,
    random_state=SEED,
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for fold, (train_ids, val_ids) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_ids], X.iloc[val_ids]
    y_train, y_val = y.iloc[train_ids], y.iloc[val_ids]

    model = xgb.XGBClassifier(**baseline_params)
    # The validation fold doubles as the early-stopping evaluation set.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Positive-class probability for the held-out rows, stored under their row positions.
    val_preds = model.predict_proba(X_val)[:, 1]
    oof_preds.update(zip(val_ids, val_preds))

    score = comp_metric(y_val, val_preds)
    print(f'Fold #{fold}: {score:.5f} ({model.best_iteration} rounds)')
    _ = gc.collect()

# Re-order the out-of-fold predictions by row position so they line up with `y`.
oof_preds = pd.Series(oof_preds).sort_index()
print(f'\nOOF score: {comp_metric(y, oof_preds):.4f}\n')

Fold #0: 0.89082 (96 rounds)
Fold #1: 0.89075 (104 rounds)
Fold #2: 0.89020 (109 rounds)
Fold #3: 0.88782 (114 rounds)
Fold #4: 0.88751 (74 rounds)

OOF score: 0.8894



# Hyperparameter tuning

In [11]:
def objective(trial, features, model, extend, folds, seed):
    """Optuna objective: out-of-fold ROC-AUC of `model` for one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.
    features : list of str
        Feature columns fed to the model; also the key used to detect
        contradictory rows when `extend` is True.
    model : estimator
        Classifier exposing `set_params`, `fit(..., eval_set=...)` and
        `predict_proba`. It is mutated in place: the sampled parameters are
        set on it and it is refitted once per fold.
    extend : bool
        When True, the module-level `original` frame is appended to the
        training part of every fold (never to the validation part).
    folds : int
        Number of stratified CV folds.
    seed : int
        Random state of the fold splitter.

    Returns
    -------
    float
        `comp_metric` of the out-of-fold predictions over all rows of the
        module-level `train` frame.
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 16),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1), #complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.1), #L1-reg
        'lambda': trial.suggest_float('lambda', 5e-3, 5e3, log=True), #L2-reg
#         'max_delta_step': trial.suggest_float('max_delta_step', 0, 10, step=0.5),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5, step=0.1)
    }
    # The sampled configuration is the same for every fold, so set it once.
    model.set_params(**params)

    oof_preds = {}
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for train_ids, val_ids in cv.split(train, train[TARGET]):
        fold_train = train.iloc[train_ids]
        fold_val = train.iloc[val_ids]
        if extend:
            # Append the original dataset, then clean up in two passes:
            #   1. repeating rows: same features AND same target -> keep one copy.
            #      Keyed on features + target (not on every column) so that rows
            #      differing only in columns the model does not use are not
            #      mistaken for contradictions in pass 2.
            #   2. contradictory rows: same features, different target -> drop all.
            fold_train = (
                pd.concat([fold_train, original], axis=0)
                .drop_duplicates(subset=[*features, TARGET], keep='first')
                .drop_duplicates(subset=features, keep=False, ignore_index=True)
            )

        # Plain column selection; the fold frames themselves are left untouched.
        X_train, y_train = fold_train[features], fold_train[TARGET]
        X_val, y_val = fold_val[features], fold_val[TARGET]

        # The validation fold is also the evaluation set passed to `fit`.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)

        # Positive-class probabilities, stored under the rows' positions in `train`.
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds.update(zip(val_ids, val_preds))

    # Sort by row position so predictions line up with train[TARGET].
    oof_preds = pd.Series(oof_preds).sort_index()
    return comp_metric(train[TARGET], oof_preds)

In [12]:
def tune_params(features, model, extend, folds, seed, n_trials, direction):
    """Run an Optuna study over `objective` and return the finished study.

    `features`, `model`, `extend`, `folds` and `seed` are passed straight
    through to `objective`; `seed` additionally seeds the TPE sampler.
    `n_trials` is the number of trials to run and `direction` the
    optimisation direction handed to `optuna.create_study`.
    """
    tpe_sampler = optuna.samplers.TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=seed)
    # NOTE(review): `objective` never calls trial.report()/trial.should_prune(),
    # so this pruner presumably never gets a chance to stop a trial - confirm intent.
    hyperband = optuna.pruners.HyperbandPruner()

    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=hyperband,
        direction=direction)

    def _evaluate(trial):
        # Bind the experiment settings; Optuna only supplies the trial.
        return objective(trial, features, model, extend, folds, seed)

    study.optimize(_evaluate, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation framework

In [13]:
def custom_cv(features, model, extend, folds, seed, verbose=True):
    """Cross-validate `model` on `train` and predict `test` with every fold model.

    Parameters
    ----------
    features : list of str
        Feature columns fed to the model; also the key used to detect
        contradictory rows when `extend` is True.
    model : estimator
        Already-configured classifier exposing `fit(..., eval_set=...)`,
        `predict_proba` and `best_iteration`. It is refitted once per fold.
    extend : bool
        When True, the module-level `original` frame is appended to the
        training part of every fold (never to the validation part).
    folds : int
        Number of stratified CV folds.
    seed : int
        Random state of the fold splitter.
    verbose : bool, default True
        Print the per-fold score. The OOF / average summary is always printed.

    Returns
    -------
    oof_preds : pandas.Series
        Out-of-fold positive-class probabilities, indexed by row position in `train`.
    test_preds : pandas.DataFrame
        One column per fold ('fold0', 'fold1', ...) plus 'mean', their row-wise average.
    """
    oof_preds = {}
    test_preds = {}
    scores = []

    X_test = test[features]

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train, train[TARGET])):
        fold_train = train.iloc[train_ids]
        fold_val = train.iloc[val_ids]
        if extend:
            # Append the original dataset, then clean up in two passes:
            #   1. repeating rows: same features AND same target -> keep one copy.
            #      Keyed on features + target (not on every column) so that rows
            #      differing only in columns the model does not use are not
            #      mistaken for contradictions in pass 2.
            #   2. contradictory rows: same features, different target -> drop all.
            fold_train = (
                pd.concat([fold_train, original], axis=0)
                .drop_duplicates(subset=[*features, TARGET], keep='first')
                .drop_duplicates(subset=features, keep=False, ignore_index=True)
            )

        # Plain column selection; the fold frames themselves are left untouched.
        X_train, y_train = fold_train[features], fold_train[TARGET]
        X_val, y_val = fold_val[features], fold_val[TARGET]

        # The validation fold is also the evaluation set passed to `fit`.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)

        # Positive-class probabilities for the held-out rows and for the test set.
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds.update(zip(val_ids, val_preds))
        test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold}: {score:.5f} ({model.best_iteration} rounds)')

        gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1) # mean of fold-wise predictions

    # Sort by row position so predictions line up with train[TARGET].
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nOOF score: {comp_metric(train[TARGET], oof_preds):.5f}')
    print(f'Avg. score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')

    return oof_preds, test_preds

In [14]:
def run_experiment(features, extend=False, folds=5, seed=SEED, n_trials=50):
    """Tune an XGBoost classifier, then cross-validate and predict with the best setup.

    Stage 1 runs `tune_params` for `n_trials` trials (maximising the metric);
    stage 2 sets the best hyperparameters on the same model object and calls
    `custom_cv`. Progress, best parameters and wall-clock time of both stages
    are printed.

    Returns the `(oof_preds, test_preds)` pair produced by `custom_cv`.
    """
    model = xgb.XGBClassifier(
        booster='gbtree',
        tree_method='hist',
        objective='binary:logistic',
        eval_metric='auc',
        n_estimators=10000,
        early_stopping_rounds=100,
        device=DEVICE,
        enable_categorical=True,
        max_cat_to_onehot=3, # for Geography column
        verbosity=0,
        n_jobs=-1,
        seed=seed)

    # ---- stage 1: hyperparameter search ----
    print('----------Hyperparameter tuning----------')
    tuning_started = time.time()

    # metric: AUC -> higher is better
    study = tune_params(features, model, extend, folds, seed, n_trials, direction='maximize')

    tuning_seconds = time.time() - tuning_started

    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for param_name, param_value in study.best_params.items():
        print(f'{param_name:20} - {param_value}')
    print(f'\n[Time taken: {tuning_seconds:.2f}s]\n')

    # ---- stage 2: CV + test prediction with the winning configuration ----
    print('-----Cross-validation and prediction-----')
    cv_started = time.time()

    model.set_params(**study.best_params)
    oof_preds, test_preds = custom_cv(features, model, extend, folds, seed)

    cv_seconds = time.time() - cv_started
    print(f'\n[Time taken: {cv_seconds:.2f}s]\n')

    return oof_preds, test_preds

In [15]:
def create_submission_files(test_preds, config, notebook='01'):
    """Write the fold-averaged test predictions to '{notebook}_{config}.csv'.

    Parameters
    ----------
    test_preds : mapping / DataFrame
        Must provide a 'mean' entry holding one prediction per test row,
        in the same row order as the module-level `sample_sub` frame.
    config : str
        Experiment name used in the output file name.
    notebook : str, default '01'
        Prefix of the output file name.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in `sample_sub`.
    """
    mean_preds = np.asarray(test_preds['mean'])
    if len(mean_preds) != len(sample_sub):
        raise ValueError(
            f'Expected {len(sample_sub)} predictions, got {len(mean_preds)}.')

    sub = sample_sub.copy()
    # Assign by position: assigning a Series would align on index labels and
    # silently produce NaN wherever the two indexes disagree.
    sub[TARGET] = mean_preds
    sub.to_csv(f'{notebook}_{config}.csv', index=False)

**Trial run:**

In [16]:
# Log at INFO level so every trial of the short trial run below is reported.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [17]:
# Smoke test of the full pipeline with only 3 tuning trials; both results are discarded.
_ , _ = run_experiment(features=features, n_trials=3)

[I 2024-01-05 08:21:52,135] A new study created in memory with name: no-name-bc3b82ec-1114-4b4c-8f5a-2a410886f8c1


----------Hyperparameter tuning----------


[I 2024-01-05 08:24:19,410] Trial 0 finished with value: 0.8898194420311876 and parameters: {'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 9, 'subsample': 0.7, 'colsample_bytree': 0.8, 'gamma': 5.7, 'alpha': 4.3, 'lambda': 0.008822078646927935, 'scale_pos_weight': 1.4}. Best is trial 0 with value: 0.8898194420311876.
[I 2024-01-05 08:25:08,739] Trial 1 finished with value: 0.8866572567037293 and parameters: {'learning_rate': 0.08, 'max_depth': 3, 'min_child_weight': 13, 'subsample': 0.6, 'colsample_bytree': 0.85, 'gamma': 16.400000000000002, 'alpha': 4.5, 'lambda': 4098.637364141357, 'scale_pos_weight': 3.0}. Best is trial 0 with value: 0.8898194420311876.
[I 2024-01-05 08:25:56,274] Trial 2 finished with value: 0.8896588209577386 and parameters: {'learning_rate': 0.04, 'max_depth': 11, 'min_child_weight': 7, 'subsample': 0.85, 'colsample_bytree': 0.6, 'gamma': 16.2, 'alpha': 2.1, 'lambda': 11.226623011877013, 'scale_pos_weight': 4.1}. Best is trial 0 with value: 0.889819

Best trial: 0 -> Best value: 0.88982
Best hyperparameters:
learning_rate        - 0.01
max_depth            - 12
min_child_weight     - 9
subsample            - 0.7
colsample_bytree     - 0.8
gamma                - 5.7
alpha                - 4.3
lambda               - 0.008822078646927935
scale_pos_weight     - 1.4

[Time taken: 244.25s]

-----Cross-validation and prediction-----
Fold #0: 0.89147 (2635 rounds)
Fold #1: 0.89098 (2278 rounds)
Fold #2: 0.89050 (2936 rounds)
Fold #3: 0.88821 (1443 rounds)
Fold #4: 0.88811 (1835 rounds)

OOF score: 0.88982
Avg. score: 0.88985 +/- 0.00142

[Time taken: 159.75s]



In [18]:
# Back to errors only, so later tuning runs do not log each trial.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Experiments

In [19]:
# Result stores, keyed by experiment config name.
op = dict()  # Train-set OOF predictions
tp = dict()  # Test-set predictions

In [20]:
# Experiment: tuned XGBoost on the competition train set only, 5-fold CV.
model_name, dataset = 'xgbv1', 'trn'
folds, seed = 5, SEED
config = '_'.join((model_name, dataset, f'f{folds}', f's{seed}'))

op[config], tp[config] = run_experiment(features, extend=False, folds=folds, seed=seed)

create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 49 -> Best value: 0.89024
Best hyperparameters:
learning_rate        - 0.01
max_depth            - 4
min_child_weight     - 11
subsample            - 0.65
colsample_bytree     - 0.95
gamma                - 4.0
alpha                - 1.3
lambda               - 0.10912981345000111
scale_pos_weight     - 1.7000000000000002

[Time taken: 3598.92s]

-----Cross-validation and prediction-----
Fold #0: 0.89186 (2900 rounds)
Fold #1: 0.89138 (1896 rounds)
Fold #2: 0.89110 (2373 rounds)
Fold #3: 0.88845 (1669 rounds)
Fold #4: 0.88854 (2305 rounds)

OOF score: 0.89024
Avg. score: 0.89027 +/- 0.00147

[Time taken: 165.60s]



In [21]:
# Experiment: tuned XGBoost with the original dataset added to each training fold, 5-fold CV.
model_name, dataset = 'xgbv1', 'ext'
folds, seed = 5, SEED
config = '_'.join((model_name, dataset, f'f{folds}', f's{seed}'))

op[config], tp[config] = run_experiment(features, extend=True, folds=folds, seed=seed)

create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 48 -> Best value: 0.89034
Best hyperparameters:
learning_rate        - 0.09999999999999999
max_depth            - 3
min_child_weight     - 5
subsample            - 0.85
colsample_bytree     - 0.65
gamma                - 0.8
alpha                - 3.8000000000000003
lambda               - 0.6534904967130574
scale_pos_weight     - 2.4000000000000004

[Time taken: 3146.50s]

-----Cross-validation and prediction-----
Fold #0: 0.89196 (515 rounds)
Fold #1: 0.89086 (595 rounds)
Fold #2: 0.89123 (584 rounds)
Fold #3: 0.88875 (410 rounds)
Fold #4: 0.88901 (744 rounds)

OOF score: 0.89034
Avg. score: 0.89036 +/- 0.00126

[Time taken: 47.35s]



In [22]:
# Experiment: tuned XGBoost with the original dataset added to each training fold, 7-fold CV.
model_name, dataset = 'xgbv1', 'ext'
folds, seed = 7, SEED
config = '_'.join((model_name, dataset, f'f{folds}', f's{seed}'))

op[config], tp[config] = run_experiment(features, extend=True, folds=folds, seed=seed)

create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 33 -> Best value: 0.89036
Best hyperparameters:
learning_rate        - 0.060000000000000005
max_depth            - 5
min_child_weight     - 14
subsample            - 0.65
colsample_bytree     - 0.6
gamma                - 0.1
alpha                - 4.2
lambda               - 0.43283394363421634
scale_pos_weight     - 3.6

[Time taken: 3971.43s]

-----Cross-validation and prediction-----
Fold #0: 0.88991 (300 rounds)
Fold #1: 0.89554 (445 rounds)
Fold #2: 0.88914 (231 rounds)
Fold #3: 0.89058 (348 rounds)
Fold #4: 0.89039 (295 rounds)
Fold #5: 0.88422 (253 rounds)
Fold #6: 0.89289 (404 rounds)

OOF score: 0.89036
Avg. score: 0.89038 +/- 0.00322

[Time taken: 53.91s]



In [23]:
# Experiment: tuned XGBoost with the original dataset added to each training fold, 10-fold CV.
model_name, dataset = 'xgbv1', 'ext'
folds, seed = 10, SEED
config = '_'.join((model_name, dataset, f'f{folds}', f's{seed}'))

op[config], tp[config] = run_experiment(features, extend=True, folds=folds, seed=seed)

create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 38 -> Best value: 0.89043
Best hyperparameters:
learning_rate        - 0.06999999999999999
max_depth            - 5
min_child_weight     - 10
subsample            - 0.7
colsample_bytree     - 0.6
gamma                - 2.4000000000000004
alpha                - 4.1000000000000005
lambda               - 0.16617122642544158
scale_pos_weight     - 2.5

[Time taken: 4982.90s]

-----Cross-validation and prediction-----
Fold #0: 0.89308 (388 rounds)
Fold #1: 0.89075 (655 rounds)
Fold #2: 0.89319 (346 rounds)
Fold #3: 0.88956 (192 rounds)
Fold #4: 0.89163 (301 rounds)
Fold #5: 0.89080 (296 rounds)
Fold #6: 0.89004 (263 rounds)
Fold #7: 0.88774 (194 rounds)
Fold #8: 0.88590 (508 rounds)
Fold #9: 0.89200 (418 rounds)

OOF score: 0.89043
Avg. score: 0.89047 +/- 0.00218

[Time taken: 70.89s]



In [24]:
# Experiment: tuned XGBoost with the original dataset added to each training fold, 15-fold CV.
model_name, dataset = 'xgbv1', 'ext'
folds, seed = 15, SEED
config = '_'.join((model_name, dataset, f'f{folds}', f's{seed}'))

op[config], tp[config] = run_experiment(features, extend=True, folds=folds, seed=seed)

create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 49 -> Best value: 0.89037
Best hyperparameters:
learning_rate        - 0.09999999999999999
max_depth            - 9
min_child_weight     - 10
subsample            - 0.8
colsample_bytree     - 0.6
gamma                - 2.8000000000000003
alpha                - 2.4000000000000004
lambda               - 126.87181296328886
scale_pos_weight     - 2.7

[Time taken: 8104.80s]

-----Cross-validation and prediction-----
Fold #0: 0.89152 (111 rounds)
Fold #1: 0.89072 (73 rounds)
Fold #2: 0.89307 (754 rounds)
Fold #3: 0.89339 (244 rounds)
Fold #4: 0.88736 (102 rounds)
Fold #5: 0.89286 (177 rounds)
Fold #6: 0.89643 (227 rounds)
Fold #7: 0.88081 (184 rounds)
Fold #8: 0.89623 (176 rounds)
Fold #9: 0.89231 (227 rounds)
Fold #10: 0.88714 (131 rounds)
Fold #11: 0.88814 (203 rounds)
Fold #12: 0.88108 (138 rounds)
Fold #13: 0.89520 (165 rounds)
Fold #14: 0.88959 (266 rounds)

OOF score: 0.89037
Avg. score: 0.89039 +/- 0.00465

[Time taken: 74.14s]

