<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/subscriber_prediction_hackathon/notebooks/06_xgboost_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade xgboost
!pip install --upgrade optuna

In [2]:
import os
import gc
import time
import warnings
import subprocess

gc.enable()
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

import xgboost
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

SEED = 2311
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [3]:
#remove cell to run future versions
assert optuna.__version__ == '3.0.4', f'Change in Optuna version. Original notebook version: 3.0.4'
assert xgboost.__version__ == '1.7.2', f'Change in XGBoost version. Original notebook version: 1.7.2'

In [4]:
#Check GPU availability
try:
    subprocess.check_output('nvidia-smi')
    HAVE_GPU = True
except Exception:
    HAVE_GPU = False

print(f'GPU available: {HAVE_GPU}')

GPU available: True


In [5]:
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/subscriber_prediction_hackathon/data'

train = pd.read_csv(f'{DATA_URL}/raw/train.csv')
test = pd.read_csv(f'{DATA_URL}/raw/test.csv')
sample_sub = pd.read_csv(f'{DATA_URL}/raw/submission.csv')

In [6]:
TARGET = 'y_bool'

In [7]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [8]:
features = list(test.columns)
num_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
cat_features = [f for f in features if f not in num_features]

In [9]:
train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')

# Baseline

In [10]:
%%time
scores = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train[features], train[TARGET]
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = XGBClassifier(
        objective='binary:logistic',
        tree_method='gpu_hist' if HAVE_GPU else 'hist',
        enable_categorical=HAVE_GPU,
        eval_metric='logloss',
        early_stopping_rounds=100, 
        seed=SEED
    ) 
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=0
    )
    val_preds = model.predict_proba(X_val)[:, 1]
    score = log_loss(y_val, val_preds)
    scores.append(score)
    print(f'Fold #{fold}: ({model.best_iteration} rounds) Logloss = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg Logloss = {np.mean(scores):.5f} +/- {np.std(scores):.5f}\n')

Fold #0: (5 rounds) Logloss = 0.58282
Fold #1: (8 rounds) Logloss = 0.58453
Fold #2: (6 rounds) Logloss = 0.58306
Fold #3: (8 rounds) Logloss = 0.58492
Fold #4: (8 rounds) Logloss = 0.58129

Avg Logloss = 0.58333 +/- 0.00130

CPU times: user 5.01 s, sys: 724 ms, total: 5.74 s
Wall time: 4.75 s


# Hyperparameter tuning

In [11]:
def objective(trial, data, model):

    scores = []
    X, y = data

    param_grid = {
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 15),
        'gamma': trial.suggest_float('gamma', 0, 15, step=0.1), #complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.05), #L1-reg
        'lambda': trial.suggest_float('lambda', 1e-3, 1e4, log=True), #L2-reg
        'subsample': trial.suggest_float('subsample', 0.55, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.55, 1.0, step=0.05),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.55, 1.0, step=0.05),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.55, 1.0, step=0.05),
        # 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5, step=0.05) 
    }

    if model.get_xgb_params()['booster'] == 'dart':
        param_grid['learning_rate'] = trial.suggest_float('learning_rate', 0.05, 0.25, step=0.05)
        param_grid['max_depth'] = trial.suggest_int('max_depth', 3, 8)
        param_grid['sample_type'] = 'weighted'
        param_grid['normalize_type'] = 'forest'
        param_grid['rate_drop'] = trial.suggest_float('rate_drop', 0.05, 0.2, step=0.05)
    else:
        param_grid['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
        param_grid['max_depth'] = trial.suggest_int('max_depth', 3, 12)


    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.loc[val_idx], y.iloc[val_idx]
        
        model.set_params(**param_grid)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        best_iter = model.best_iteration
        val_preds = model.predict_proba(X_val, iteration_range=(0, best_iter))[:, 1]
        scores.append(log_loss(y_val, val_preds))

    return np.mean(scores)

In [12]:
def tune_params(data, model, n_trials, direction):
    study = optuna.create_study(
        sampler=TPESampler(
            consider_endpoints=True,
            multivariate=True,
            group=True,
            seed=SEED),
        pruner=HyperbandPruner(),
        direction=direction
    )
    study.optimize(
        func=lambda trial: objective(trial, data, model),
        n_trials=n_trials,
        gc_after_trial=True
    )
    return study

# Cross-validation

In [13]:
def cross_validate_predict(data, model, n_splits=5):
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores = [] #scores on validation set

    X, y, X_test = data
       
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.loc[val_idx], y.iloc[val_idx]
        
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        best_iter = model.best_iteration
        val_preds = model.predict_proba(X_val, iteration_range=(0, best_iter))[:, 1]
        test_preds[f'fold{fold}'] = model.predict_proba(X_test, iteration_range=(0, best_iter))[:, 1]
        oof_preds.update(dict(zip(val_idx, val_preds)))

        score = log_loss(y_val, val_preds)
        scores.append(score)
        print(f'Fold #{fold}: ({best_iter:>3} rounds) Logloss = {score:.5f}')
        _ = gc.collect()
    print(f'Avg. Logloss = {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    
    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)

    return oof_preds, test_preds

In [14]:
def run_experiment(data, booster='gbtree', n_trials=10):
    
    X, y, X_test = data

    base_params = {
        'objective': 'binary:logistic',
        'n_estimators': 10000,
        'booster': booster,
        'eval_metric': 'logloss',
        'early_stopping_rounds': 50,
        'tree_method': 'gpu_hist' if HAVE_GPU else 'hist',
        'predictor': 'gpu_predictor' if HAVE_GPU else 'cpu_predictor',
        'enable_categorical': HAVE_GPU,
        'max_cat_to_onehot': 2,
        'verbosity': 0,
        'seed': SEED
    }
    
    model = XGBClassifier(**base_params)
    
    print(f'----------Hyperparameter tuning----------')
    start = time.time()
    study = tune_params(
        data=(X, y),
        model=model,
        n_trials=n_trials, 
        direction='minimize' #metric: logloss -> lower is better
    )
    end = time.time()
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:15} - {v}')
    print(f'[Time taken: {end - start:.2f}s]\n')
    
    print(f'-----Cross-validation and prediction-----')
    start = time.time()
    model.set_params(**study.best_params)
    oof_preds, test_preds = cross_validate_predict(data, model)
    end = time.time()
    print(f'[Time taken: {end - start:.2f}s]\n')

    return oof_preds, test_preds

**Trial run**

In [15]:
optuna.logging.set_verbosity(optuna.logging.INFO)

In [16]:
%%time
op, tp = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    booster='gbtree',
    n_trials=3
)

[32m[I 2022-12-12 15:32:15,776][0m A new study created in memory with name: no-name-204b2c81-e212-4ef3-8aa7-eebfe6395c9a[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-12 15:32:20,174][0m Trial 0 finished with value: 0.5809357155533065 and parameters: {'min_child_weight': 6, 'gamma': 12.700000000000001, 'alpha': 1.1, 'lambda': 0.011126643536944284, 'subsample': 0.55, 'colsample_bytree': 0.9000000000000001, 'colsample_bylevel': 0.55, 'colsample_bynode': 0.6000000000000001, 'learning_rate': 0.08, 'max_depth': 8}. Best is trial 0 with value: 0.5809357155533065.[0m
[32m[I 2022-12-12 15:32:30,745][0m Trial 1 finished with value: 0.5810115020693767 and parameters: {'min_child_weight': 2, 'gamma': 1.7000000000000002, 'alpha': 0.75, 'lambda': 620.4004036675543, 'subsample': 1.0, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.6000000000000001, 'colsample_bynode': 0.8500000000000001, 'learning_rate': 0.04, 'max_depth': 9}. Best is trial 0 with value: 0.5809357155533065.[0m
[32m[I 2022-12-12 15:32:34,880][0m Trial 2 finished with value: 0.5808674393934863 and parameters: {'min_child_weight': 9, 'gamma': 11.600000000000001, 'alpha': 2.45,

Best trial: 2 -> Best value: 0.58087
Best hyperparameters:
min_child_weight - 9
gamma           - 11.600000000000001
alpha           - 2.45
lambda          - 0.0521731326394548
subsample       - 0.55
colsample_bytree - 0.9000000000000001
colsample_bylevel - 0.6000000000000001
colsample_bynode - 0.75
learning_rate   - 0.06999999999999999
max_depth       - 6
[Time taken: 19.17s]

-----Cross-validation and prediction-----
Fold #0: (308 rounds) Logloss = 0.58105
Fold #1: (177 rounds) Logloss = 0.58075
Fold #2: (292 rounds) Logloss = 0.57964
Fold #3: ( 62 rounds) Logloss = 0.58140
Fold #4: (207 rounds) Logloss = 0.58151
Avg. Logloss = 0.58087 +/- 0.00067
[Time taken: 4.45s]

CPU times: user 24.3 s, sys: 336 ms, total: 24.7 s
Wall time: 23.6 s


In [17]:
%%time
op, tp = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    booster='dart',
    n_trials=3
)

[32m[I 2022-12-12 15:32:39,417][0m A new study created in memory with name: no-name-b2abc1a8-81d1-4b29-b5ef-5c446f1a634a[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-12 15:33:07,192][0m Trial 0 finished with value: 0.5906565719559078 and parameters: {'min_child_weight': 6, 'gamma': 12.700000000000001, 'alpha': 1.1, 'lambda': 0.011126643536944284, 'subsample': 0.55, 'colsample_bytree': 0.9000000000000001, 'colsample_bylevel': 0.55, 'colsample_bynode': 0.6000000000000001, 'learning_rate': 0.2, 'max_depth': 6, 'rate_drop': 0.05}. Best is trial 0 with value: 0.5906565719559078.[0m
[32m[I 2022-12-12 15:33:22,764][0m Trial 1 finished with value: 0.639549139272599 and parameters: {'min_child_weight': 3, 'gamma': 2.2, 'alpha': 4.15, 'lambda': 8418.152379595756, 'subsample': 0.8, 'colsample_bytree': 0.6000000000000001, 'colsample_bylevel': 0.8500000000000001, 'colsample_bynode': 0.7000000000000001, 'learning_rate': 0.2, 'max_depth': 6, 'rate_drop': 0.2}. Best is trial 0 with value: 0.5906565719559078.[0m
[32m[I 2022-12-12 15:33:44,437][0m Trial 2 finished with value: 0.6034013161219302 and parameters: {'min_child_weight': 8, 'gamma': 

Best trial: 0 -> Best value: 0.59066
Best hyperparameters:
min_child_weight - 6
gamma           - 12.700000000000001
alpha           - 1.1
lambda          - 0.011126643536944284
subsample       - 0.55
colsample_bytree - 0.9000000000000001
colsample_bylevel - 0.55
colsample_bynode - 0.6000000000000001
learning_rate   - 0.2
max_depth       - 6
rate_drop       - 0.05
[Time taken: 65.09s]

-----Cross-validation and prediction-----
Fold #0: ( 51 rounds) Logloss = 0.59071
Fold #1: (111 rounds) Logloss = 0.59092
Fold #2: ( 77 rounds) Logloss = 0.59000
Fold #3: (111 rounds) Logloss = 0.59070
Fold #4: (124 rounds) Logloss = 0.59096
Avg. Logloss = 0.59066 +/- 0.00034
[Time taken: 28.79s]

CPU times: user 1min 37s, sys: 324 ms, total: 1min 37s
Wall time: 1min 33s


# gbtree booster

In [18]:
%%time
op_gbtree, tp_gbtree = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    booster='gbtree',
    n_trials=200
)

[32m[I 2022-12-12 15:34:13,313][0m A new study created in memory with name: no-name-7a9dcd0a-02d1-484a-bc30-969c88008eac[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-12 15:34:17,305][0m Trial 0 finished with value: 0.5809357155533065 and parameters: {'min_child_weight': 6, 'gamma': 12.700000000000001, 'alpha': 1.1, 'lambda': 0.011126643536944284, 'subsample': 0.55, 'colsample_bytree': 0.9000000000000001, 'colsample_bylevel': 0.55, 'colsample_bynode': 0.6000000000000001, 'learning_rate': 0.08, 'max_depth': 8}. Best is trial 0 with value: 0.5809357155533065.[0m
[32m[I 2022-12-12 15:34:22,150][0m Trial 1 finished with value: 0.5814723002868039 and parameters: {'min_child_weight': 2, 'gamma': 1.7000000000000002, 'alpha': 0.75, 'lambda': 620.4004036675543, 'subsample': 1.0, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.6000000000000001, 'colsample_bynode': 0.8500000000000001, 'learning_rate': 0.04, 'max_depth': 9}. Best is trial 0 with value: 0.5809357155533065.[0m
[32m[I 2022-12-12 15:34:25,990][0m Trial 2 finished with value: 0.5808674393934863 and parameters: {'min_child_weight': 9, 'gamma': 11.600000000000001, 'alpha': 2.45,

Best trial: 188 -> Best value: 0.58004
Best hyperparameters:
min_child_weight - 10
gamma           - 3.3000000000000003
alpha           - 0.65
lambda          - 0.005844037592155338
subsample       - 0.9500000000000001
colsample_bytree - 1.0
colsample_bylevel - 1.0
colsample_bynode - 1.0
learning_rate   - 0.09999999999999999
max_depth       - 3
[Time taken: 940.09s]

-----Cross-validation and prediction-----
Fold #0: ( 39 rounds) Logloss = 0.57983
Fold #1: ( 29 rounds) Logloss = 0.58103
Fold #2: (342 rounds) Logloss = 0.57837
Fold #3: ( 49 rounds) Logloss = 0.58146
Fold #4: ( 80 rounds) Logloss = 0.57949
Avg. Logloss = 0.58004 +/- 0.00111
[Time taken: 3.67s]

CPU times: user 19min 5s, sys: 13.8 s, total: 19min 19s
Wall time: 15min 43s


# dart booster

In [19]:
%%time
op_dart, tp_dart = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    booster='dart',
    n_trials=50
)

[32m[I 2022-12-12 15:49:57,083][0m A new study created in memory with name: no-name-bc4ebd8c-dbe4-44f3-a152-14862f863570[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-12 15:50:24,547][0m Trial 0 finished with value: 0.5906565719559078 and parameters: {'min_child_weight': 6, 'gamma': 12.700000000000001, 'alpha': 1.1, 'lambda': 0.011126643536944284, 'subsample': 0.55, 'colsample_bytree': 0.9000000000000001, 'colsample_bylevel': 0.55, 'colsample_bynode': 0.6000000000000001, 'learning_rate': 0.2, 'max_depth': 6, 'rate_drop': 0.05}. Best is trial 0 with value: 0.5906565719559078.[0m
[32m[I 2022-12-12 15:50:40,467][0m Trial 1 finished with value: 0.6389890683889389 and parameters: {'min_child_weight': 3, 'gamma': 2.2, 'alpha': 4.15, 'lambda': 8418.152379595756, 'subsample': 0.8, 'colsample_bytree': 0.6000000000000001, 'colsample_bylevel': 0.8500000000000001, 'colsample_bynode': 0.7000000000000001, 'learning_rate': 0.2, 'max_depth': 6, 'rate_drop': 0.2}. Best is trial 0 with value: 0.5906565719559078.[0m
[32m[I 2022-12-12 15:50:55,948][0m Trial 2 finished with value: 0.6073671107803073 and parameters: {'min_child_weight': 8, 'gamma':

Best trial: 33 -> Best value: 0.58152
Best hyperparameters:
min_child_weight - 2
gamma           - 5.7
alpha           - 1.85
lambda          - 0.8441960876832094
subsample       - 0.9000000000000001
colsample_bytree - 0.75
colsample_bylevel - 0.8
colsample_bynode - 0.8
learning_rate   - 0.05
max_depth       - 8
rate_drop       - 0.05
[Time taken: 1637.94s]

-----Cross-validation and prediction-----
Fold #0: (126 rounds) Logloss = 0.58128
Fold #1: (168 rounds) Logloss = 0.58177
Fold #2: (146 rounds) Logloss = 0.58061
Fold #3: ( 98 rounds) Logloss = 0.58262
Fold #4: (281 rounds) Logloss = 0.58199
Avg. Logloss = 0.58165 +/- 0.00068
[Time taken: 63.69s]

CPU times: user 29min 16s, sys: 4.44 s, total: 29min 20s
Wall time: 28min 21s


# Submission files

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
NOTEBOOK = '06'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/machinehack/subscriber_prediction_hackathon/submissions/nb_{NOTEBOOK}'
if not os.path.isdir(SUBMISSION_PATH):
    os.makedirs(SUBMISSION_PATH)

In [22]:
def create_submission_files(test_preds, model_type, path=SUBMISSION_PATH):
    for col in (test_preds.columns):
        sub = sample_sub.copy()
        sub[TARGET] = test_preds[col]
        sub.to_csv(f'{path}/{model_type}_{col}.csv', index=False)

In [23]:
create_submission_files(tp_gbtree, 'gbtree')
create_submission_files(tp_dart, 'dart')