<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/analytics_olympiad22/notebooks/03_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [16]:
%%capture
!pip install --upgrade xgboost
!pip install --upgrade optuna

In [17]:
import os
import gc
import time
import warnings
import subprocess

# Make sure automatic garbage collection is switched on.
gc.enable()
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import xgboost
import optuna

from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

# One seed shared by numpy, the CV splitters, the Optuna sampler and XGBoost.
SEED = 23
# NOTE(review): set after the interpreter has started, so this presumably only
# affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [18]:
#remove cell to run future versions
# Fail fast when the installed libraries differ from the ones the notebook was written with.
for _lib, _name, _expected in ((optuna, 'Optuna', '3.0.3'), (xgboost, 'XGBoost', '1.6.2')):
    assert _lib.__version__ == _expected, (
        f'Change in {_name} version. Original notebook version: {_expected}'
    )

In [19]:
#Check GPU availability
# `nvidia-smi` runs successfully only when the tool is installed and exits with
# status 0; any failure (missing binary, non-zero exit) means no usable GPU.
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    HAVE_GPU = False
else:
    HAVE_GPU = True

print(f'GPU available: {HAVE_GPU}')

GPU available: True


# Data preparation

In [20]:
# Base URL of the competition's data folder (raw GitHub file access).
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/analytics_olympiad22/data'

# Two processed training files and the processed test file; both training
# frames are used by the experiments further down.
train_full = pd.read_csv(f'{DATA_URL}/processed/train_proc.csv')
# NOTE(review): presumably the training set with outliers clipped/removed - confirm
# against the preprocessing notebook.
train_clip = pd.read_csv(f'{DATA_URL}/processed/train_clip.csv')
test = pd.read_csv(f'{DATA_URL}/processed/test_proc.csv')
# Submission template from the raw data folder; copied once per submission file.
sample_sub = pd.read_csv(f'{DATA_URL}/raw/submission.csv')

In [21]:
# Name of the label column: read from the training frames and written to submissions.
TARGET = 'OUTCOME'

In [22]:
# Candidate model inputs: every test column except the three excluded here.
features = [col for col in test.columns
            if col not in {'ID', 'POSTAL_CODE', 'ANNUAL_MILEAGE'}]

# Columns kept numeric; everything else in `features` is treated as categorical.
num_features = [
    'ID_COUNT', 'CREDIT_SCORE', 'ANNUAL_MILEAGE_K', 'DUIS',
    'SPEEDING_VIOLATIONS', 'PAST_ACCIDENTS', 'TOTAL_PAST_INCIDENTS',
]

cat_features = [col for col in features if col not in num_features]

In [23]:
# Give the categorical columns the pandas `category` dtype in all three frames.
for frame in (train_full, train_clip, test):
    frame[cat_features] = frame[cat_features].astype('category')

In [24]:
# Feature subsets compared in the experiments below; each subset is run once
# against train_full and once against train_clip.

# NOTE(review): presumably the columns closest to the raw competition data - confirm.
original_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
                     'CREDIT_SCORE', 'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 
                     'CHILDREN', 'POSTAL_CODE_SUBREGION', 'ANNUAL_MILEAGE_K', 
                     'SPEEDING_VIOLATIONS', 'DUIS', 'PAST_ACCIDENTS', 'TYPE_OF_VEHICLE']

# None of these appear in `num_features`, so all of them are cast to `category`.
cat_only_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
                     'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 'CHILDREN', 
                     'TYPE_OF_VEHICLE', 'IS_ID_REPEATED', 'CREDIT_SCORE_BINS', 
                     'POSTAL_CODE_REGION', 'ANNUAL_MILEAGE_RANGE', 'HAS_PRIOR_DUIS', 
                     'HAS_PRIOR_SPEEDING_VIOLATIONS', 'HAS_PAST_ACCIDENTS',
                     'HAS_PAST_INCIDENTS']

# Subset chosen based on F-test and Chi2-test results (computed outside this notebook).
reduced_features = ['GENDER', 'DRIVING_EXPERIENCE', 'POSTAL_CODE_SUBREGION']

# Subset chosen based on mutual information score (computed outside this notebook).
mi_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
               'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 'CHILDREN', 
               'TYPE_OF_VEHICLE', 'ID_COUNT', 'CREDIT_SCORE_BINS', 'POSTAL_CODE_REGION', 
               'POSTAL_CODE_SUBREGION', 'ANNUAL_MILEAGE_RANGE', 'TOTAL_PAST_INCIDENTS', 
               'HAS_PAST_INCIDENTS']

# Baseline

In [25]:
%%time
scores = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train_full[original_features], train_full[TARGET]
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = XGBClassifier(
        objective='binary:logistic',
        tree_method='gpu_hist' if HAVE_GPU else 'hist',
        enable_categorical=HAVE_GPU,
        eval_metric='logloss',
        early_stopping_rounds=50, 
        seed=SEED
    ) 
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=0
    )
    val_preds = model.predict_proba(X_val)[:, 1]

    score = log_loss(y_val, val_preds)
    scores.append(score)
    print(f'Fold #{fold}: ({model.best_iteration} rounds) Logloss = {score:.6f}')
    _ = gc.collect()

print(f'\nAvg Logloss = {np.mean(scores):.6f} +/- {np.std(scores):.6f}\n')

Fold #0: (4 rounds) Logloss = 0.682039
Fold #1: (4 rounds) Logloss = 0.682480
Fold #2: (9 rounds) Logloss = 0.682147
Fold #3: (4 rounds) Logloss = 0.682025
Fold #4: (5 rounds) Logloss = 0.682392

Avg Logloss = 0.682217 +/- 0.000186

CPU times: user 3.91 s, sys: 58.9 ms, total: 3.97 s
Wall time: 3.44 s


# Hyperparameter tuning

In [26]:
def objective(trial, data, base_params):
    """Optuna objective: mean 5-fold validation log loss of one sampled configuration.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters listed below.
    data : tuple
        ``(X, y)`` - feature frame and target series in the same row order.
    base_params : dict
        Fixed ``XGBClassifier`` keyword arguments. They are merged with the
        sampled values via ``**``, so the two must not share keys.

    Returns
    -------
    float
        Mean of the per-fold log loss (lower is better).

    Notes
    -----
    This function never calls ``trial.report`` / ``trial.should_prune``, so it
    never ends a trial early whatever pruner the study was created with.
    Each validation fold is also passed as ``eval_set`` to ``fit``.
    """
    X, y = data

    # Keep the order of the suggest_* calls stable: it is part of what makes a
    # seeded study reproducible.
    sampled_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1), #complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.05), #L1-reg
        'lambda': trial.suggest_float('lambda', 1e-3, 1e5, log=True), #L2-reg
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.05),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0, step=0.05),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.75, 1.5, step=0.05)
    }

    fold_scores = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # cv.split yields positional indices -> use .iloc for X as well as y,
        # so frames without a default RangeIndex are handled correctly.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = XGBClassifier(**base_params, **sampled_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        val_preds = model.predict_proba(X_val)[:, 1]
        fold_scores.append(log_loss(y_val, val_preds))

    return np.mean(fold_scores)

In [27]:
def tune_params(data, base_params, n_trials=10, direction='maximize'):
    """Run an in-memory Optuna study over ``objective`` and return it.

    Parameters
    ----------
    data : tuple
        Forwarded unchanged to ``objective`` as its ``data`` argument.
    base_params : dict
        Forwarded unchanged to ``objective`` as its ``base_params`` argument.
    n_trials : int, default 10
        Number of trials to run.
    direction : str, default 'maximize'
        Optimisation direction handed to ``optuna.create_study``. The default
        maximises, so pass ``'minimize'`` when the objective is a loss.

    Returns
    -------
    The study object after ``optimize`` has finished.
    """
    sampler = TPESampler(seed=SEED)
    pruner = HyperbandPruner()
    study = optuna.create_study(sampler=sampler, pruner=pruner, direction=direction)

    def _run_trial(trial):
        # Bind the dataset and fixed parameters; Optuna supplies only the trial.
        return objective(trial, data, base_params)

    study.optimize(func=_run_trial, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation and experiment setup

In [28]:
def evaluate_model(data, model_params, verbose=True):
    """Cross-validate one XGBoost configuration and predict the test set per fold.

    Parameters
    ----------
    data : tuple
        ``(X, y, X_test)`` - training features, training target (same row
        order as ``X``) and test features.
    model_params : dict
        Keyword arguments for ``XGBClassifier``.
    verbose : bool, default True
        Print one line per fold. The average-score line is always printed.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold probability of class 1 for each training row, keyed by the
        row's position in ``X`` and sorted by that position.
    test_preds : pd.DataFrame
        One column ``fold{k}`` per fold with the class-1 probability on
        ``X_test``, plus ``mean``, the row-wise average of the fold columns.
    """
    X, y, X_test = data

    oof_preds = {}  #out-of-fold predictions on train set (row position -> probability)
    test_preds = {} #predictions on test set for each fold
    scores = [] #log loss on the validation split of each fold

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices -> use .iloc for X as well as y,
        # so frames without a default RangeIndex are handled correctly.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = XGBClassifier(**model_params)
        # The validation fold is also the evaluation set passed to fit.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds.update(dict(zip(val_idx, val_preds)))

        test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]

        score = log_loss(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold}: ({model.best_iteration} rounds) Logloss = {score:.6f}')

        _ = gc.collect()

    print(f'\nAvg Logloss = {np.mean(scores):.6f} +/- {np.std(scores):.6f}')

    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)

    return oof_preds, test_preds

In [29]:
def run_experiment(data, n_trials=5):
    """Tune XGBoost hyperparameters with Optuna, then cross-validate the best set.

    Parameters
    ----------
    data : tuple
        ``(X, y, X_test)`` - training features, training target, test features.
    n_trials : int, default 5
        Number of Optuna trials for the tuning step.

    Returns
    -------
    tuple
        ``(oof_preds, test_preds)`` exactly as returned by ``evaluate_model``
        for the best hyperparameters merged over the fixed base parameters.

    Notes
    -----
    Reads the module-level ``HAVE_GPU`` flag to choose between GPU and CPU
    tree method / predictor.
    """
    X, y, X_test = data

    base_params = {
        'objective': 'binary:logistic',
        'n_estimators': 10000,  # upper bound; early stopping decides the actual count
        'booster': 'gbtree',
        'eval_metric': 'logloss',
        'early_stopping_rounds': 100,
        'tree_method': 'gpu_hist' if HAVE_GPU else 'hist',
        'predictor': 'gpu_predictor' if HAVE_GPU else 'cpu_predictor',
        # Always on: the input frames carry `category` dtype columns, which
        # XGBoost rejects when this is False - that broke the CPU fallback.
        'enable_categorical': True,
        'verbosity': 1,
        'seed': SEED
    }

    print('---------------Hyperparameter tuning---------------')
    study = tune_params(
        data=(X, y),
        base_params=base_params,
        n_trials=n_trials,
        direction='minimize' #logloss -> lower is better
    )
    print(f'Best trial: {study.best_trial.number} -> Best value(Logloss): {study.best_value:.5f}')
    print('Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')

    # Tuned values take precedence over the fixed base parameters.
    model_params = {**base_params, **study.best_params}
    print('-----------------Cross-validation------------------')
    oof_preds, test_preds = evaluate_model(
        data=data,
        model_params=model_params
    )
    return oof_preds, test_preds

### Trial run

In [30]:
%%time
# Short run (3 trials) with Optuna's INFO logging switched on, to check the
# tuning + cross-validation pipeline end to end before the long experiments.
optuna.logging.set_verbosity(optuna.logging.INFO)
op, tp = run_experiment(
    data=(train_full[original_features], train_full[TARGET], test[original_features]),
    n_trials=3
)

[32m[I 2022-11-06 10:37:21,873][0m A new study created in memory with name: no-name-fc28b95b-67d9-48b8-880a-ace8d2402ab2[0m


---------------Hyperparameter tuning---------------


[32m[I 2022-11-06 10:37:23,433][0m Trial 0 finished with value: 0.6811924729296139 and parameters: {'learning_rate': 0.16, 'max_depth': 12, 'min_child_weight': 16, 'gamma': 5.6000000000000005, 'alpha': 1.1, 'lambda': 308.87067834937415, 'subsample': 0.55, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.8, 'colsample_bynode': 0.7, 'scale_pos_weight': 0.75}. Best is trial 0 with value: 0.6811924729296139.[0m
[32m[I 2022-11-06 10:37:25,786][0m Trial 1 finished with value: 0.6869966374908174 and parameters: {'learning_rate': 0.27, 'max_depth': 11, 'min_child_weight': 7, 'gamma': 11.8, 'alpha': 4.9, 'lambda': 5764.3531133041315, 'subsample': 0.5, 'colsample_bytree': 0.65, 'colsample_bylevel': 0.65, 'colsample_bynode': 0.95, 'scale_pos_weight': 1.25}. Best is trial 0 with value: 0.6811924729296139.[0m
[32m[I 2022-11-06 10:37:28,829][0m Trial 2 finished with value: 0.6810671264211337 and parameters: {'learning_rate': 0.04, 'max_depth': 3, 'min_child_weight': 19, 'gamma': 2.80000000000

Best trial: 2 -> Best value(Logloss): 0.68107
Best hyperparameters:
learning_rate        - 0.04
max_depth            - 3
min_child_weight     - 19
gamma                - 2.8000000000000003
alpha                - 2.1
lambda               - 0.5914465750030369
subsample            - 0.95
colsample_bytree     - 0.7
colsample_bylevel    - 0.95
colsample_bynode     - 0.8500000000000001
scale_pos_weight     - 0.8
-----------------Cross-validation------------------
Fold #0: (21 rounds) Logloss = 0.680976
Fold #1: (21 rounds) Logloss = 0.681067
Fold #2: (21 rounds) Logloss = 0.681134
Fold #3: (21 rounds) Logloss = 0.681036
Fold #4: (21 rounds) Logloss = 0.681122

Avg Logloss = 0.681067 +/- 0.000058
CPU times: user 12 s, sys: 320 ms, total: 12.3 s
Wall time: 12.4 s


In [31]:
# Only log Optuna errors from here on, so the 100-trial runs do not print a line per trial.
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [32]:
%%time
# Experiment 1: original_features on train_full, 100 tuning trials.
op1, tp1 = run_experiment(
    data=(train_full[original_features], train_full[TARGET], test[original_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 86 -> Best value(Logloss): 0.68099
Best hyperparameters:
learning_rate        - 0.27
max_depth            - 11
min_child_weight     - 17
gamma                - 5.800000000000001
alpha                - 1.1500000000000001
lambda               - 246.70108756972104
subsample            - 0.8
colsample_bytree     - 0.8
colsample_bylevel    - 0.75
colsample_bynode     - 0.7
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (40 rounds) Logloss = 0.680865
Fold #1: (19 rounds) Logloss = 0.681009
Fold #2: (11 rounds) Logloss = 0.681129
Fold #3: (38 rounds) Logloss = 0.680885
Fold #4: (11 rounds) Logloss = 0.681061

Avg Logloss = 0.680990 +/- 0.000101
CPU times: user 5min 6s, sys: 5.06 s, total: 5min 11s
Wall time: 4min 17s


In [33]:
%%time
# Experiment 2: original_features on train_clip, 100 tuning trials.
op2, tp2 = run_experiment(
    data=(train_clip[original_features], train_clip[TARGET], test[original_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 46 -> Best value(Logloss): 0.68101
Best hyperparameters:
learning_rate        - 0.09
max_depth            - 6
min_child_weight     - 11
gamma                - 7.7
alpha                - 1.6500000000000001
lambda               - 243.9703669028142
subsample            - 0.8500000000000001
colsample_bytree     - 0.75
colsample_bylevel    - 1.0
colsample_bynode     - 0.8
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (34 rounds) Logloss = 0.681067
Fold #1: (39 rounds) Logloss = 0.681026
Fold #2: (77 rounds) Logloss = 0.680982
Fold #3: (133 rounds) Logloss = 0.680947
Fold #4: (251 rounds) Logloss = 0.681004

Avg Logloss = 0.681005 +/- 0.000040
CPU times: user 4min 44s, sys: 4.51 s, total: 4min 49s
Wall time: 3min 53s


In [34]:
%%time
# Experiment 3: cat_only_features on train_full, 100 tuning trials.
op3, tp3 = run_experiment(
    data=(train_full[cat_only_features], train_full[TARGET], test[cat_only_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 57 -> Best value(Logloss): 0.68092
Best hyperparameters:
learning_rate        - 0.2
max_depth            - 11
min_child_weight     - 18
gamma                - 6.2
alpha                - 0.45
lambda               - 564.1280043879658
subsample            - 0.55
colsample_bytree     - 0.9
colsample_bylevel    - 0.7
colsample_bynode     - 1.0
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (153 rounds) Logloss = 0.680748
Fold #1: (39 rounds) Logloss = 0.681014
Fold #2: (189 rounds) Logloss = 0.680956
Fold #3: (72 rounds) Logloss = 0.680827
Fold #4: (27 rounds) Logloss = 0.681057

Avg Logloss = 0.680921 +/- 0.000116
CPU times: user 4min 29s, sys: 4.08 s, total: 4min 33s
Wall time: 3min 40s


In [35]:
%%time
# Experiment 4: cat_only_features on train_clip, 100 tuning trials.
op4, tp4 = run_experiment(
    data=(train_clip[cat_only_features], train_clip[TARGET], test[cat_only_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 47 -> Best value(Logloss): 0.68098
Best hyperparameters:
learning_rate        - 0.18000000000000002
max_depth            - 3
min_child_weight     - 20
gamma                - 3.2
alpha                - 5.0
lambda               - 0.020432133604794865
subsample            - 0.8500000000000001
colsample_bytree     - 0.6
colsample_bylevel    - 0.9
colsample_bynode     - 0.9
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (20 rounds) Logloss = 0.681038
Fold #1: (17 rounds) Logloss = 0.681063
Fold #2: (77 rounds) Logloss = 0.680990
Fold #3: (69 rounds) Logloss = 0.680801
Fold #4: (75 rounds) Logloss = 0.680987

Avg Logloss = 0.680976 +/- 0.000092
CPU times: user 4min 13s, sys: 3.87 s, total: 4min 17s
Wall time: 3min 25s


In [36]:
%%time
# Experiment 5: mi_features on train_full, 100 tuning trials.
op5, tp5 = run_experiment(
    data=(train_full[mi_features], train_full[TARGET], test[mi_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 90 -> Best value(Logloss): 0.68104
Best hyperparameters:
learning_rate        - 0.03
max_depth            - 9
min_child_weight     - 19
gamma                - 1.6
alpha                - 1.85
lambda               - 52.30229703346206
subsample            - 0.7
colsample_bytree     - 0.6
colsample_bylevel    - 0.9
colsample_bynode     - 0.95
scale_pos_weight     - 0.9
-----------------Cross-validation------------------
Fold #0: (46 rounds) Logloss = 0.680985
Fold #1: (47 rounds) Logloss = 0.681069
Fold #2: (46 rounds) Logloss = 0.681037
Fold #3: (47 rounds) Logloss = 0.681026
Fold #4: (49 rounds) Logloss = 0.681098

Avg Logloss = 0.681043 +/- 0.000039
CPU times: user 5min 10s, sys: 3.96 s, total: 5min 14s
Wall time: 4min 18s


In [37]:
%%time
# Experiment 6: mi_features on train_clip, 100 tuning trials.
op6, tp6 = run_experiment(
    data=(train_clip[mi_features], train_clip[TARGET], test[mi_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 93 -> Best value(Logloss): 0.68096
Best hyperparameters:
learning_rate        - 0.17
max_depth            - 8
min_child_weight     - 7
gamma                - 2.1
alpha                - 3.8000000000000003
lambda               - 3058.771815197645
subsample            - 0.8
colsample_bytree     - 0.5
colsample_bylevel    - 0.55
colsample_bynode     - 0.6
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (132 rounds) Logloss = 0.680996
Fold #1: (72 rounds) Logloss = 0.680967
Fold #2: (53 rounds) Logloss = 0.680998
Fold #3: (148 rounds) Logloss = 0.680879
Fold #4: (221 rounds) Logloss = 0.680965

Avg Logloss = 0.680961 +/- 0.000043
CPU times: user 4min 54s, sys: 4.31 s, total: 4min 58s
Wall time: 4min 2s


In [38]:
%%time
# Experiment 7: reduced_features on train_full, 100 tuning trials.
op7, tp7 = run_experiment(
    data=(train_full[reduced_features], train_full[TARGET], test[reduced_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 50 -> Best value(Logloss): 0.68099
Best hyperparameters:
learning_rate        - 0.23
max_depth            - 3
min_child_weight     - 10
gamma                - 0.0
alpha                - 2.3000000000000003
lambda               - 2811.8695592903473
subsample            - 0.6
colsample_bytree     - 0.75
colsample_bylevel    - 0.9
colsample_bynode     - 0.95
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (117 rounds) Logloss = 0.680794
Fold #1: (67 rounds) Logloss = 0.680988
Fold #2: (16 rounds) Logloss = 0.681178
Fold #3: (35 rounds) Logloss = 0.680940
Fold #4: (22 rounds) Logloss = 0.681067

Avg Logloss = 0.680993 +/- 0.000128
CPU times: user 2min 56s, sys: 3.13 s, total: 2min 59s
Wall time: 2min 30s


In [39]:
%%time
# Experiment 8: reduced_features on train_clip, 100 tuning trials.
op8, tp8 = run_experiment(
    data=(train_clip[reduced_features], train_clip[TARGET], test[reduced_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 94 -> Best value(Logloss): 0.68097
Best hyperparameters:
learning_rate        - 0.29000000000000004
max_depth            - 9
min_child_weight     - 12
gamma                - 5.2
alpha                - 4.65
lambda               - 8.255276200724126
subsample            - 0.75
colsample_bytree     - 0.55
colsample_bylevel    - 0.9
colsample_bynode     - 0.9
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (16 rounds) Logloss = 0.681004
Fold #1: (15 rounds) Logloss = 0.681000
Fold #2: (26 rounds) Logloss = 0.680982
Fold #3: (13 rounds) Logloss = 0.680919
Fold #4: (11 rounds) Logloss = 0.680947

Avg Logloss = 0.680970 +/- 0.000032
CPU times: user 2min 31s, sys: 2.69 s, total: 2min 34s
Wall time: 2min 5s


# Generating submission files

In [40]:
# Colab-only step: mount Google Drive at /content/drive, where the submission
# files are written. This import is unavailable outside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [44]:
NOTEBOOK = '03'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/machinehack/analytics_olympiad22/submissions/nb_{NOTEBOOK}'
# Create the output folder (and any missing parents); nothing happens when it already exists.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [45]:
def create_submission_files(test_preds: pd.DataFrame, expt_num: 'int | str') -> None:
    """Write one submission CSV per column of ``test_preds``.

    Each file is a copy of the module-level ``sample_sub`` template whose
    ``TARGET`` column is replaced by one prediction column. Files are saved as
    ``{SUBMISSION_PATH}/{expt_num}_{col}.csv`` without the index.

    Parameters
    ----------
    test_preds : pd.DataFrame
        Prediction columns, one row per row of ``sample_sub``.
    expt_num : int or str
        Experiment label used as the file-name prefix (the notebook passes
        zero-padded strings such as ``'01'``).

    Raises
    ------
    ValueError
        If ``test_preds`` and ``sample_sub`` have different row counts. The
        assignment aligns on the index, so a mismatch would otherwise leave
        NaN in the written files without any error.
    """
    if len(test_preds) != len(sample_sub):
        raise ValueError(
            f'test_preds has {len(test_preds)} rows but the submission template '
            f'has {len(sample_sub)}'
        )

    for col in test_preds.columns:
        sub = sample_sub.copy()  # keep the shared template untouched
        sub[TARGET] = test_preds[col]
        sub.to_csv(f'{SUBMISSION_PATH}/{expt_num}_{col}.csv', index=False)

In [46]:
# Write the submission files of all eight experiments, in experiment order.
for expt_label, expt_preds in zip(
    ('01', '02', '03', '04', '05', '06', '07', '08'),
    (tp1, tp2, tp3, tp4, tp5, tp6, tp7, tp8),
):
    create_submission_files(expt_preds, expt_label)