<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/analytics_vidhya/ml_summer_training_2022/notebooks/02_xgboost_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade xgboost
!pip install --upgrade optuna

In [2]:
import os
import gc
import time
import warnings
import subprocess

gc.enable()
warnings.filterwarnings('ignore')

import scipy.stats as st
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

import xgboost
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from optuna.integration import XGBoostPruningCallback
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, zero_one_loss
from tqdm.notebook import tqdm

SEED = 23
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [3]:
#remove cell to run future versions
assert optuna.__version__ == '2.10.1', f'Change in Optuna version. Original notebook version: 2.10.1'
assert xgboost.__version__ == '1.6.1', f'Change in XGBoost version. Original notebook version: 1.6.1'

In [4]:
SEED = 23
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [5]:
#Check GPU availability
try:
    subprocess.check_output('nvidia-smi')
    GPU = True
except Exception:
    GPU = False

print(f'GPU available: {GPU}')

GPU available: True


**Data setup**

In [6]:
train_all_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/ml_summer_training_2022/data/processed/all_features/train.csv'
test_all_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/ml_summer_training_2022/data/processed/all_features/test.csv'

train_original_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/ml_summer_training_2022/data/processed/original_filled/train.csv'
test_original_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/ml_summer_training_2022/data/processed/original_filled/test.csv'

train_selected_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/ml_summer_training_2022/data/processed/selected_features/train.csv'
test_selected_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/ml_summer_training_2022/data/processed/selected_features/test.csv'

In [7]:
target_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/ml_summer_training_2022/data/raw/train.csv'
target = pd.read_csv(target_url, usecols=['loan_default']).iloc[:, 0] #converted to series

In [8]:
train_all = pd.read_csv(train_all_url)
test_all = pd.read_csv(test_all_url)

train_original = pd.read_csv(train_original_url)
test_original = pd.read_csv(test_original_url)

train_selected = pd.read_csv(train_selected_url)
test_selected = pd.read_csv(test_selected_url)

In [9]:
train_all.head()

Unnamed: 0,age,proof_submitted,loan_amount,asset_cost,no_of_loans,no_of_curr_loans,last_delinq_none,education_fill_1,education_fill_2,age_range,loan_amount_range,asset_cost_range,no_of_loans_range,no_of_curr_loans_range,proof_is_aadhar
0,27,0,504264,820920,2,2,0,1,1,0,1,1,1,1,1
1,48,0,728556,831444,6,2,0,1,1,3,1,1,2,1,1
2,30,1,642936,826092,0,0,0,2,2,1,1,1,0,0,0
3,28,0,746556,930924,0,0,0,1,1,1,1,1,0,0,1
4,29,0,1139880,1902000,0,0,0,1,1,1,2,3,0,0,1


In [10]:
train_original.head()

Unnamed: 0,age,education_fill_1,proof_submitted,loan_amount,asset_cost,no_of_loans,no_of_curr_loans,last_delinq_none
0,27,1,0,504264,820920,2,2,0
1,48,1,0,728556,831444,6,2,0
2,30,2,1,642936,826092,0,0,0
3,28,1,0,746556,930924,0,0,0
4,29,1,0,1139880,1902000,0,0,0


In [11]:
train_selected.head()

Unnamed: 0,age,proof_submitted,loan_amount,asset_cost,no_of_loans,no_of_curr_loans
0,27,0,504264,820920,2,2
1,48,0,728556,831444,6,2
2,30,1,642936,826092,0,0
3,28,0,746556,930924,0,0
4,29,0,1139880,1902000,0,0


In [12]:
target.head()

0    0
1    0
2    1
3    0
4    0
Name: loan_default, dtype: int64

In [13]:
categorical_features = ['proof_submitted', 'last_delinq_none', 'education_fill_1',
                        'education_fill_2', 'age_range', 'loan_amount_range', 
                        'asset_cost_range', 'no_of_loans_range']

# Baselines

In [14]:
def fit_baseline(X, y):
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, shuffle=True, stratify=target, random_state=SEED)
    
    baseline = XGBClassifier()
    baseline.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)],
        verbose=0)
    
    preds = baseline.predict(X_val)
    print(classification_report(y_val, preds))

**Full data: original + created features**

In [15]:
fit_baseline(train_all, target)

              precision    recall  f1-score   support

           0       0.60      0.75      0.67       840
           1       0.41      0.27      0.32       560

    accuracy                           0.56      1400
   macro avg       0.51      0.51      0.50      1400
weighted avg       0.53      0.56      0.53      1400



**Original features**

In [16]:
fit_baseline(train_original, target)

              precision    recall  f1-score   support

           0       0.62      0.75      0.68       840
           1       0.44      0.29      0.35       560

    accuracy                           0.57      1400
   macro avg       0.53      0.52      0.51      1400
weighted avg       0.55      0.57      0.55      1400



**Selected features**

In [17]:
fit_baseline(train_selected, target)

              precision    recall  f1-score   support

           0       0.61      0.75      0.68       840
           1       0.44      0.29      0.35       560

    accuracy                           0.57      1400
   macro avg       0.52      0.52      0.51      1400
weighted avg       0.54      0.57      0.54      1400



# Hyperparameter tuning

In [18]:
def objective(trial, data, base_params):

    scores = []
    X, y = data
    cat_features = list(X.select_dtypes(include='category').columns)

    param_grid = {
        'objective': trial.suggest_categorical(
            'objective', ['binary:logistic', 'binary:hinge']),
        'learning_rate': trial.suggest_float(
            'learning_rate', 0.01, 0.4, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float(
            'colsample_bytree', 0.5, 1.0, step=0.05),
        'colsample_bylevel': trial.suggest_float(
            'colsample_bylevel', 0.5, 1.0, step=0.05),
        'colsample_bynode': trial.suggest_float(
            'colsample_bynode', 0.5, 1.0, step=0.05),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'gamma': trial.suggest_float('gamma', 0, 20),
        'alpha': trial.suggest_float('alpha', 1e-5, 1e2, log=True),
        'lambda': trial.suggest_float('lambda', 1e-5, 1e2, log=True),
        'scale_pos_weight': trial.suggest_float(
            'scale_pos_weight', 1, 1.5, step=0.05)
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.loc[val_idx], y.iloc[val_idx]
        model = XGBClassifier(
            **base_params, 
            **param_grid,
            callbacks=[XGBoostPruningCallback(
                trial=trial, 
                observation_key='validation_0-error')]
        )
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        preds = model.predict(X_val)
        scores.append(zero_one_loss(y_val, preds))
    
    return np.mean(scores)

In [19]:
def tune_params(data, base_params, n_trials=10, direction='maximize'):
    study = optuna.create_study(
        sampler=TPESampler(seed=SEED),
        pruner=HyperbandPruner(),
        direction=direction)
    
    study.optimize(
        func=lambda trial: objective(trial, data, base_params),
        n_trials=n_trials,
        gc_after_trial=True)
    
    return study

**Checking functionality**

In [20]:
base_params = {
    'n_estimators': 2000,
    'eval_metric': 'error',
    'early_stopping_rounds': 50,
    'tree_method': 'gpu_hist' if GPU else 'hist', 
    'enable_categorical': GPU,
    'max_cat_to_onehot': 5, #internal one-hot encoding
    'use_best_model': True,
    'verbosity': 0,
    'random_seed': SEED
}

In [21]:
%%time
study = tune_params(
    data=(train_original, target), 
    base_params=base_params,
    n_trials=5,
    direction='minimize')

print(f'Best trial: {study.best_trial.number}' \
      f' -> Best value (f1_score): {study.best_value:.5f}')
print(f'Best hyperparameters:')
for k, v in study.best_params.items():
    print(f'{k:20} - {v}')

Best trial: 1 -> Best value (f1_score): 0.39529
Best hyperparameters:
objective            - binary:hinge
learning_rate        - 0.4
max_depth            - 13
subsample            - 0.5
colsample_bytree     - 0.65
colsample_bylevel    - 0.65
colsample_bynode     - 0.95
min_child_weight     - 7
gamma                - 2.2095542782734756
alpha                - 1.0085598568518716e-05
lambda               - 39.369838203102454
scale_pos_weight     - 1.05
CPU times: user 4.4 s, sys: 341 ms, total: 4.74 s
Wall time: 3.9 s


# Cross-validation

In [22]:
def evaluate_model(data, model_params, verbose=True):
    
    preds_test = []
    scores_f1 = [] #validation set weighted-f1 scores
    scores_0_1_loss = [] #validation set zero-one loss
    
    X, X_test, y = data
    cat_features = list(X.select_dtypes(include='category').columns)
    
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.loc[val_idx], y.iloc[val_idx]
        
        model = XGBClassifier(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        
        preds_val = model.predict(X_val)
        preds_test.append(model.predict(X_test))
        
        scores_0_1_loss.append(zero_one_loss(y_val, preds_val))
        scores_f1.append(f1_score(y_val, preds_val, average='macro'))
    
    if verbose:
        scores_df = pd.DataFrame.from_dict({
            '0-1 loss': scores_0_1_loss,
            'Macro-f1': scores_f1
        })
        scores_df.index.name = 'Fold'
        display(scores_df.T)
    
    print(f'Average 0-1 loss (error%) = {np.mean(scores_0_1_loss):.4f} (with std = {np.std(scores_0_1_loss):.4f})')
    print(f'Average Macro-f1 = {np.mean(scores_f1):.4f} (with std = {np.std(scores_f1):.4f})')
    
    preds_test = st.mode(np.column_stack(preds_test), axis=1).mode
    return preds_test

In [23]:
def run_experiment(data, n_trials=5):
    """Tune the hyperparameters, train and evaluate the model, and return test predictions."""
    
    X, X_test, y = data
    
    base_params = {
        'n_estimators': 2000,
        'eval_metric': 'error',
        'early_stopping_rounds': 50,
        'tree_method': 'gpu_hist' if GPU else 'hist', 
        'enable_categorical': GPU,
        'max_cat_to_onehot': 5, #internal one-hot encoding
        'use_best_model': True,
        'verbosity': 0,
        'random_seed': SEED
    }
    
    print(f'---------------Hyperparameter tuning---------------')
    study = tune_params(
        data=(X, y), 
        base_params=base_params,
        n_trials=n_trials,
        direction='minimize')
    print(f'Best trial: {study.best_trial.number} -> Best value (error%): {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')
    
    model_params = {**base_params, **study.best_params}
    print(f'-----------------Cross-validation------------------')
    preds_test = evaluate_model(
        data=(X, X_test, y), 
        model_params=model_params)
    
    return preds_test

In [24]:
%%time
preds_all = run_experiment((train_all, test_all, target), 500)

---------------Hyperparameter tuning---------------
Best trial: 414 -> Best value (error%): 0.39129
Best hyperparameters:
objective            - binary:logistic
learning_rate        - 0.24000000000000002
max_depth            - 12
subsample            - 0.6
colsample_bytree     - 0.5
colsample_bylevel    - 0.6
colsample_bynode     - 0.65
min_child_weight     - 5
gamma                - 2.5530661152099023
alpha                - 0.0021108835898612597
lambda               - 44.85424474061691
scale_pos_weight     - 1.1
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0-1 loss,0.3864,0.395,0.4,0.3829,0.3921,0.4,0.3829,0.4,0.3843,0.3857,0.3957,0.3943,0.3957,0.3921,0.39
Macro-f1,0.5086,0.4369,0.375,0.5331,0.4198,0.375,0.4742,0.375,0.4786,0.5171,0.4485,0.4145,0.4365,0.5087,0.4739


Average 0-1 loss (error%) = 0.3918 (with std = 0.0060)
Average Macro-f1 = 0.4517 (with std = 0.0514)
CPU times: user 4min 36s, sys: 10.1 s, total: 4min 46s
Wall time: 3min 54s


In [25]:
%%time
preds_original = run_experiment((train_original, test_original, target), 500)

---------------Hyperparameter tuning---------------
Best trial: 154 -> Best value (error%): 0.39014
Best hyperparameters:
objective            - binary:logistic
learning_rate        - 0.38
max_depth            - 14
subsample            - 0.65
colsample_bytree     - 0.5
colsample_bylevel    - 0.9
colsample_bynode     - 0.65
min_child_weight     - 7
gamma                - 3.2406026102918313
alpha                - 0.15370441665752266
lambda               - 0.05746739364757084
scale_pos_weight     - 1.05
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0-1 loss,0.3829,0.4,0.3986,0.3779,0.3914,0.4,0.3757,0.3971,0.39,0.3936,0.395,0.3971,0.3964,0.3957,0.3929
Macro-f1,0.5025,0.375,0.3902,0.5296,0.4498,0.375,0.4696,0.4725,0.4829,0.4622,0.443,0.3908,0.4386,0.4221,0.4442


Average 0-1 loss (error%) = 0.3923 (with std = 0.0074)
Average Macro-f1 = 0.4432 (with std = 0.0446)
CPU times: user 4min 36s, sys: 10.7 s, total: 4min 47s
Wall time: 3min 46s


In [26]:
%%time
preds_selected = run_experiment((train_selected, test_selected, target), 500)

---------------Hyperparameter tuning---------------
Best trial: 371 -> Best value (error%): 0.39086
Best hyperparameters:
objective            - binary:logistic
learning_rate        - 0.06999999999999999
max_depth            - 11
subsample            - 0.8
colsample_bytree     - 1.0
colsample_bylevel    - 0.6
colsample_bynode     - 0.7
min_child_weight     - 5
gamma                - 8.45574746355991
alpha                - 0.014641307038842725
lambda               - 5.282062621386403
scale_pos_weight     - 1.2
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0-1 loss,0.385,0.3921,0.39,0.3879,0.3993,0.4,0.3864,0.4,0.39,0.3921,0.3957,0.3921,0.4007,0.3843,0.3886
Macro-f1,0.4781,0.5071,0.4383,0.4635,0.4587,0.375,0.5311,0.375,0.4622,0.4806,0.5152,0.4684,0.4905,0.5474,0.4685


Average 0-1 loss (error%) = 0.3923 (with std = 0.0055)
Average Macro-f1 = 0.4706 (with std = 0.0469)
CPU times: user 4min 13s, sys: 9.58 s, total: 4min 22s
Wall time: 3min 30s
