<a href="https://colab.research.google.com/github/sidt-ai/data-science-competitions/blob/main/dphi/ds75-child-healthcare/notebooks/03_xgboost_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [29]:
import os
import gc
import time
import warnings
import subprocess

gc.enable()
warnings.filterwarnings('ignore')

In [30]:
# %%capture
# !pip install xgboost==1.6.1
# !pip install optuna==2.10.0

In [31]:
import numpy as np
from scipy.stats import mode
import pandas as pd
pd.set_option('precision', 4)
pd.set_option('display.max_columns', None)

import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, f1_score

In [32]:
xgb.__version__, optuna.__version__

('1.6.1', '2.10.0')

In [33]:
SEED = 2311

os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [34]:
#Check GPU availability
try:
    subprocess.check_output('nvidia-smi')
    GPU = True
except Exception:
    GPU = False

print(f'GPU available: {GPU}')

GPU available: True


# Data preprocessing

In [35]:
train_proc_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/dphi/ds75-child-healthcare/data/processed/train_proc.csv'
test_proc_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/dphi/ds75-child-healthcare/data/processed/test_proc.csv'

train_ext_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/dphi/ds75-child-healthcare/data/processed/train_ext.csv'
test_ext_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/dphi/ds75-child-healthcare/data/processed/test_ext.csv'

In [36]:
# train_proc = pd.read_csv(train_proc_url)
test_proc = pd.read_csv(test_proc_url)

train = pd.read_csv(train_ext_url)
test = pd.read_csv(test_ext_url)

### Feature sets from previous notebooks:  
1. [EDA and baseline](https://github.com/sidt-ai/data-science-competitions/blob/main/dphi/ds75-child-healthcare/notebooks/01_eda_baseline.ipynb)
2. [Data preprocessing](https://github.com/sidt-ai/data-science-competitions/blob/main/dphi/ds75-child-healthcare/notebooks/02_data_preprocessing.ipynb)

In [37]:
TARGET = 'fetal_health'

original_features = test_proc.columns.to_list()

extended_features = test.columns.to_list()

cat_features = ['severe_decelerations', 'accelerations_cat', 'fetal_movement_cat',
                'uterine_contractions_cat', 'decelerations', 'abnormal_long_term_variability_cat']

hist_features = ['histogram_width', 'histogram_min', 'histogram_max', 
                 'histogram_number_of_peaks', 'histogram_number_of_zeroes', 
                 'histogram_mode', 'histogram_mean', 'histogram_median', 
                 'histogram_variance', 'histogram_tendency']

non_hist_features = [f for f in extended_features if f not in hist_features]

imbalanced_features = ['fetal_movement', 'light_decelerations', 
                       'severe_decelerations', 'histogram_variance',
                       'prolongued_decelerations', 'histogram_number_of_zeroes',
                       'percentage_of_time_with_abnormal_long_term_variability']

rejected_features = ['histogram_max', 'histogram_number_of_zeroes']

In [38]:
train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')

In [39]:
train[TARGET] = train[TARGET] - 1 #XGBoost expects labels starting from 0, we have (1, 2, 3)

### Creating folds

In [40]:
N_SPLITS = 5

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
# train_proc['fold'] = -1
train['fold'] = -1

for fold, (_, idx) in enumerate(skf.split(X=train, y=train[TARGET])):
    # train_proc.loc[idx, 'fold'] = fold
    train.loc[idx, 'fold'] = fold

In [41]:
train.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health,accelerations_cat,fetal_movement_cat,uterine_contractions_cat,decelerations,abnormal_long_term_variability_cat,fold
0,133,3,0,4,4,0,0,30,1.5,0,5.3,102,67,169,9,1,147,137,144,40,1,0,0,0,1,4,0,0
1,130,1,1,12,10,0,1,62,2.2,0,0.0,161,50,211,9,0,60,89,113,250,0,2,0,1,1,11,0,2
2,141,0,8,0,0,0,0,75,0.3,49,4.6,9,136,145,1,0,143,141,143,0,1,1,0,1,0,0,1,2
3,144,0,2,2,0,0,0,84,0.3,34,5.5,38,132,170,2,0,144,143,145,0,-1,2,0,1,1,0,1,4
4,106,1,0,11,0,0,0,63,0.6,0,11.5,30,95,125,1,0,112,110,112,1,0,0,0,0,1,0,0,0


# Modelling

### Hyperparameter tuning setup

In [69]:
N_ESTIMATORS = 10000
EARLY_STOPPING_ROUNDS = 200
TREE_METHOD = 'gpu_hist' if GPU else 'hist'
OBJECTIVE = 'multi:softmax'
NUM_CLASS = 3
EVAL_METRIC = 'mlogloss'

In [70]:
base_params = {
    'n_estimators': N_ESTIMATORS,
    'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
    'tree_method': TREE_METHOD,
    'enable_categorical': GPU, #only available for gpu_hist
    'objective': OBJECTIVE,
    'num_class': NUM_CLASS,
    'max_cat_to_onehot': 3,
    'eval_metric': EVAL_METRIC,
    'random_state': SEED,
    'verbosity': 0
}

In [71]:
def objective(trial, base_params, data):
    #Defining hyperparameter search space
    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        
        'max_bin': trial.suggest_int('max_bin', 4, 512),
        
        'learning_rate': trial.suggest_float(
            'learning_rate', 0.05, 0.3, step=0.025),
        
        # 'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        
        'gamma': trial.suggest_float('gamma', 0.1, 20.0, step=0.1),
        
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 100),
        
        'max_delta_step': trial.suggest_float('max_delta_step', 1, 10, step=0.5),
        
        'subsample': trial.suggest_float('subsample', 0.5, 1.00, step=0.05),
        
        'colsample_bytree': trial.suggest_float(
            'colsample_bytree', 0.5, 1.00, step=0.05),
        
        'colsample_bylevel': trial.suggest_float(
            'colsample_bylevel', 0.5, 1.00, step=0.05),
        
        'colsample_bynode': trial.suggest_float(
            'colsample_bynode', 0.5, 1.00, step=0.05),
        
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1e3, log=True),
        
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1e3, log=True)
    }

    # if param_grid['booster'] == 'dart':
    #     param_grid['sample_type'] = 'weighted'
    #     param_grid['normalize_type'] = trial.suggest_categorical(
    #         'normalize_type', ['tree', 'forest'])
    #     param_grid['rate_drop'] = trial.suggest_float(
    #         'rate_drop', 0.1, 0.3)
    #     param_grid['skip_drop'] = trial.suggest_float(
    #         'skip_drop', 0.33, 0.67)

    model = xgb.XGBClassifier(
        **base_params, 
        **param_grid, 
        callbacks = [XGBoostPruningCallback(trial, 'validation_0-mlogloss')])
    
    xtrain, xval, ytrain, yval = data
    
    model.fit(
        xtrain, ytrain,
        eval_set=[(xval, yval)],
        verbose=False)

    predictions = model.predict(xval)
    return f1_score(yval, predictions, average='weighted')

In [72]:
def tune_hyperparameters(
        base_params,
        data, 
        n_trials=5):
    
    study = optuna.create_study(
        sampler=TPESampler(seed=SEED),
        direction='maximize')
    
    study.optimize(
        func=lambda trial: objective(trial, base_params, data),
        n_trials=n_trials,
        gc_after_trial=True)
    
    return study.best_params, study.best_value

### Cross-validation setup

In [73]:
def evaluate_model(train, test, features, model_params, n_splits=5):
    
    #out-of-fold predictions
    # oof_proba = {} #probability
    oof_pred = {} #class

    # test_proba = []
    test_pred = []
    cv_scores = []
    
    cv_start = time.time()
    for fold in range(n_splits):
        xtrain = train[train['fold'] != fold].reset_index(drop=True)
        ytrain = xtrain[TARGET]

        xval = train[train['fold'] == fold].reset_index(drop=True)
        yval = xval[TARGET]
        val_idx = xval.index.to_list()

        fold_start = time.time()

        model = xgb.XGBClassifier(**model_params)
        
        model.fit(
            xtrain[features], ytrain,
            eval_set=[(xval[features], yval)], 
            verbose=False)

        val_pred = model.predict(xval[features])
        oof_pred.update(dict(zip(val_idx, val_pred)))
        # val_proba = model.predict_proba(xval[features])[:, 1]
        # oof_proba.update(dict(zip(val_idx, val_proba)))        

        score = f1_score(yval, val_pred, average='weighted')
        cv_scores.append(score)

        fold_end = time.time()

        print(f'Fold #{fold}: f1_score = {score:.5f} \
        [Time: {fold_end - fold_start:.2f}s]')
        
        test_pred.append(model.predict(test[features]))
        # test_proba.append(model.predict_proba(test[features])[:, 1])
        
    cv_end = time.time()

    print(f'Average f1-score = {np.mean(cv_scores):.5f} \
    with std. dev. = {np.std(cv_scores):.5f}')
    print(f'[Total time: {cv_end - cv_start:.2f}s]')

    oof_pred = pd.DataFrame.from_dict(oof_pred, orient='index').reset_index()
    # oof_proba = pd.DataFrame.from_dict(oof_proba, orient='index').reset_index()
    
    test_pred = mode(np.column_stack(test_pred), axis=1).mode
    # test_proba = np.mean(np.column_stack(test_proba), axis=1)
    
    return oof_pred, test_pred

### Expmt 1 - Original features

In [74]:
xtrain, xval, ytrain, yval = train_test_split(train[original_features], 
                                              train[TARGET],
                                              test_size=0.2,
                                              stratify=train[TARGET],
                                              shuffle=True,
                                              random_state=SEED)

In [None]:
%%capture
best_params1, best_value1 = tune_hyperparameters(
    base_params=base_params,
    data=(xtrain, xval, ytrain, yval),
    n_trials=100)

In [76]:
print(f'Best f1_score: {best_value1:.5f}')
print('Best params:')
for key, value in best_params1.items():
    print(f'\t{key}: {value}')

Best f1_score: 0.90959
Best params:
	max_depth: 7
	max_bin: 434
	learning_rate: 0.1
	gamma: 3.0000000000000004
	min_child_weight: 7
	max_delta_step: 7.5
	subsample: 0.5
	colsample_bytree: 0.55
	colsample_bylevel: 0.9
	colsample_bynode: 0.8
	reg_alpha: 2.690877647821245e-05
	reg_lambda: 8.027733612068083e-05


In [77]:
model_params1 = dict(base_params, **best_params1)

In [78]:
oof_pred1, test_pred1 = evaluate_model(train, test, 
                                       original_features, 
                                       model_params1)

Fold #0: f1_score = 0.90389         [Time: 3.75s]
Fold #1: f1_score = 0.94111         [Time: 3.53s]
Fold #2: f1_score = 0.92540         [Time: 3.26s]
Fold #3: f1_score = 0.93358         [Time: 4.17s]
Fold #4: f1_score = 0.93051         [Time: 3.18s]
Average f1-score = 0.92690     with std. dev. = 0.01258
[Total time: 18.01s]


In [79]:
sub1 = pd.DataFrame({'fetal_health': test_pred1.ravel() + 1}) #resetting the labels 
sub1.to_csv('sub-03-original-features.csv', index=False)

In [80]:
!head sub-03-original-features.csv

fetal_health
1
2
1
1
1
1
1
1
1


### Expmt 2 - Extended features

In [81]:
xtrain, xval, ytrain, yval = train_test_split(train[extended_features], 
                                              train[TARGET],
                                              test_size=0.2,
                                              stratify=train[TARGET],
                                              shuffle=True,
                                              random_state=SEED)

In [None]:
%%capture
best_params2, best_value2 = tune_hyperparameters(
    base_params=base_params,
    data=(xtrain, xval, ytrain, yval),
    n_trials=100)

In [83]:
print(f'Best f1_score: {best_value2:.5f}')
print('Best params:')
for key, value in best_params2.items():
    print(f'\t{key}: {value}')

Best f1_score: 0.90834
Best params:
	max_depth: 7
	max_bin: 434
	learning_rate: 0.1
	gamma: 3.0000000000000004
	min_child_weight: 7
	max_delta_step: 7.5
	subsample: 0.5
	colsample_bytree: 0.55
	colsample_bylevel: 0.9
	colsample_bynode: 0.8
	reg_alpha: 2.690877647821245e-05
	reg_lambda: 8.027733612068083e-05


In [84]:
model_params2 = dict(base_params, **best_params2)

In [85]:
oof_pred2, test_pred2 = evaluate_model(train, test, 
                                       extended_features, 
                                       model_params2)

Fold #0: f1_score = 0.90683         [Time: 1.69s]
Fold #1: f1_score = 0.94065         [Time: 4.64s]
Fold #2: f1_score = 0.93877         [Time: 2.79s]
Fold #3: f1_score = 0.93016         [Time: 3.33s]
Fold #4: f1_score = 0.92065         [Time: 3.37s]
Average f1-score = 0.92741     with std. dev. = 0.01250
[Total time: 15.95s]


In [86]:
sub2 = pd.DataFrame({'fetal_health': test_pred2.ravel() + 1}) #resetting the labels 
sub2.to_csv('sub-03-extended-features.csv', index=False)

In [87]:
!head sub-03-extended-features.csv

fetal_health
1
2
1
1
1
1
1
1
1


### Expmt 3 - Balanced features

In [88]:
balanced_features = [f for f in extended_features if f not in imbalanced_features]

In [89]:
xtrain, xval, ytrain, yval = train_test_split(train[balanced_features], 
                                              train[TARGET],
                                              test_size=0.2,
                                              stratify=train[TARGET],
                                              shuffle=True,
                                              random_state=SEED)

In [None]:
%%capture
best_params3, best_value3 = tune_hyperparameters(
    base_params=base_params,
    data=(xtrain, xval, ytrain, yval),
    n_trials=100)

In [91]:
print(f'Best f1_score: {best_value3:.5f}')
print('Best params:')
for key, value in best_params3.items():
    print(f'\t{key}: {value}')

Best f1_score: 0.90397
Best params:
	max_depth: 7
	max_bin: 434
	learning_rate: 0.1
	gamma: 3.0000000000000004
	min_child_weight: 7
	max_delta_step: 7.5
	subsample: 0.5
	colsample_bytree: 0.55
	colsample_bylevel: 0.9
	colsample_bynode: 0.8
	reg_alpha: 2.690877647821245e-05
	reg_lambda: 8.027733612068083e-05


In [92]:
model_params3 = dict(base_params, **best_params3)

In [93]:
oof_pred3, test_pred3 = evaluate_model(train, test, 
                                       balanced_features, 
                                       model_params3)

Fold #0: f1_score = 0.91487         [Time: 2.82s]
Fold #1: f1_score = 0.93763         [Time: 6.37s]
Fold #2: f1_score = 0.91755         [Time: 3.48s]
Fold #3: f1_score = 0.92390         [Time: 5.40s]
Fold #4: f1_score = 0.91564         [Time: 5.11s]
Average f1-score = 0.92192     with std. dev. = 0.00847
[Total time: 23.30s]


In [94]:
sub3 = pd.DataFrame({'fetal_health': test_pred3.ravel() + 1}) #resetting the labels 
sub3.to_csv('sub-03-balanced-features.csv', index=False)

In [95]:
!head sub-03-balanced-features.csv

fetal_health
1
2
1
1
1
1
1
1
1
