<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/others/devfolio_oracleofdelphi/notebooks/01_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade xgboost
!pip install --upgrade optuna

In [2]:
import os
import gc
import time
import warnings
import subprocess

# Make sure the garbage collector is on; gc.collect() is called explicitly after each CV fold.
gc.enable()
# Silences every warning for the rest of the session (including deprecation notices).
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
# Display options only: show all columns and print 4 decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

import xgboost
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Single seed reused for NumPy, the CV splitter, the Optuna sampler and XGBoost.
SEED = 23
# NOTE(review): assigning PYTHONHASHSEED at runtime is only seen by child processes;
# hash randomization of the already-running kernel is presumably unaffected -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [3]:
#remove cell to run future versions
# The recorded outputs were produced with these exact library versions; on a
# mismatch, fail early and report the version that is actually installed.
assert optuna.__version__ == '3.0.4', (
    f'Change in Optuna version. Original notebook version: 3.0.4, found: {optuna.__version__}'
)
assert xgboost.__version__ == '1.7.2', (
    f'Change in XGBoost version. Original notebook version: 1.7.2, found: {xgboost.__version__}'
)

In [4]:
# Check GPU availability by trying to run `nvidia-smi`.
# Any failure (command missing, non-zero exit, ...) is treated as "no GPU".
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    HAVE_GPU = False
else:
    HAVE_GPU = True

print(f'GPU available: {HAVE_GPU}')

GPU available: True


# Data Preparation

In [5]:
# Data: train and test CSVs are read straight from the competition repository.
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/others/devfolio_oracleofdelphi/data'
train, test = (
    pd.read_csv(f'{DATA_URL}/{split}.csv') for split in ('train', 'test')
)

In [6]:
# Both frames carry the same two 'Unnamed' columns; drop them from each.
unnamed_cols = ['Unnamed: 0', 'Unnamed: 0.1']
train = train.drop(columns=unnamed_cols)
test = test.drop(columns=unnamed_cols)

In [7]:
# Label column and the integer code assigned to each class name.
TARGET = 'Credit Score'
label_mapping = dict(Standard=0, Good=1, Best=2)
# Swap the class names in the training labels for their integer codes.
train[TARGET] = train[TARGET].replace(to_replace=label_mapping)

**Feature sets from EDA notebook.**

In [8]:
# Every column of the test set is a candidate feature.
all_features = test.columns.tolist()

# The two features shared by both reduced sets below.
top_features = ['Utlization Ratio', 'Credit Inquiries']

# NOTE(review): names are copied verbatim, including the spelling 'Utlization' and the
# trailing space in 'Monthly_Inhand_Salary ' -- presumably they match the CSV headers.
mi_features = [
    *top_features,
    'Monthly_Inhand_Salary ',
    'Interest Rate',
    'Changed_Credit_Limit',
    'Outstanding_Debt',
    'Total_EMI_per_month',
]

anova_features = [
    *top_features,
    'Accounts of user',
    'Credit cards user have',
    'Interest Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
]

**Outlier removal**

In [9]:
# A row is kept only if every one of the range checks below holds.
in_range = (
    (train['Accounts of user'] <= 10)
    & (train['Credit cards user have'] <= 10)
    & (train['Interest Rate'] <= 35)
    & (train['Num_of_Loan'] >= 0) & (train['Num_of_Loan'] < 10)
    & (train['Num_of_Delayed_Payment'] >= 0) & (train['Num_of_Delayed_Payment'] < 30)
    & (train['Changed_Credit_Limit'] >= 0)
    & (train['Total_EMI_per_month'] < 1000)
)

# Renumber the surviving rows 0..n-1 so the index is contiguous again.
train = train[in_range].reset_index(drop=True)

**Original train set for comparison during modeling**

In [10]:
# Re-read the unfiltered training file and apply the same column drop and label encoding.
train_og = (
    pd.read_csv(f'{DATA_URL}/train.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)
train_og[TARGET] = train_og[TARGET].replace(to_replace=label_mapping)

# Baseline

In [11]:
def custom_f1(ytrue, ypred):
    """Return the negated macro-averaged F1 so that better predictions give a lower value."""
    macro_f1 = f1_score(ytrue, ypred, average='macro')
    return -macro_f1  # sign flipped: F1 (higher is better) -> loss (lower is better)

In [12]:
%%time
scores = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train[all_features], train[TARGET]
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        tree_method='gpu_hist' if HAVE_GPU else 'hist',
        enable_categorical=HAVE_GPU,
        eval_metric=custom_f1,
        early_stopping_rounds=100, 
        seed=SEED
    ) 
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=0
    )
    val_preds = model.predict(X_val)
    score = f1_score(y_val, val_preds, average='macro')
    scores.append(score)
    print(f'Fold #{fold}: ({model.best_iteration} rounds) F1-macro = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg F1-macro = {np.mean(scores):.5f} +/- {np.std(scores):.5f}\n')

Fold #0: (38 rounds) F1-macro = 0.88808
Fold #1: (76 rounds) F1-macro = 0.89528
Fold #2: (55 rounds) F1-macro = 0.89224
Fold #3: (27 rounds) F1-macro = 0.88820
Fold #4: (19 rounds) F1-macro = 0.87709

Avg F1-macro = 0.88818 +/- 0.00616

CPU times: user 4.74 s, sys: 757 ms, total: 5.49 s
Wall time: 5.33 s


# Hyperparameter tuning

In [13]:
def objective(trial, data, model):
    """Optuna objective: mean macro-F1 over 5 stratified folds for one sampled parameter set.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters listed in ``param_grid``.
    data : tuple
        ``(X, y)``: feature DataFrame and target Series in the same row order.
    model : estimator
        Object exposing ``set_params``, ``fit`` and ``predict``. It is modified
        in place: its parameters are overwritten and it is refit on every fold.

    Returns
    -------
    float
        Mean of the per-fold macro-averaged F1 scores.
    """
    scores = []
    X, y = data

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.2, step=0.025),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 15),
        'gamma': trial.suggest_float('gamma', 0, 15, step=0.1), #complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.05), #L1-reg
        'lambda': trial.suggest_float('lambda', 1e-2, 1e3, log=True), #L2-reg
        'subsample': trial.suggest_float('subsample', 0.65, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.65, 1.0, step=0.05),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.65, 1.0, step=0.05),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.65, 1.0, step=0.05),
    }

    # The sampled parameters are the same for every fold, so apply them once.
    model.set_params(**param_grid)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # split() yields positional indices, so select rows of X and y with .iloc;
        # label-based .loc would only agree while the index is exactly 0..n-1.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # The validation fold is passed as eval_set and is also the fold that is scored.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        val_preds = model.predict(X_val)
        scores.append(f1_score(y_val, val_preds, average='macro'))

    return np.mean(scores)

In [14]:
def tune_params(data, model, n_trials, direction):
    """Run an Optuna study over ``objective`` and return the finished study.

    ``data`` and ``model`` are forwarded unchanged to ``objective``; ``n_trials``
    and ``direction`` are forwarded to Optuna.
    """
    tpe_sampler = TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=SEED,
    )
    # NOTE(review): objective() in this notebook never calls trial.report(), so the
    # pruner presumably has nothing to act on -- confirm.
    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=HyperbandPruner(),
        direction=direction,
    )

    def _evaluate(trial):
        # Bind the fixed data/model so Optuna only has to supply the trial.
        return objective(trial, data, model)

    study.optimize(func=_evaluate, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [16]:
def cross_val_predict(data, model, n_splits=5):
    """Cross-validate ``model`` and predict the test set once per fold.

    Parameters
    ----------
    data : tuple
        ``(X, y, X_test)``: training features, training target (same row order
        as ``X``) and test features.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is refit on every fold.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One column ``fold<i>`` of test predictions per fold, plus a ``mode``
        column holding the row-wise most frequent prediction (the first mode
        is taken when several values tie).

    Side effects
    ------------
    Displays a table of per-fold micro/macro/weighted F1 with an ``Avg.`` row.
    """
    # Metric column name -> value of the `average` argument of f1_score.
    averages = {'F1-micro': 'micro', 'F1-macro': 'macro', 'F1-weighted': 'weighted'}

    scores = {'Fold': [str(i) for i in range(n_splits)]}
    for metric in averages:
        scores[metric] = []
    test_preds = {}

    X, y, X_test = data

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # split() yields positional indices, so select rows of X and y with .iloc;
        # label-based .loc would only agree while the index is exactly 0..n-1.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # The validation fold is passed as eval_set and is also the fold that is scored.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        val_preds = model.predict(X_val)
        test_preds[f'fold{fold}'] = model.predict(X_test)

        for metric, average in averages.items():
            scores[metric].append(f1_score(y_val, val_preds, average=average))
        _ = gc.collect()

    # Append a summary row holding the mean of each metric across folds.
    scores['Fold'].append('Avg.')
    for metric in averages:
        mean_score = np.mean(scores[metric])
        scores[metric].append(mean_score)
    score_df = pd.DataFrame.from_dict(scores).set_index('Fold')
    display(score_df)

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')
    return test_preds

In [17]:
def run_experiment(data, n_trials=10):
    """Tune an XGBoost classifier with Optuna, then cross-validate it and predict the test set.

    ``data`` is ``(X, y, X_test)``. Tuning uses ``(X, y)`` for ``n_trials`` trials;
    the best parameters are then set on the same model object, which is handed to
    ``cross_val_predict`` together with the full ``data`` tuple. Progress, the best
    parameters and timings are printed. Returns the frame produced by
    ``cross_val_predict``.
    """
    X, y, _ = data

    use_gpu = HAVE_GPU
    model = XGBClassifier(
        objective='multi:softmax',
        n_estimators=10000,
        booster='gbtree',
        eval_metric=custom_f1,
        early_stopping_rounds=100,
        tree_method='gpu_hist' if use_gpu else 'hist',
        predictor='gpu_predictor' if use_gpu else 'cpu_predictor',
        enable_categorical=use_gpu,
        verbosity=0,
        seed=SEED,
    )

    # Stage 1: hyperparameter search.
    print('----------Hyperparameter tuning----------')
    tuning_started = time.time()
    study = tune_params(
        data=(X, y),
        model=model,
        n_trials=n_trials,
        direction='maximize', #metric: f1_score -> higher is better
    )
    tuning_elapsed = time.time() - tuning_started
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for name, value in study.best_params.items():
        print(f'{name:15} - {value}')
    print(f'[Time taken: {tuning_elapsed:.2f}s]\n')

    # Stage 2: refit per fold with the best parameters and predict the test set.
    print('-----Cross-validation and prediction-----')
    cv_started = time.time()
    model.set_params(**study.best_params)
    test_preds = cross_val_predict(data, model)
    cv_elapsed = time.time() - cv_started
    print(f'[Time taken: {cv_elapsed:.2f}s]\n')

    return test_preds

**Trial run**

In [18]:
# Log every trial (INFO level) during the short trial runs below.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [19]:
%%time
# Quick end-to-end check of the pipeline: filtered train set, all features, 3 trials.
tp_trial1 = run_experiment(
    data=(train[all_features], train[TARGET], test[all_features]),
    n_trials=3
)

[32m[I 2022-12-11 06:38:11,138][0m A new study created in memory with name: no-name-7827de44-2930-4c54-b3aa-8e4a8d0c1334[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-11 06:38:16,659][0m Trial 0 finished with value: 0.8884841584384198 and parameters: {'learning_rate': 0.125, 'max_depth': 10, 'min_child_weight': 12, 'gamma': 4.2, 'alpha': 1.1, 'lambda': 26.98425462244015, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8500000000000001, 'colsample_bynode': 0.8}. Best is trial 0 with value: 0.8884841584384198.[0m
[32m[I 2022-12-11 06:38:23,074][0m Trial 1 finished with value: 0.8862566684884934 and parameters: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 14, 'gamma': 4.5, 'alpha': 2.95, 'lambda': 780.0718033403551, 'subsample': 0.9500000000000001, 'colsample_bytree': 0.65, 'colsample_bylevel': 0.75, 'colsample_bynode': 0.75}. Best is trial 0 with value: 0.8884841584384198.[0m
[32m[I 2022-12-11 06:38:35,206][0m Trial 2 finished with value: 0.8949940568440276 and parameters: {'learning_rate': 0.175, 'max_depth': 8, 'min_child_weight': 3, 'gamma': 0.0, 'alpha': 4.75, 'lambda': 0.0509

Best trial: 2 -> Best value: 0.89499
Best hyperparameters:
learning_rate   - 0.175
max_depth       - 8
min_child_weight - 3
gamma           - 0.0
alpha           - 4.75
lambda          - 0.050992204592857734
subsample       - 0.8
colsample_bytree - 0.75
colsample_bylevel - 0.9500000000000001
colsample_bynode - 0.8
[Time taken: 24.14s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9264,0.8985,0.9259
1,0.9196,0.903,0.9191
2,0.9234,0.903,0.9225
3,0.9117,0.8906,0.9117
4,0.9137,0.8836,0.9127
Avg.,0.919,0.8958,0.9184


[Time taken: 11.27s]

CPU times: user 36.4 s, sys: 275 ms, total: 36.7 s
Wall time: 35.4 s


In [20]:
%%time
# Same quick check on the unfiltered train set (train_og), all features, 3 trials.
tp_trial2 = run_experiment(
    data=(train_og[all_features], train_og[TARGET], test[all_features]),
    n_trials=3
)

[32m[I 2022-12-11 06:38:46,563][0m A new study created in memory with name: no-name-ca792142-902e-4fb8-9611-4642dc6a5a23[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-11 06:38:50,958][0m Trial 0 finished with value: 0.886315661100889 and parameters: {'learning_rate': 0.125, 'max_depth': 10, 'min_child_weight': 12, 'gamma': 4.2, 'alpha': 1.1, 'lambda': 26.98425462244015, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8500000000000001, 'colsample_bynode': 0.8}. Best is trial 0 with value: 0.886315661100889.[0m
[32m[I 2022-12-11 06:38:57,101][0m Trial 1 finished with value: 0.8855733163269072 and parameters: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 14, 'gamma': 4.5, 'alpha': 2.95, 'lambda': 780.0718033403551, 'subsample': 0.9500000000000001, 'colsample_bytree': 0.65, 'colsample_bylevel': 0.75, 'colsample_bynode': 0.75}. Best is trial 0 with value: 0.886315661100889.[0m
[32m[I 2022-12-11 06:39:07,593][0m Trial 2 finished with value: 0.8955790062117716 and parameters: {'learning_rate': 0.175, 'max_depth': 8, 'min_child_weight': 3, 'gamma': 0.0, 'alpha': 4.75, 'lambda': 0.0509922

Best trial: 2 -> Best value: 0.89558
Best hyperparameters:
learning_rate   - 0.175
max_depth       - 8
min_child_weight - 3
gamma           - 0.0
alpha           - 4.75
lambda          - 0.050992204592857734
subsample       - 0.8
colsample_bytree - 0.75
colsample_bylevel - 0.9500000000000001
colsample_bynode - 0.8
[Time taken: 21.10s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9196,0.8963,0.9197
1,0.913,0.8883,0.9127
2,0.9253,0.9031,0.9245
3,0.9114,0.8822,0.9102
4,0.9228,0.9023,0.9227
Avg.,0.9184,0.8944,0.918


[Time taken: 10.99s]

CPU times: user 32.8 s, sys: 244 ms, total: 33.1 s
Wall time: 32.1 s


In [21]:
# Only log errors from here on, so the 100-trial experiments do not print one line per trial.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Exp 1 - clipped data + all features

In [22]:
%%time
# Filtered train set, all features, 100 tuning trials.
tp_clip_all = run_experiment(
    data=(train[all_features], train[TARGET], test[all_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 93 -> Best value: 0.89633
Best hyperparameters:
learning_rate   - 0.2
max_depth       - 10
min_child_weight - 4
gamma           - 0.30000000000000004
alpha           - 4.4
lambda          - 3.456216164946881
subsample       - 0.8
colsample_bytree - 0.8
colsample_bylevel - 0.8
colsample_bynode - 0.65
[Time taken: 528.17s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9302,0.9021,0.9297
1,0.9186,0.8981,0.9178
2,0.9215,0.9006,0.9207
3,0.9108,0.8877,0.9103
4,0.9156,0.8873,0.9149
Avg.,0.9193,0.8952,0.9187


[Time taken: 6.52s]

CPU times: user 9min 22s, sys: 5.31 s, total: 9min 27s
Wall time: 8min 54s


# Exp 2 - original data + all features

In [23]:
%%time
# Unfiltered train set (train_og), all features, 100 tuning trials.
tp_og_all = run_experiment(
    data=(train_og[all_features], train_og[TARGET], test[all_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 99 -> Best value: 0.89740
Best hyperparameters:
learning_rate   - 0.2
max_depth       - 5
min_child_weight - 11
gamma           - 0.30000000000000004
alpha           - 3.75
lambda          - 0.08558307549697705
subsample       - 0.75
colsample_bytree - 0.65
colsample_bylevel - 0.9
colsample_bynode - 0.65
[Time taken: 483.65s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9204,0.8981,0.9207
1,0.9155,0.8896,0.9147
2,0.9253,0.9024,0.9247
3,0.9163,0.8883,0.9153
4,0.9261,0.9025,0.9256
Avg.,0.9207,0.8962,0.9202


[Time taken: 7.62s]

CPU times: user 8min 39s, sys: 4.94 s, total: 8min 44s
Wall time: 8min 11s


# Exp 3 - clipped data + MI features

In [24]:
%%time
# Filtered train set restricted to mi_features, 100 tuning trials.
tp_clip_mi = run_experiment(
    data=(train[mi_features], train[TARGET], test[mi_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 92 -> Best value: 0.88962
Best hyperparameters:
learning_rate   - 0.175
max_depth       - 8
min_child_weight - 7
gamma           - 0.30000000000000004
alpha           - 4.3
lambda          - 0.029284880982540524
subsample       - 0.7000000000000001
colsample_bytree - 0.65
colsample_bylevel - 0.9500000000000001
colsample_bynode - 0.65
[Time taken: 478.56s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9205,0.8814,0.9194
1,0.9196,0.8956,0.9186
2,0.9244,0.9045,0.9244
3,0.9108,0.8848,0.9102
4,0.9156,0.8828,0.9143
Avg.,0.9182,0.8898,0.9174


[Time taken: 9.56s]

CPU times: user 8min 31s, sys: 5.53 s, total: 8min 36s
Wall time: 8min 8s


# Exp 4 - clipped data + ANOVA features

In [25]:
%%time
# Filtered train set restricted to anova_features, 100 tuning trials.
tp_clip_anova = run_experiment(
    data=(train[anova_features], train[TARGET], test[anova_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 66 -> Best value: 0.89712
Best hyperparameters:
learning_rate   - 0.175
max_depth       - 10
min_child_weight - 4
gamma           - 0.1
alpha           - 1.8
lambda          - 0.06553376734028496
subsample       - 0.9500000000000001
colsample_bytree - 0.65
colsample_bylevel - 0.9
colsample_bynode - 0.8500000000000001
[Time taken: 486.73s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9302,0.9044,0.9296
1,0.9225,0.9053,0.9219
2,0.9215,0.9009,0.9203
3,0.9069,0.8861,0.907
4,0.9146,0.8877,0.9137
Avg.,0.9192,0.8969,0.9185


[Time taken: 8.27s]

CPU times: user 8min 39s, sys: 5.39 s, total: 8min 45s
Wall time: 8min 15s


# Exp 5 - clipped data + top features

In [26]:
%%time
# Filtered train set restricted to the two top_features, 100 tuning trials.
tp_clip_top = run_experiment(
    data=(train[top_features], train[TARGET], test[top_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 43 -> Best value: 0.85525
Best hyperparameters:
learning_rate   - 0.15000000000000002
max_depth       - 10
min_child_weight - 6
gamma           - 0.2
alpha           - 3.9000000000000004
lambda          - 9.63948602888017
subsample       - 0.8500000000000001
colsample_bytree - 0.7000000000000001
colsample_bylevel - 0.8500000000000001
colsample_bynode - 0.8500000000000001
[Time taken: 272.53s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9002,0.8473,0.8981
1,0.9021,0.8653,0.8994
2,0.8983,0.8653,0.8951
3,0.8914,0.8526,0.889
4,0.8943,0.8457,0.8902
Avg.,0.8972,0.8552,0.8944


[Time taken: 4.47s]

CPU times: user 4min 56s, sys: 4.24 s, total: 5min
Wall time: 4min 37s


# Submission files

In [27]:
# Colab-only import, kept next to its single use; submission files are written under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# Submissions from this notebook go into their own nb_<NOTEBOOK> folder on the mounted Drive.
NOTEBOOK = '01'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/others/devfolio_oracleofdelphi/submissions/nb_{NOTEBOOK}'
# exist_ok=True creates any missing parent folders and is a no-op if the folder already exists.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [29]:
def create_submission_files(test_preds, dataset, feature_set, path=SUBMISSION_PATH):
    """Write one submission CSV per column of ``test_preds``.

    Each column's integer codes are replaced by their class names and saved,
    without the index, as ``<path>/<dataset>_<feature_set>_<column>.csv`` with a
    single column named ``TARGET``.
    """
    code_to_label = {0: 'Standard', 1: 'Good', 2: 'Best'}
    for col in test_preds.columns:
        labels = test_preds[col].replace(code_to_label)
        out_file = f'{path}/{dataset}_{feature_set}_{col}.csv'
        labels.to_frame(name=TARGET).to_csv(out_file, index=False)

In [30]:
# Submissions from the experiments trained on the filtered train set, one call per feature set.
create_submission_files(tp_clip_all, 'clip', 'all')
create_submission_files(tp_clip_mi, 'clip', 'mi')
create_submission_files(tp_clip_anova, 'clip', 'anova')
create_submission_files(tp_clip_top, 'clip', 'top')

In [31]:
# Submissions from the experiment trained on the unfiltered train set (train_og).
create_submission_files(tp_og_all, 'og', 'all')