# Setup

In [1]:
%%capture
!pip install --upgrade xgboost
!pip install --upgrade optuna

In [2]:
import os
import gc
import time
import warnings
import subprocess

# Explicitly turn automatic garbage collection on; folds/trials below also call gc.collect().
gc.enable()
# Suppresses every warning for the rest of the session (library deprecations included).
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
# Display options only: show all columns and 4 decimal places in rendered output.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

import xgboost
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, f1_score

# Single seed reused for CV splits, the Optuna sampler and the XGBoost models below.
SEED = 23
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it here
# presumably only affects child processes, not the running kernel -- confirm intent.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [3]:
# Guard: the recorded outputs were produced with these exact library versions.
# Remove this cell to run with future versions.
assert optuna.__version__ == '3.0.3', (
    f'Change in Optuna version. Original notebook version: 3.0.3, '
    f'found: {optuna.__version__}'
)
assert xgboost.__version__ == '1.6.2', (
    f'Change in XGBoost version. Original notebook version: 1.6.2, '
    f'found: {xgboost.__version__}'
)

In [4]:
# Probe for an NVIDIA GPU: the machine is treated as GPU-enabled when the
# `nvidia-smi` command can be executed successfully; any failure (command
# missing, non-zero exit status, ...) means "no GPU".
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    HAVE_GPU = False
else:
    HAVE_GPU = True

print(f'GPU available: {HAVE_GPU}')

GPU available: True


In [12]:
# Root folder of the competition data in the GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/dataverse_hack/data'

# Processed train/test sets (produced by notebook 00), read in that order.
train, test = (
    pd.read_csv(f'{DATA_URL}/processed/{split_name}.csv')
    for split_name in ('train', 'test')
)
# Raw sample submission, used later as the template for submission files.
sample_sub = pd.read_csv(f'{DATA_URL}/raw/sample_submission.csv')

In [13]:
# Name of the label column in `train`; also the column written to submission files.
TARGET = 'is_claim'

In [14]:
# Every column of the test set is used as a model input.
features = test.columns.tolist()

# Columns treated as numeric.
num_features = [
    'policy_tenure',
    'age_of_car',
    'age_of_policyholder',
    'population_density',
    'airbags',
    'displacement',
    'turning_radius',
    'length',
    'width',
    'height',
    'gross_weight',
    'ncap_rating',
]

# Everything that is not listed as numeric is treated as categorical
# (original column order is preserved).
cat_features = list(filter(lambda name: name not in num_features, features))

In [15]:
# Cast categorical features using ONE shared dtype per column for train and test.
# Casting each frame independently derives the categories from the values that
# happen to be present in that frame; if a level is missing from one of them, the
# same value gets a different integer code in train and test. XGBoost's pandas
# categorical support works on those codes, so the encodings must match.
# When both frames contain the same levels this is equivalent to a plain
# .astype('category') on each frame.
for col in cat_features:
    shared_categories = (
        pd.concat([train[col], test[col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    shared_dtype = pd.CategoricalDtype(categories=shared_categories)
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

# Baseline

In [16]:
def get_best_threshold(y_true, pred_probs):
    """Pick the probability cutoff that gives the highest F1-score.

    Cutoffs 0.00, 0.01, ..., 0.99 are tried; a sample is labelled positive
    when its probability is >= the cutoff. On ties the smallest cutoff wins.

    Args:
        y_true: true binary labels.
        pred_probs: predicted positive-class probabilities (array-like that
            supports elementwise ``>=`` and ``.astype``).

    Returns:
        The cutoff with the maximum F1-score.
    """
    thresholds = np.arange(0, 1, 0.01)
    f1_by_threshold = []
    for cutoff in thresholds:
        hard_labels = (pred_probs >= cutoff).astype('int')
        f1_by_threshold.append(f1_score(y_true, hard_labels))
    winner = np.argmax(f1_by_threshold)
    return thresholds[winner]

In [17]:
%%time
# Baseline: XGBoost with (mostly) default hyperparameters, 5-fold stratified CV.
scores_f1 = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train[features], train[TARGET]
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    # cv.split yields positional indices, so select rows by position (.iloc);
    # .loc would only be correct while the frame keeps a default 0..n-1 index.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = XGBClassifier(
        objective='binary:logistic',
        tree_method='gpu_hist' if HAVE_GPU else 'hist',
        # The feature frame holds pandas 'category' columns, so categorical
        # support has to be on for the CPU ('hist') path as well as the GPU one.
        enable_categorical=True,
        eval_metric='aucpr',
        early_stopping_rounds=100,
        seed=SEED
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],  # validation fold drives early stopping
        verbose=0
    )
    val_probs = model.predict_proba(X_val)[:, 1]
    # The cutoff is tuned on the same fold it is scored on, so the reported
    # F1 is an optimistic estimate.
    best_threshold = get_best_threshold(y_val, val_probs)
    val_preds = (val_probs >= best_threshold).astype('int')
    score = f1_score(y_val, val_preds)
    scores_f1.append(score)
    print(f'Fold #{fold}: ({model.best_iteration} rounds) F1-score = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg F1-score = {np.mean(scores_f1):.5f} +/- {np.std(scores_f1):.5f}\n')

Fold #0: (5 rounds) F1-score = 0.16726
Fold #1: (2 rounds) F1-score = 0.16585
Fold #2: (1 rounds) F1-score = 0.17947
Fold #3: (9 rounds) F1-score = 0.16408
Fold #4: (12 rounds) F1-score = 0.17342

Avg F1-score = 0.17002 +/- 0.00568

CPU times: user 7.96 s, sys: 285 ms, total: 8.25 s
Wall time: 7.58 s


# Hyperparameter tuning

In [34]:
def objective(trial, data, model, proba):
    """Optuna objective: mean 5-fold CV score for one sampled hyperparameter set.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: tuple ``(X, y)`` of training features and labels (pandas objects).
        model: estimator exposing ``set_params``/``fit``/``predict``/
            ``predict_proba``. It is mutated in place: the sampled
            hyperparameters are set on it and it is refit on every fold.
        proba: if True, score folds with average precision on predicted
            probabilities; otherwise score with F1 on ``model.predict`` labels.

    Returns:
        Mean of the per-fold scores.

    Note:
        No intermediate value is reported to the trial (no ``trial.report``),
        so a pruner attached to the study has nothing to act on.
    """
    X, y = data

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.025, 0.3, step=0.025),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1), #complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.05), #L1-reg
        'lambda': trial.suggest_float('lambda', 1e-3, 1e5, log=True), #L2-reg
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.05),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0, step=0.05),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 15, step=0.05),
        'max_cat_to_onehot': trial.suggest_categorical('max_cat_to_onehot', [3, 6, 11])
    }
    # The sampled values are constant for the whole trial: set them once
    # instead of once per fold.
    model.set_params(**param_grid)

    scores = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # cv.split yields positional indices -> select by position (.iloc).
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        if proba:
            val_preds = model.predict_proba(X_val)[:, 1]
            scores.append(average_precision_score(y_val, val_preds))
        else:
            val_preds = model.predict(X_val)
            scores.append(f1_score(y_val, val_preds))

    return np.mean(scores)

In [35]:
def tune_params(data, model, proba, n_trials=10, direction='maximize'):
    """Run an in-memory Optuna study over `objective` and return it.

    Args:
        data: tuple forwarded unchanged to `objective`.
        model: estimator forwarded to `objective`.
        proba: scoring-mode flag forwarded to `objective`.
        n_trials: number of trials to run.
        direction: optimization direction passed to the study.

    Returns:
        The finished `optuna` study (seeded TPE sampler, Hyperband pruner).
    """
    def _evaluate(trial):
        # Bind the fixed arguments so Optuna only has to supply the trial.
        return objective(trial, data, model, proba)

    study = optuna.create_study(
        direction=direction,
        sampler=TPESampler(seed=SEED),
        pruner=HyperbandPruner(),
    )
    study.optimize(func=_evaluate, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [36]:
def cross_validate_predict(data, model, proba, n_splits=5):
    """Fit `model` on each stratified fold and predict validation and test rows.

    Args:
        data: tuple ``(X, y, X_test)`` of training features, training labels
            and test features (pandas objects).
        model: already-configured estimator; it is refit in place on every fold.
        proba: if True, predict probabilities and binarize them with the
            per-fold cutoff from `get_best_threshold` (tuned on that fold's
            validation data and reused for the test rows); otherwise use
            ``model.predict`` directly.
        n_splits: number of CV folds.

    Returns:
        oof_preds: Series of out-of-fold labels keyed by row position in X.
        test_preds: DataFrame with one column per fold (``fold0``, ...) plus
            ``mode``, the row-wise mode across the fold columns (first mode
            on ties).
    """
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores = [] #scores on validation set

    X, y, X_test = data

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices -> select by position (.iloc);
        # .loc is only correct while X keeps a default 0..n-1 index.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )

        if proba:
            val_probs = model.predict_proba(X_val)[:, 1]
            test_probs = model.predict_proba(X_test)[:, 1]
            best_threshold = get_best_threshold(y_val, val_probs)
            val_preds = (val_probs >= best_threshold).astype('int')
            test_preds_fold = (test_probs >= best_threshold).astype('int')
        else:
            val_preds = model.predict(X_val)
            test_preds_fold = model.predict(X_test)

        # Keys are row positions, matching the positional fold indices.
        oof_preds.update(dict(zip(val_idx, val_preds)))
        test_preds[f'fold{fold}'] = test_preds_fold

        score = f1_score(y_val, val_preds)
        scores.append(score)
        print(f'Fold #{fold}: F1 = {score:.5f}')
        _ = gc.collect()
    print(f'Avg. F1 = {np.mean(scores):.5f} +/- {np.std(scores):.5f}')

    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    # Majority vote across folds for the final test label.
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')

    return oof_preds, test_preds

In [37]:
def run_experiment(data, proba, n_trials=10):
    """Tune an XGBoost classifier with Optuna, then cross-validate and predict.

    Args:
        data: tuple ``(X, y, X_test)`` of training features, training labels
            and test features.
        proba: forwarded to tuning and cross-validation. True tunes on average
            precision and thresholds probabilities per fold; False tunes on F1
            of the model's hard predictions.
        n_trials: number of Optuna trials.

    Returns:
        ``(oof_preds, test_preds)`` as returned by `cross_validate_predict`,
        produced with the best hyperparameters found by the study.
    """
    X, y, X_test = data

    base_params = {
        'objective': 'binary:logistic',
        'n_estimators': 10000,  # upper bound; early stopping picks the actual count
        'booster': 'gbtree',
        'eval_metric': 'aucpr',
        'early_stopping_rounds': 100,
        'tree_method': 'gpu_hist' if HAVE_GPU else 'hist',
        'predictor': 'gpu_predictor' if HAVE_GPU else 'cpu_predictor',
        # The feature frames hold pandas 'category' columns, so categorical
        # support must be enabled on the CPU path too, not only with a GPU.
        'enable_categorical': True,
        'verbosity': 0,
        'seed': SEED
    }

    model = XGBClassifier(**base_params)

    print(f'----------Hyperparameter tuning----------')
    start = time.time()
    study = tune_params(
        data=(X, y),
        model=model,
        proba=proba,
        n_trials=n_trials,
        direction='maximize' #metric: f1-score/avg_precision -> higher is better
    )
    end = time.time()
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:15} - {v}')
    print(f'[Time taken: {end - start:.2f}s]\n')

    print(f'-----Cross-validation and prediction-----')
    start = time.time()
    # The model still carries the params of the LAST trial; overwrite them with
    # the best ones before the final cross-validation.
    model.set_params(**study.best_params)
    oof_preds, test_preds = cross_validate_predict(data, model, proba)
    end = time.time()
    print(f'[Time taken: {end - start:.2f}s]\n')

    return oof_preds, test_preds

**Trial runs**

In [38]:
# Show Optuna's per-trial log lines for the short trial runs below.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [39]:
%%time
# Smoke test of the pipeline with only 3 trials, tuning on F1 of the model's
# hard predictions (proba=False). Results are throwaway.
op, tp = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    proba=False,
    n_trials=3
)

[32m[I 2022-11-13 11:43:56,161][0m A new study created in memory with name: no-name-226b120f-34f9-4fa1-a234-63ec1b3bb133[0m


----------Hyperparameter tuning----------


[32m[I 2022-11-13 11:44:00,242][0m Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 0.17500000000000002, 'max_depth': 15, 'min_child_weight': 16, 'gamma': 5.6000000000000005, 'alpha': 1.1, 'lambda': 308.87067834937415, 'subsample': 0.55, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.8, 'colsample_bynode': 0.7, 'scale_pos_weight': 1.0, 'max_cat_to_onehot': 6}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-13 11:44:09,217][0m Trial 1 finished with value: 0.16540757962781721 and parameters: {'learning_rate': 0.2, 'max_depth': 15, 'min_child_weight': 18, 'gamma': 1.3, 'alpha': 1.4500000000000002, 'lambda': 0.20112938596732194, 'subsample': 0.95, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.55, 'colsample_bynode': 0.5, 'scale_pos_weight': 14.200000000000001, 'max_cat_to_onehot': 6}. Best is trial 1 with value: 0.16540757962781721.[0m
[32m[I 2022-11-13 11:44:12,855][0m Trial 2 finished with value: 0.0005333333333333334 and parameters: {'learning_rate': 0.2

Best trial: 1 -> Best value: 0.16541
Best hyperparameters:
learning_rate   - 0.2
max_depth       - 15
min_child_weight - 18
gamma           - 1.3
alpha           - 1.4500000000000002
lambda          - 0.20112938596732194
subsample       - 0.95
colsample_bytree - 0.8
colsample_bylevel - 0.55
colsample_bynode - 0.5
scale_pos_weight - 14.200000000000001
max_cat_to_onehot - 6
[Time taken: 16.76s]

-----Cross-validation and prediction-----
Fold #0: F1 = 0.16533
Fold #1: F1 = 0.16378
Fold #2: F1 = 0.17084
Fold #3: F1 = 0.15991
Fold #4: F1 = 0.16717
Avg. F1 = 0.16541 +/- 0.00362
[Time taken: 18.12s]

CPU times: user 37.3 s, sys: 728 ms, total: 38.1 s
Wall time: 34.9 s


In [40]:
%%time
# Smoke test with only 3 trials, tuning on average precision of predicted
# probabilities (proba=True); labels come from a per-fold F1-optimal cutoff.
op, tp = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    proba=True,
    n_trials=3
)

[32m[I 2022-11-13 11:44:31,068][0m A new study created in memory with name: no-name-58bef7d6-7690-41ff-9d66-d518faefca90[0m


----------Hyperparameter tuning----------


[32m[I 2022-11-13 11:44:34,180][0m Trial 0 finished with value: 0.10380622728647278 and parameters: {'learning_rate': 0.17500000000000002, 'max_depth': 15, 'min_child_weight': 16, 'gamma': 5.6000000000000005, 'alpha': 1.1, 'lambda': 308.87067834937415, 'subsample': 0.55, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.8, 'colsample_bynode': 0.7, 'scale_pos_weight': 1.0, 'max_cat_to_onehot': 6}. Best is trial 0 with value: 0.10380622728647278.[0m
[32m[I 2022-11-13 11:44:43,120][0m Trial 1 finished with value: 0.10666766850252214 and parameters: {'learning_rate': 0.2, 'max_depth': 15, 'min_child_weight': 18, 'gamma': 1.3, 'alpha': 1.4500000000000002, 'lambda': 0.20112938596732194, 'subsample': 0.95, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.55, 'colsample_bynode': 0.5, 'scale_pos_weight': 14.200000000000001, 'max_cat_to_onehot': 6}. Best is trial 1 with value: 0.10666766850252214.[0m
[32m[I 2022-11-13 11:44:46,833][0m Trial 2 finished with value: 0.10629359930563434 and par

Best trial: 1 -> Best value: 0.10667
Best hyperparameters:
learning_rate   - 0.2
max_depth       - 15
min_child_weight - 18
gamma           - 1.3
alpha           - 1.4500000000000002
lambda          - 0.20112938596732194
subsample       - 0.95
colsample_bytree - 0.8
colsample_bylevel - 0.55
colsample_bynode - 0.5
scale_pos_weight - 14.200000000000001
max_cat_to_onehot - 6
[Time taken: 15.85s]

-----Cross-validation and prediction-----
Fold #0: F1 = 0.16735
Fold #1: F1 = 0.16651
Fold #2: F1 = 0.17813
Fold #3: F1 = 0.17052
Fold #4: F1 = 0.18313
Avg. F1 = 0.17313 +/- 0.00647
[Time taken: 20.00s]

CPU times: user 38.1 s, sys: 575 ms, total: 38.7 s
Wall time: 35.9 s


In [41]:
# Only log Optuna errors from here on, so the 200-trial experiments do not flood the output.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Exp 1

In [42]:
%%time
# Experiment 1: 200 trials, hyperparameters tuned on F1 of hard predictions (proba=False).
# op1 = out-of-fold predictions, tp1 = per-fold + 'mode' test predictions.
op1, tp1 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    proba=False,
    n_trials=200
)

----------Hyperparameter tuning----------
Best trial: 144 -> Best value: 0.17686
Best hyperparameters:
learning_rate   - 0.125
max_depth       - 3
min_child_weight - 10
gamma           - 8.200000000000001
alpha           - 3.95
lambda          - 0.052999716771597524
subsample       - 0.8500000000000001
colsample_bytree - 0.75
colsample_bylevel - 0.9
colsample_bynode - 0.55
scale_pos_weight - 10.65
max_cat_to_onehot - 6
[Time taken: 757.00s]

-----Cross-validation and prediction-----
Fold #0: F1 = 0.17579
Fold #1: F1 = 0.17577
Fold #2: F1 = 0.18390
Fold #3: F1 = 0.16915
Fold #4: F1 = 0.17969
Avg. F1 = 0.17686 +/- 0.00488
[Time taken: 11.43s]

CPU times: user 14min 44s, sys: 10.1 s, total: 14min 54s
Wall time: 12min 48s


# Exp 2

In [43]:
%%time
# Experiment 2: 200 trials, hyperparameters tuned on average precision (proba=True);
# class labels come from a per-fold F1-optimal probability cutoff.
# op2 = out-of-fold predictions, tp2 = per-fold + 'mode' test predictions.
op2, tp2 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    proba=True,
    n_trials=200
)

----------Hyperparameter tuning----------
Best trial: 107 -> Best value: 0.11260
Best hyperparameters:
learning_rate   - 0.025
max_depth       - 6
min_child_weight - 5
gamma           - 12.0
alpha           - 4.55
lambda          - 99.88472972267857
subsample       - 0.95
colsample_bytree - 1.0
colsample_bylevel - 0.6
colsample_bynode - 0.55
scale_pos_weight - 14.600000000000001
max_cat_to_onehot - 3
[Time taken: 916.49s]

-----Cross-validation and prediction-----
Fold #0: F1 = 0.17251
Fold #1: F1 = 0.17122
Fold #2: F1 = 0.18737
Fold #3: F1 = 0.16955
Fold #4: F1 = 0.17867
Avg. F1 = 0.17587 +/- 0.00653
[Time taken: 15.49s]

CPU times: user 17min 29s, sys: 11.3 s, total: 17min 40s
Wall time: 15min 31s


# Submission files

In [44]:
# Colab-only: mount Google Drive so submission files can be written under /content/drive.
# This import is kept local to this cell because it only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [45]:
# Notebook identifier used to namespace this notebook's submission folder.
NOTEBOOK = '01'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/analytics_vidhya/dataverse_hack/submissions/nb_{NOTEBOOK}'
# Create the folder (and any missing parents); a no-op when it already exists,
# which keeps the cell safe to re-run.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [46]:
def create_submission_files(test_preds: pd.DataFrame, expt_num: str) -> None:
    """Write one submission CSV per column of `test_preds`.

    Each file is a copy of `sample_sub` with its TARGET column replaced by the
    predictions of one column, saved as
    ``{SUBMISSION_PATH}/{expt_num}_{col}.csv`` without the index.

    Args:
        test_preds: DataFrame whose columns each hold one set of test
            predictions (assigned by index alignment with `sample_sub`).
        expt_num: experiment tag used as the filename prefix. Callers pass
            zero-padded strings such as '01'; the value is only interpolated
            into the filename.
    """
    for col in test_preds.columns:
        sub = sample_sub.copy()
        sub[TARGET] = test_preds[col]
        sub.to_csv(f'{SUBMISSION_PATH}/{expt_num}_{col}.csv', index=False)

In [47]:
# Write the submission files for both experiments: one CSV per fold column
# plus one for the 'mode' (majority-vote) column of each prediction frame.
create_submission_files(tp1, '01')
create_submission_files(tp2, '02')