# Setup

In [1]:
import gc
import time
import warnings
import subprocess

gc.enable()
# NOTE(review): this silences every Python warning raised from here on,
# including deprecation warnings from the libraries imported below.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
import optuna

# Random seed used for the CV splitters, the XGBoost models and the Optuna sampler.
SEED = 55

In [2]:
xgb.__version__

'2.0.3'

In [3]:
# Choose the XGBoost device: CUDA only if `nvidia-smi` runs successfully,
# otherwise fall back to CPU.
DEVICE = 'cpu'
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    pass  # binary missing or non-zero exit -> keep the CPU default
else:
    DEVICE = 'cuda'

print(f'Available device: {DEVICE}')

Available device: cuda


# Data preparation

In [4]:
DATA_DIR = '/kaggle/input/ml-olympiad-smoking'

# Competition files: training set, test set and the submission template.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# External dataset; its rows are appended to the training folds when `extend=True`.
original = pd.read_csv('/kaggle/input/smoker-status-prediction-using-biosignals/train_dataset.csv')

In [5]:
# Label column predicted by the models.
TARGET = 'smoking'
# Model inputs: every test-set column except the row identifier.
FEATURES = [col for col in test.columns if col != 'id']
# Columns that are cast to the pandas `category` dtype before modelling.
CAT_FEATURES = ['hearing(left)', 'hearing(right)', 'dental caries']

In [6]:
def _unify_categoricals(frames, columns):
    """Cast `columns` to one shared categorical dtype across all `frames`, in place.

    Casting each frame separately derives the category list from that frame's
    own values, so the same raw value can end up with a different integer code
    in different frames whenever one of them lacks a level. Building the dtype
    from the pooled values keeps the encoding identical in every frame, which
    XGBoost's categorical support relies on between training and prediction.
    When all frames already contain the same levels the result is the same as
    a plain per-frame `astype('category')`.

    Args:
        frames: DataFrames to modify; each must contain every name in `columns`.
        columns: names of the columns to convert.
    """
    for column in columns:
        pooled = pd.concat([frame[column] for frame in frames], ignore_index=True)
        shared_dtype = pooled.astype('category').dtype
        for frame in frames:
            frame[column] = frame[column].astype(shared_dtype)


_unify_categoricals([train, test, original], CAT_FEATURES)

# Baseline

In [7]:
# competition metric
def comp_metric(y_true, y_pred):
    """Return the ROC AUC of the scores `y_pred` against the labels `y_true`."""
    auc = roc_auc_score(y_true, y_pred)
    return auc

In [8]:
# Baseline: 5-fold stratified CV of an XGBoost classifier with fixed settings.
X, y = train[FEATURES], train[TARGET]
oof_preds = {}  # positional row index -> out-of-fold probability

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for fold, (train_ids, val_ids) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_ids], X.iloc[val_ids]
    y_train, y_val = y.iloc[train_ids], y.iloc[val_ids]

    model = xgb.XGBClassifier(
        # learning task
        objective='binary:logistic',
        eval_metric='auc',
        # boosting schedule (early stopping monitors the validation fold)
        n_estimators=1000,
        learning_rate=0.1,
        early_stopping_rounds=100,
        # tree construction
        booster='gbtree',
        tree_method='hist',
        enable_categorical=True,
        # runtime
        device=DEVICE,
        n_jobs=-1,
        verbosity=0,
        random_state=SEED)

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Probability of the positive class for the held-out rows.
    val_preds = model.predict_proba(X_val)[:, 1]
    oof_preds.update(zip(val_ids, val_preds))

    score = comp_metric(y_val, val_preds)
    print(f'Fold #{fold}: {score:.5f} ({model.best_iteration} rounds)')
    _ = gc.collect()

# Restore the original row order before scoring all folds together.
oof_preds = pd.Series(oof_preds).sort_index()
print(f'\nOOF score: {comp_metric(y, oof_preds):.5f}\n')

Fold #0: 0.86828 (593 rounds)
Fold #1: 0.86782 (569 rounds)
Fold #2: 0.86962 (522 rounds)
Fold #3: 0.87113 (585 rounds)
Fold #4: 0.86764 (589 rounds)

OOF score: 0.86889



# Hyperparameter tuning

In [9]:
def objective(trial, features, model, extend, folds, seed):
    """Optuna objective: pooled out-of-fold AUC of `model` for one sampled configuration.

    Samples XGBoost hyperparameters from `trial`, applies them to `model`
    (which is modified in place), fits it once per fold of a stratified
    `folds`-fold split of `train` seeded with `seed`, and scores the pooled
    out-of-fold predictions with `comp_metric`.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        features: column names used as model inputs.
        model: estimator exposing `set_params`, `fit` and `predict_proba`.
        extend: if truthy, the rows of `original` are appended to every
            training fold (never to a validation fold).
        folds: number of CV splits.
        seed: random state of the CV splitter.

    Returns:
        The competition metric over all out-of-fold predictions.
    """
    X, y = train[features], train[TARGET]

    # The suggest_* calls are listed in a fixed order; keep it stable so a
    # seeded sampler draws the same values.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, step=0.025),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 64),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1),  # complexity control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.1),  # L1 regularisation
        'lambda': trial.suggest_float('lambda', 5e-3, 5e3, log=True),  # L2 regularisation
        # 'max_delta_step': trial.suggest_float('max_delta_step', 0, 10, step=0.5),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 1.5, step=0.05)
    }

    oof = {}  # positional row index -> out-of-fold probability
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for trn_idx, val_idx in splitter.split(train, train[TARGET]):
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if extend:
            # External rows go into the training part only.
            X_trn = pd.concat([X_trn, original[features]], axis=0, ignore_index=True)
            y_trn = pd.concat([y_trn, original[TARGET]], axis=0, ignore_index=True)

        model.set_params(**params)
        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=0)

        oof.update(zip(val_idx, model.predict_proba(X_val)[:, 1]))

    return comp_metric(train[TARGET], pd.Series(oof).sort_index())

In [10]:
def tune_params(features, model, extend, folds, seed, n_trials, direction):
    """Run an Optuna study over `objective` and return the finished study.

    Args:
        features, model, extend, folds, seed: forwarded unchanged to
            `objective` for every trial; `seed` also seeds the TPE sampler.
        n_trials: number of trials to run.
        direction: optimisation direction passed to `optuna.create_study`.

    Returns:
        The `optuna` study after `n_trials` trials.
    """
    tpe_sampler = optuna.samplers.TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=seed)

    # NOTE(review): `objective` never calls trial.report() or
    # trial.should_prune(), so no trial can actually be stopped early by this
    # pruner -- confirm whether pruning was intended.
    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=optuna.pruners.HyperbandPruner(),
        direction=direction)

    def _run_trial(trial):
        return objective(trial, features, model, extend, folds, seed)

    study.optimize(func=_run_trial, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation framework

In [11]:
def custom_cv(features, model, extend, folds, seed, verbose=True):
    """Cross-validate `model` on `train` and predict `test` with every fold model.

    Args:
        features: column names used as model inputs.
        model: estimator exposing `fit`, `predict_proba` and `best_iteration`.
        extend: if truthy, the rows of `original` are appended to every
            training fold (never to a validation fold).
        folds: number of stratified CV splits.
        seed: random state of the CV splitter.
        verbose: print one score line per fold. The final OOF / average
            summary lines are printed regardless of this flag.

    Returns:
        (oof_preds, test_preds): a Series of out-of-fold probabilities sorted
        by positional row index, and a DataFrame with one `fold{i}` column per
        fold plus a `mean` column averaging them row-wise.
    """
    X, y = train[features], train[TARGET]
    X_test = test[features]

    oof_preds, fold_test_preds, scores = {}, {}, []

    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(splitter.split(train, train[TARGET])):
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if extend:
            # External rows go into the training part only.
            X_trn = pd.concat([X_trn, original[features]], axis=0, ignore_index=True)
            y_trn = pd.concat([y_trn, original[TARGET]], axis=0, ignore_index=True)

        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=0)

        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds.update(zip(val_idx, val_preds))
        fold_test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]

        scores.append(comp_metric(y_val, val_preds))
        if verbose:
            print(f'Fold #{fold}: {scores[-1]:.5f} ({model.best_iteration} rounds)')

        gc.collect()

    test_preds = pd.DataFrame(fold_test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)  # row-wise mean over the fold columns

    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nOOF score: {comp_metric(train[TARGET], oof_preds):.5f}')
    print(f'Avg. score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')

    return oof_preds, test_preds

In [12]:
def run_experiment(features, extend=False, folds=5, seed=SEED, n_trials=50):
    """Tune an XGBoost classifier with Optuna, then cross-validate the best configuration.

    Args:
        features: column names used as model inputs.
        extend: forwarded to tuning and CV; when truthy the `original` rows
            are added to the training folds.
        folds: number of CV splits used for both tuning and the final CV.
        seed: seed for the model, the CV splitter and the Optuna sampler.
        n_trials: number of Optuna trials.

    Returns:
        The `(oof_preds, test_preds)` pair produced by `custom_cv`.
    """
    # Settings shared by every trial; the tuned parameters are layered on top.
    model = xgb.XGBClassifier(
        booster='gbtree',
        tree_method='hist',
        objective='binary:logistic',
        eval_metric='auc',
        n_estimators=10000,
        early_stopping_rounds=100,
        device=DEVICE,
        enable_categorical=True,
        max_cat_to_onehot=2,
        verbosity=0,
        n_jobs=-1,
        seed=seed)

    print('----------Hyperparameter tuning----------')
    tuning_started = time.time()

    # The metric is AUC, so higher is better.
    study = tune_params(features, model, extend, folds, seed, n_trials, direction='maximize')

    tuning_finished = time.time()

    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for name, value in study.best_params.items():
        print(f'{name:20} - {value}')
    print(f'\n[Time taken: {tuning_finished - tuning_started:.2f}s]\n')

    print('-----Cross-validation and prediction-----')
    cv_started = time.time()

    # The tuning loop left the last trial's parameters on the model; overwrite
    # them with the best ones before the final CV.
    model.set_params(**study.best_params)
    results = custom_cv(features, model, extend, folds, seed)

    cv_finished = time.time()
    print(f'\n[Time taken: {cv_finished - cv_started:.2f}s]\n')

    oof_preds, test_preds = results
    return oof_preds, test_preds

**Submission files**

In [13]:
def create_submission_files(preds, config, notebook='01'):
    """Write `<notebook>_<config>.csv` containing the fold-averaged test predictions.

    Args:
        preds: DataFrame with a 'mean' column holding one prediction per row
            of `sample_sub` (assumed to be in the same row order, as produced
            by `custom_cv` on `test`).
        config: experiment tag used in the file name.
        notebook: notebook prefix used in the file name.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in `sample_sub`.
    """
    mean_preds = preds['mean']
    if len(mean_preds) != len(sample_sub):
        raise ValueError(
            f'Expected {len(sample_sub)} predictions, got {len(mean_preds)}')

    sub = sample_sub.copy()
    # Assign by position: assigning a Series aligns on index labels and would
    # silently write NaN for any label that does not match.
    sub[TARGET] = np.asarray(mean_preds)
    sub.to_csv(f'{notebook}_{config}.csv', index=False)

**Trial run:**

In [14]:
# Show Optuna's per-trial log lines during the short trial run below.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [15]:
# Smoke test of the whole pipeline with only 3 tuning trials; the returned
# predictions are discarded.
_, _ = run_experiment(
    features=FEATURES,
    extend=True,
    n_trials=3)

[I 2024-03-08 18:05:29,351] A new study created in memory with name: no-name-9e888eb0-84db-4bc6-826e-04ffaf5bc4bc


----------Hyperparameter tuning----------


[I 2024-03-08 18:06:09,753] Trial 0 finished with value: 0.8695097050002643 and parameters: {'learning_rate': 0.07500000000000001, 'max_depth': 12, 'min_child_weight': 32, 'subsample': 0.7, 'colsample_bytree': 0.8, 'gamma': 5.7, 'alpha': 4.3, 'lambda': 0.008822078646927935, 'scale_pos_weight': 1.05}. Best is trial 0 with value: 0.8695097050002643.
[I 2024-03-08 18:06:22,715] Trial 1 finished with value: 0.8579173074853048 and parameters: {'learning_rate': 0.25, 'max_depth': 3, 'min_child_weight': 50, 'subsample': 0.6, 'colsample_bytree': 0.85, 'gamma': 16.400000000000002, 'alpha': 4.5, 'lambda': 4098.637364141357, 'scale_pos_weight': 1.25}. Best is trial 0 with value: 0.8695097050002643.
[I 2024-03-08 18:06:34,834] Trial 2 finished with value: 0.8650116225118322 and parameters: {'learning_rate': 0.125, 'max_depth': 11, 'min_child_weight': 27, 'subsample': 0.85, 'colsample_bytree': 0.6, 'gamma': 16.2, 'alpha': 2.1, 'lambda': 11.226623011877013, 'scale_pos_weight': 1.4}. Best is trial 0 

Best trial: 0 -> Best value: 0.86951
Best hyperparameters:
learning_rate        - 0.07500000000000001
max_depth            - 12
min_child_weight     - 32
subsample            - 0.7
colsample_bytree     - 0.8
gamma                - 5.7
alpha                - 4.3
lambda               - 0.008822078646927935
scale_pos_weight     - 1.05

[Time taken: 65.57s]

-----Cross-validation and prediction-----
Fold #0: 0.86911 (1490 rounds)
Fold #1: 0.86789 (1206 rounds)
Fold #2: 0.87082 (2317 rounds)
Fold #3: 0.87157 (2375 rounds)
Fold #4: 0.86822 (1812 rounds)

OOF score: 0.86951
Avg. score: 0.86952 +/- 0.00144

[Time taken: 41.15s]



In [16]:
# From here on only report Optuna errors (hides the per-trial log lines).
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Experiments

In [17]:
# Results of each experiment, keyed by the experiment's `config` string.
op = {} # Train-set OOF predictions
tp = {} # Test-set predictions

### Number of folds

In [18]:
%%time
# Experiment: competition training data only, full feature list, 7 folds.
# `dataset` and `feature_set` are labels only; they feed the `config` tag.
dataset = 'trn'
feature_set = 'all'
folds = 7
seed = SEED
# Key for `op`/`tp` and part of the submission file name.
config = f'{dataset}_{feature_set}_f{folds}_s{seed}'

op[config], tp[config] = run_experiment(
    features=FEATURES,  
    extend=False,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 36 -> Best value: 0.87184
Best hyperparameters:
learning_rate        - 0.05
max_depth            - 5
min_child_weight     - 26
subsample            - 0.65
colsample_bytree     - 0.85
gamma                - 1.3
alpha                - 2.6
lambda               - 55.11308203855102
scale_pos_weight     - 1.0

[Time taken: 2548.71s]

-----Cross-validation and prediction-----
Fold #0: 0.87097 (1704 rounds)
Fold #1: 0.87117 (1874 rounds)
Fold #2: 0.86932 (1614 rounds)
Fold #3: 0.87547 (2114 rounds)
Fold #4: 0.87207 (1624 rounds)
Fold #5: 0.87325 (1587 rounds)
Fold #6: 0.87070 (2119 rounds)

OOF score: 0.87184
Avg. score: 0.87185 +/- 0.00186

[Time taken: 88.38s]

CPU times: user 48min 20s, sys: 21.6 s, total: 48min 41s
Wall time: 43min 57s


In [19]:
%%time
# Experiment: competition training data only, full feature list, 15 folds.
# `dataset` and `feature_set` are labels only; they feed the `config` tag.
dataset = 'trn'
feature_set = 'all'
folds = 15
seed = SEED
# Key for `op`/`tp` and part of the submission file name.
config = f'{dataset}_{feature_set}_f{folds}_s{seed}'

op[config], tp[config] = run_experiment(
    features=FEATURES,  
    extend=False,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 41 -> Best value: 0.87246
Best hyperparameters:
learning_rate        - 0.05
max_depth            - 9
min_child_weight     - 53
subsample            - 0.75
colsample_bytree     - 0.6
gamma                - 2.0
alpha                - 4.800000000000001
lambda               - 72.1987075080367
scale_pos_weight     - 1.1

[Time taken: 5030.60s]

-----Cross-validation and prediction-----
Fold #0: 0.87127 (1419 rounds)
Fold #1: 0.87396 (1410 rounds)
Fold #2: 0.87122 (1307 rounds)
Fold #3: 0.87011 (1406 rounds)
Fold #4: 0.87207 (1255 rounds)
Fold #5: 0.87028 (2664 rounds)
Fold #6: 0.87244 (1808 rounds)
Fold #7: 0.87326 (1232 rounds)
Fold #8: 0.87564 (1688 rounds)
Fold #9: 0.87328 (1381 rounds)
Fold #10: 0.87439 (1388 rounds)
Fold #11: 0.87646 (1418 rounds)
Fold #12: 0.87090 (1665 rounds)
Fold #13: 0.86473 (1352 rounds)
Fold #14: 0.87739 (2061 rounds)

OOF score: 0.87246
Avg. score: 0.87249 +/- 0.00298

[Time taken: 191.54s]

CPU times: user 

### Extend the training data with the original dataset

In [20]:
%%time
# Experiment: training folds extended with the `original` rows (extend=True),
# full feature list, 7 folds.
# `dataset` and `feature_set` are labels only; they feed the `config` tag.
dataset = 'ext'
feature_set = 'all'
folds = 7
seed = SEED
# Key for `op`/`tp` and part of the submission file name.
config = f'{dataset}_{feature_set}_f{folds}_s{seed}'

op[config], tp[config] = run_experiment(
    features=FEATURES,  
    extend=True,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 47 -> Best value: 0.87429
Best hyperparameters:
learning_rate        - 0.05
max_depth            - 8
min_child_weight     - 61
subsample            - 0.65
colsample_bytree     - 0.65
gamma                - 1.5
alpha                - 3.7
lambda               - 8.687426597336362
scale_pos_weight     - 1.1

[Time taken: 3006.65s]

-----Cross-validation and prediction-----
Fold #0: 0.87363 (985 rounds)
Fold #1: 0.87352 (1151 rounds)
Fold #2: 0.87274 (1248 rounds)
Fold #3: 0.87725 (1289 rounds)
Fold #4: 0.87424 (1454 rounds)
Fold #5: 0.87571 (1291 rounds)
Fold #6: 0.87301 (1151 rounds)

OOF score: 0.87429
Avg. score: 0.87430 +/- 0.00150

[Time taken: 95.11s]

CPU times: user 56min 56s, sys: 12.4 s, total: 57min 8s
Wall time: 51min 41s


### Reduced feature set

In [21]:
# Full feature list minus six columns, used by the reduced-feature experiment.
reduced_features = [
    col for col in FEATURES
    if col not in ('hearing(left)', 'hearing(right)', 'Urine protein', 'AST', 'Systolic', 'Cholesterol')
]

In [22]:
%%time
# Experiment: training folds extended with the `original` rows (extend=True),
# reduced feature list, 7 folds.
# `dataset` and `feature_set` are labels only; they feed the `config` tag.
dataset = 'ext'
feature_set = 'red'
folds = 7
seed = SEED
# Key for `op`/`tp` and part of the submission file name.
config = f'{dataset}_{feature_set}_f{folds}_s{seed}'

op[config], tp[config] = run_experiment(
    features=reduced_features,  
    extend=True,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 48 -> Best value: 0.87278
Best hyperparameters:
learning_rate        - 0.05
max_depth            - 7
min_child_weight     - 8
subsample            - 0.75
colsample_bytree     - 0.8
gamma                - 1.8
alpha                - 3.2
lambda               - 1.1825650047713019
scale_pos_weight     - 1.1

[Time taken: 2420.62s]

-----Cross-validation and prediction-----
Fold #0: 0.87160 (1407 rounds)
Fold #1: 0.87202 (1125 rounds)
Fold #2: 0.87080 (1183 rounds)
Fold #3: 0.87595 (1506 rounds)
Fold #4: 0.87288 (1347 rounds)
Fold #5: 0.87441 (1095 rounds)
Fold #6: 0.87188 (1173 rounds)

OOF score: 0.87278
Avg. score: 0.87279 +/- 0.00166

[Time taken: 76.29s]

CPU times: user 45min 59s, sys: 6.41 s, total: 46min 5s
Wall time: 41min 37s


**Time to submit!**