# Setup

In [1]:
import gc
import time
import warnings

gc.enable()
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from IPython.display import clear_output

pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import StratifiedKFold, KFold

import xgboost as xgb
import optuna

SEED = 2024

In [2]:
# Check GPU availability
import subprocess

try:
    subprocess.check_output('nvidia-smi')
    DEVICE = 'cuda'
except Exception:
    DEVICE = 'cpu'

print(f'Available device: {DEVICE}')

Available device: cuda


In [3]:
DATA_DIR = '/kaggle/input/playground-series-s4e4'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Data preparation

In [4]:
# dropping irrelevant columns
cols_to_drop = ['id']

train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [5]:
TARGET = 'Rings'
features = [f for f in test.columns]

In [6]:
cat_features = ['Sex']
train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')

# Hyperparameter tuning

In [7]:
def objective(trial, feature_set, model, cv, stratify_col):
    scores = []
    
    param_grid = {
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 64),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1), #complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.1), #L1-reg
        'lambda': trial.suggest_float('lambda', 5e-3, 5e3, log=True), #L2-reg
#         'max_delta_step': trial.suggest_float('max_delta_step', 0, 10, step=0.5),
        'max_cat_to_onehot': trial.suggest_categorical('max_cat_to_onehot', [1, 4])
    }
    
    for fold, (train_ids, val_ids) in enumerate(cv.split(train, stratify_col)):
        X_train, y_train = train[feature_set].iloc[train_ids], train[TARGET].iloc[train_ids]
        X_val, y_val = train[feature_set].iloc[val_ids], train[TARGET].iloc[val_ids]
        
        model.set_params(**param_grid)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)
    
        val_preds = model.predict(X_val)
        scores.append(comp_metric(y_val, val_preds))
        
    return np.mean(scores)

In [8]:
def tune_params(feature_set, model, cv, stratify_col, seed, n_trials, direction):
    study = optuna.create_study(
        sampler=optuna.samplers.TPESampler(
            consider_endpoints=True,
            multivariate=True,
            group=True,
            seed=seed),
        pruner=optuna.pruners.HyperbandPruner(),
        direction=direction)
    
    study.optimize(
        func=lambda trial: objective(
            trial, feature_set, model, cv, stratify_col),
        n_trials=n_trials,
        gc_after_trial=True)
    
    return study

# Modeling framework

In [9]:
def comp_metric(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred, squared=False)

In [10]:
def custom_cv(feature_set, model, cv, stratify_col, verbose=True):
    X_test = test[feature_set]
    
    oof_preds, test_preds = {}, {}
    scores = []

    for fold, (train_ids, val_ids) in enumerate(cv.split(train, stratify_col)):
        X_train, y_train = train[feature_set].iloc[train_ids], train[TARGET].iloc[train_ids]
        X_val, y_val = train[feature_set].iloc[val_ids], train[TARGET].iloc[val_ids]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False)

        val_preds = model.predict(X_val)
        oof_preds.update(dict(zip(val_ids, val_preds)))
        test_preds[f'fold{fold}'] = model.predict(X_test)

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration:>4} rounds)')
        _ = gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1) # mean of fold-wise predictions
    
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(train[TARGET], oof_preds):.5f}\n')
    
    return oof_preds, test_preds

In [11]:
def run_experiment(feature_set, cv, stratify_col=None, seed=SEED, n_trials=200):
    
    base_params = {
        'booster': 'gbtree',
        'tree_method': 'hist',
        'objective': 'reg:squaredlogerror',
        'eval_metric': 'rmsle',
        'learning_rate': 0.02,
        'n_estimators': 10000,
        'early_stopping_rounds': 100,
        'device': DEVICE,
        'enable_categorical': True,
        'verbosity': 0,
        'n_jobs': -1,
        'seed': seed
    }
    model = xgb.XGBRegressor(**base_params)
    
    start = time.time()
    study = tune_params(
        feature_set, model, cv, stratify_col, seed, n_trials, direction='minimize') 
    end = time.time()
    
    clear_output(wait=True)
    print(f'----------Hyperparameter tuning----------')
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')
    print(f'\n[Time taken: {end - start:.2f}s]\n')
    
    print(f'-----Cross-validation and prediction-----')
    start = time.time()
    
    model.set_params(**study.best_params)
    oof_preds, test_preds = custom_cv(feature_set, model, cv, stratify_col)
    
    end = time.time()
    print(f'\n[Time taken: {end - start:.2f}s]\n')
    
    return oof_preds, test_preds

In [12]:
def create_submission_files(preds, config, notebook='01'):
    for col in preds.columns:
        sub = sample_sub.copy()
        sub[TARGET] = preds[col]  # include postprocessing
        sub.to_csv(f'nb{notebook}_{config}_{col}.csv', index=False)

In [13]:
op = {}  # OOF preds
tp = {}  # Test preds

# Experiments

In [14]:
optuna.logging.set_verbosity(optuna.logging.INFO)
_ , _ = run_experiment(
    feature_set=features,
    cv= KFold(n_splits=5, shuffle=True, random_state=SEED),
    n_trials=3)

----------Hyperparameter tuning----------
Best trial: 0 -> Best value: 0.17434
Best hyperparameters:
max_depth            - 8
min_child_weight     - 46
subsample            - 0.65
colsample_bytree     - 0.6
gamma                - 4.1000000000000005
alpha                - 0.5
lambda               - 115.45450043256209
max_cat_to_onehot    - 1

[Time taken: 45.51s]

-----Cross-validation and prediction-----
Fold # 0: 0.17606 (1236 rounds)
Fold # 1: 0.17368 ( 952 rounds)
Fold # 2: 0.17503 (1877 rounds)
Fold # 3: 0.17478 (1982 rounds)
Fold # 4: 0.17217 (1630 rounds)

Avg score: 0.17434 +/- 0.00132
OOF score: 0.17435


[Time taken: 20.02s]



### KFold

In [15]:
%%time
feature_set = 'ALL'
folds = 10
seed = SEED

config = f'feat{feature_set}_fold{folds}_seed{seed}'

cv = KFold(n_splits=folds, shuffle=True, random_state=seed)

op[config], tp[config] = run_experiment(
    feature_set=features,
    cv=cv, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 71 -> Best value: 0.15006
Best hyperparameters:
max_depth            - 9
min_child_weight     - 3
subsample            - 0.95
colsample_bytree     - 0.95
gamma                - 0.0
alpha                - 2.1
lambda               - 0.2577647061346276
max_cat_to_onehot    - 4

[Time taken: 6631.59s]

-----Cross-validation and prediction-----
Fold # 0: 0.15308 (9122 rounds)
Fold # 1: 0.15090 (9999 rounds)
Fold # 2: 0.14976 (9996 rounds)
Fold # 3: 0.14892 (9959 rounds)
Fold # 4: 0.14992 (9999 rounds)
Fold # 5: 0.14927 (9999 rounds)
Fold # 6: 0.15110 (9972 rounds)
Fold # 7: 0.15054 (9979 rounds)
Fold # 8: 0.14697 (4446 rounds)
Fold # 9: 0.15019 (7489 rounds)

Avg score: 0.15006 +/- 0.00151
OOF score: 0.15007


[Time taken: 219.12s]

CPU times: user 2h 6min 1s, sys: 29 s, total: 2h 6min 30s
Wall time: 1h 54min 12s


In [16]:
%%time
feature_set = 'ALL'
folds = 15
seed = SEED

config = f'feat{feature_set}_fold{folds}_seed{seed}'

cv = KFold(n_splits=folds, shuffle=True, random_state=seed)

op[config], tp[config] = run_experiment(
    feature_set=features,
    cv=cv, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 155 -> Best value: 0.14931
Best hyperparameters:
max_depth            - 7
min_child_weight     - 3
subsample            - 0.85
colsample_bytree     - 0.6
gamma                - 0.0
alpha                - 1.0
lambda               - 0.1408487297538937
max_cat_to_onehot    - 4

[Time taken: 11570.31s]

-----Cross-validation and prediction-----
Fold # 0: 0.15094 (5383 rounds)
Fold # 1: 0.15372 (4521 rounds)
Fold # 2: 0.14895 (6310 rounds)
Fold # 3: 0.14764 (4848 rounds)
Fold # 4: 0.15197 (8455 rounds)
Fold # 5: 0.14587 (6262 rounds)
Fold # 6: 0.14970 (5934 rounds)
Fold # 7: 0.14828 (5930 rounds)
Fold # 8: 0.14843 (6541 rounds)
Fold # 9: 0.15050 (6630 rounds)
Fold #10: 0.14635 (6279 rounds)
Fold #11: 0.15272 (7526 rounds)
Fold #12: 0.14495 (3680 rounds)
Fold #13: 0.15070 (4989 rounds)
Fold #14: 0.14895 (5604 rounds)

Avg score: 0.14931 +/- 0.00243
OOF score: 0.14933


[Time taken: 283.43s]

CPU times: user 3h 34min 45s, sys: 1min 26s, to

### StratifiedKFold

In [17]:
%%time
feature_set = 'ALL'
folds = 10
stratify = 'SEX'
seed = SEED

config = f'feat{feature_set}_fold{folds}_skf{stratify}_seed{seed}'

cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

op[config], tp[config] = run_experiment(
    feature_set=features,
    cv=cv,
    stratify_col=train['Sex'],
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 171 -> Best value: 0.14915
Best hyperparameters:
max_depth            - 6
min_child_weight     - 2
subsample            - 1.0
colsample_bytree     - 0.6
gamma                - 0.0
alpha                - 0.0
lambda               - 0.05975753432641861
max_cat_to_onehot    - 4

[Time taken: 8024.47s]

-----Cross-validation and prediction-----
Fold # 0: 0.14673 (2153 rounds)
Fold # 1: 0.14938 (2264 rounds)
Fold # 2: 0.14968 (2967 rounds)
Fold # 3: 0.15255 (2141 rounds)
Fold # 4: 0.15060 (2104 rounds)
Fold # 5: 0.14934 (3415 rounds)
Fold # 6: 0.15098 (1602 rounds)
Fold # 7: 0.14935 (2911 rounds)
Fold # 8: 0.14486 (2507 rounds)
Fold # 9: 0.14802 (2223 rounds)

Avg score: 0.14915 +/- 0.00207
OOF score: 0.14916


[Time taken: 73.08s]

CPU times: user 2h 26min 52s, sys: 27.8 s, total: 2h 27min 19s
Wall time: 2h 14min 58s


In [18]:
%%time
feature_set = 'ALL'
folds = 15
stratify = 'SEX'
seed = SEED

config = f'feat{feature_set}_fold{folds}_skf{stratify}_seed{seed}'

cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

op[config], tp[config] = run_experiment(
    feature_set=features,
    cv=cv,
    stratify_col=train['Sex'],
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 155 -> Best value: 0.14935
Best hyperparameters:
max_depth            - 7
min_child_weight     - 3
subsample            - 0.85
colsample_bytree     - 0.6
gamma                - 0.0
alpha                - 1.0
lambda               - 0.1408487297538937
max_cat_to_onehot    - 4

[Time taken: 11566.22s]

-----Cross-validation and prediction-----
Fold # 0: 0.14693 (5550 rounds)
Fold # 1: 0.14864 (7953 rounds)
Fold # 2: 0.14895 (5459 rounds)
Fold # 3: 0.15129 (3809 rounds)
Fold # 4: 0.14907 (9494 rounds)
Fold # 5: 0.15362 (6918 rounds)
Fold # 6: 0.15047 (4205 rounds)
Fold # 7: 0.15190 (7616 rounds)
Fold # 8: 0.14859 (7156 rounds)
Fold # 9: 0.15115 (3820 rounds)
Fold #10: 0.14934 (3359 rounds)
Fold #11: 0.15020 (6516 rounds)
Fold #12: 0.14705 (3691 rounds)
Fold #13: 0.14363 (3671 rounds)
Fold #14: 0.14935 (8619 rounds)

Avg score: 0.14935 +/- 0.00229
OOF score: 0.14936


[Time taken: 279.66s]

CPU times: user 3h 34min 26s, sys: 1min 41s, to