# Setup

In [1]:
import gc
import time
import warnings

# Garbage collection is switched on explicitly; custom_cv also triggers
# gc.collect() after every fold.
gc.enable()
# NOTE(review): this silences every warning for the whole notebook, which
# includes deprecation notices raised by the libraries imported below.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from IPython.display import clear_output

# Display all columns and use 4 digits of precision when rendering frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

import xgboost as xgb
import optuna

# Default seed: used for the CV splitters in the experiment cells and as the
# default `seed` of run_experiment (Optuna sampler and XGBoost model).
SEED = 2024

In [2]:
# Pick the training device: 'cuda' when the `nvidia-smi` command runs
# successfully, 'cpu' when it is missing or exits with an error.
import subprocess

try:
    subprocess.check_output('nvidia-smi')
except Exception:
    DEVICE = 'cpu'
else:
    DEVICE = 'cuda'

print(f'Available device: {DEVICE}')

Available device: cuda


In [3]:
DATA_DIR = '/kaggle/input/playground-series-s4e4'

# Read the three competition CSV files stored under DATA_DIR.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{name}.csv')
    for name in ('train', 'test', 'sample_submission')
)

# Data preparation

In [4]:
# Dropping irrelevant columns.
# errors='ignore' keeps this cell idempotent: running it again after the
# columns have already been removed no longer raises a KeyError.
cols_to_drop = ['id']

train = train.drop(columns=cols_to_drop, errors='ignore')
test = test.drop(columns=cols_to_drop, errors='ignore')

In [5]:
# Name of the column to predict.
TARGET = 'Rings'
# Every column present in the test frame is used as a model input.
features = list(test.columns)

In [6]:
# log-transforming the target to optimize for RMSE instead of RMSLE
# (create_submission_files maps predictions back with np.expm1).
# NOTE(review): train[TARGET] is overwritten in place, so re-running this cell
# applies log1p a second time; run it exactly once per kernel session.
train[TARGET] = np.log1p(train[TARGET])

In [7]:
cat_features = ['Sex']

# Cast train and test with ONE shared categorical dtype per column.
# Casting each frame independently derives the categories from that frame's
# own values, so the two frames could end up with different category lists
# (and therefore different integer codes) whenever their sets of levels differ.
# The categories are taken from train; a test value that never occurs in
# train becomes NaN (missing) after the cast.
for col in cat_features:
    shared_dtype = pd.CategoricalDtype(train[col].astype('category').cat.categories)
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

# Hyperparameter tuning

In [8]:
def objective(trial, feature_set, model, cv, stratify_col):
    """Optuna objective: mean cross-validated score of one sampled configuration.

    Samples a set of XGBoost hyperparameters from `trial`, applies them to
    `model`, then fits and scores the model on every split produced by
    `cv.split(train, stratify_col)`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        feature_set: Column names of the global `train` frame used as inputs.
        model: Estimator exposing `set_params`, `fit` (with `eval_set`) and
            `predict`. It is mutated in place: the sampled parameters stay set
            on it after the call.
        cv: Splitter exposing `split(X, y)`.
        stratify_col: Passed as `y` to `cv.split`; may be None.

    Returns:
        Mean of the per-fold `comp_metric` scores.

    NOTE(review): no intermediate value is reported through `trial.report`,
    so the pruner configured in `tune_params` never stops a trial early.
    """
    param_grid = {
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 64),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1), #complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.1), #L1-reg
        'lambda': trial.suggest_float('lambda', 5e-3, 5e3, log=True), #L2-reg
#         'max_delta_step': trial.suggest_float('max_delta_step', 0, 10, step=0.5),
        'max_cat_to_onehot': trial.suggest_categorical('max_cat_to_onehot', [1, 4])
    }

    # The hyperparameters are fixed for the whole trial, so they are applied
    # once here instead of once per fold.
    model.set_params(**param_grid)

    # Column selection does not depend on the fold either; slice it only once.
    X, y = train[feature_set], train[TARGET]

    scores = []
    for train_ids, val_ids in cv.split(train, stratify_col):
        X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]

        model.fit(
            X.iloc[train_ids], y.iloc[train_ids],
            eval_set=[(X_val, y_val)],
            verbose=0)

        scores.append(comp_metric(y_val, model.predict(X_val)))

    return np.mean(scores)

In [9]:
def tune_params(feature_set, model, cv, stratify_col, seed, n_trials, direction):
    """Create an Optuna study, optimize `objective` with it and return the study.

    Args:
        feature_set, model, cv, stratify_col: Forwarded unchanged to `objective`.
        seed: Seed of the TPE sampler.
        n_trials: Number of trials to run.
        direction: Optimization direction handed to `optuna.create_study`.

    Returns:
        The optimized `optuna` study.
    """
    tpe_sampler = optuna.samplers.TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=seed)
    hyperband = optuna.pruners.HyperbandPruner()

    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=hyperband,
        direction=direction)

    def _score_trial(trial):
        # Bind the fixed experiment arguments; Optuna only supplies `trial`.
        return objective(trial, feature_set, model, cv, stratify_col)

    study.optimize(
        func=_score_trial,
        n_trials=n_trials,
        gc_after_trial=True)

    return study

# Modeling framework

In [10]:
def comp_metric(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    The square root is taken explicitly instead of passing `squared=False`
    to `mean_squared_error`: that keyword is deprecated in scikit-learn 1.4
    and removed in 1.6, so the previous call fails on newer releases.
    For the 1-D targets used in this notebook the result is the same.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The RMSE as a float.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [11]:
def custom_cv(feature_set, model, cv, stratify_col, verbose=True):
    """Cross-validate `model` and collect out-of-fold and test predictions.

    For every split from `cv.split(train, stratify_col)` the model is refit on
    the training rows (validation rows serve as `eval_set`), then predicts the
    validation rows and the whole test set.

    Args:
        feature_set: Column names of the global `train`/`test` frames to use.
        model: Estimator exposing `fit`, `predict` and `best_iteration`.
        cv: Splitter exposing `split(X, y)`.
        stratify_col: Passed as `y` to `cv.split`; may be None.
        verbose: When true, print one score line per fold. The summary lines
            are printed regardless.

    Returns:
        Tuple `(oof_preds, test_preds)`: a Series of out-of-fold predictions
        sorted by row position, and a DataFrame with one column per fold plus
        a 'mean' column averaging them.
    """
    X_test = test[feature_set]
    X_all, y_all = train[feature_set], train[TARGET]

    fold_scores = []
    oof_by_row = {}
    test_by_fold = {}

    for fold, (train_ids, val_ids) in enumerate(cv.split(train, stratify_col)):
        X_val, y_val = X_all.iloc[val_ids], y_all.iloc[val_ids]

        model.fit(
            X_all.iloc[train_ids], y_all.iloc[train_ids],
            eval_set=[(X_val, y_val)],
            verbose=False)

        val_preds = model.predict(X_val)
        # Key each validation prediction by its row position in `train`.
        for row, pred in zip(val_ids, val_preds):
            oof_by_row[row] = pred
        test_by_fold[f'fold{fold}'] = model.predict(X_test)

        fold_score = comp_metric(y_val, val_preds)
        fold_scores.append(fold_score)
        if verbose:
            print(f'Fold #{fold:>2}: {fold_score:.5f} ({model.best_iteration:>4} rounds)')
        gc.collect()

    test_preds = pd.DataFrame(test_by_fold)
    test_preds['mean'] = test_preds.mean(axis=1)  # average over the fold columns

    oof_preds = pd.Series(oof_by_row).sort_index()
    print(f'\nAvg score: {np.mean(fold_scores):.5f} +/- {np.std(fold_scores):.5f}')
    print(f'OOF score: {comp_metric(train[TARGET], oof_preds):.5f}\n')

    return oof_preds, test_preds

In [12]:
def run_experiment(feature_set, cv, stratify_col=None, seed=SEED, n_trials=200):
    """Tune an XGBoost regressor with Optuna, then cross-validate the best setup.

    Args:
        feature_set: Column names used as model inputs.
        cv: Splitter used for both tuning and the final cross-validation.
        stratify_col: Optional values passed as `y` to `cv.split`.
        seed: Seed for the Optuna sampler and the XGBoost model.
        n_trials: Number of Optuna trials.

    Returns:
        The `(oof_preds, test_preds)` pair produced by `custom_cv`.
    """
    model = xgb.XGBRegressor(
        booster='gbtree',
        tree_method='hist',
        objective='reg:squarederror',
        eval_metric='rmse',
        learning_rate=0.02,
        n_estimators=10000,
        early_stopping_rounds=100,
        device=DEVICE,
        enable_categorical=True,
        verbosity=0,
        n_jobs=-1,
        seed=seed)

    # --- stage 1: hyperparameter search ---
    tuning_started = time.time()
    study = tune_params(
        feature_set, model, cv, stratify_col, seed, n_trials, direction='minimize')
    tuning_finished = time.time()

    # Replace whatever the search printed with a compact summary.
    clear_output(wait=True)
    print('----------Hyperparameter tuning----------')
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for name, value in study.best_params.items():
        print(f'{name:20} - {value}')
    print(f'\n[Time taken: {tuning_finished - tuning_started:.2f}s]\n')

    # --- stage 2: cross-validation with the best hyperparameters ---
    print('-----Cross-validation and prediction-----')
    cv_started = time.time()

    model.set_params(**study.best_params)
    oof_preds, test_preds = custom_cv(feature_set, model, cv, stratify_col)

    cv_finished = time.time()
    print(f'\n[Time taken: {cv_finished - cv_started:.2f}s]\n')

    return oof_preds, test_preds

In [13]:
def create_submission_files(preds, config, notebook='01'):
    """Write one submission CSV per column of `preds`.

    Each file is a copy of the global `sample_sub` frame whose target column
    holds `expm1` of the predictions, undoing the log1p target transform.

    Args:
        preds: DataFrame of predictions on the log1p scale, one column per file.
        config: Experiment label embedded in the file names.
        notebook: Notebook identifier embedded in the file names.
    """
    file_prefix = f'nb{notebook}_{config}'
    for col in preds.columns:
        submission = sample_sub.copy()
        # back-transform from log1p space to the original target scale
        submission[TARGET] = np.expm1(preds[col])
        submission.to_csv(f'{file_prefix}_{col}.csv', index=False)

In [14]:
# Result stores keyed by experiment config string:
# op -> out-of-fold predictions, tp -> test predictions
op, tp = {}, {}

# Experiments

In [15]:
# Short trial run of the pipeline: 5-fold KFold with only 3 tuning trials.
# Both returned prediction objects are discarded.
optuna.logging.set_verbosity(optuna.logging.INFO)
_, _ = run_experiment(
    feature_set=features,
    cv=KFold(shuffle=True, n_splits=5, random_state=SEED),
    n_trials=3,
)

----------Hyperparameter tuning----------
Best trial: 0 -> Best value: 0.15837
Best hyperparameters:
max_depth            - 8
min_child_weight     - 46
subsample            - 0.65
colsample_bytree     - 0.6
gamma                - 4.1000000000000005
alpha                - 0.5
lambda               - 115.45450043256209
max_cat_to_onehot    - 1

[Time taken: 21.33s]

-----Cross-validation and prediction-----
Fold # 0: 0.15992 ( 538 rounds)
Fold # 1: 0.15725 ( 577 rounds)
Fold # 2: 0.15870 ( 521 rounds)
Fold # 3: 0.15907 ( 957 rounds)
Fold # 4: 0.15687 ( 789 rounds)

Avg score: 0.15837 +/- 0.00114
OOF score: 0.15837


[Time taken: 11.47s]



### KFold

In [16]:
%%time
# Experiment: all features, plain KFold with 10 splits.
feature_set, folds, seed = 'ALL', 10, SEED

# The config string keys the result dicts and is embedded in the file names.
config = f'feat{feature_set}_fold{folds}_seed{seed}'

cv = KFold(shuffle=True, n_splits=folds, random_state=seed)

op[config], tp[config] = run_experiment(feature_set=features, cv=cv, seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 14 -> Best value: 0.14840
Best hyperparameters:
max_depth            - 4
min_child_weight     - 22
subsample            - 0.9
colsample_bytree     - 0.7
gamma                - 0.0
alpha                - 2.9000000000000004
lambda               - 0.6211464628769217
max_cat_to_onehot    - 4

[Time taken: 4884.48s]

-----Cross-validation and prediction-----
Fold # 0: 0.15107 (5819 rounds)
Fold # 1: 0.14948 (4847 rounds)
Fold # 2: 0.14809 (5736 rounds)
Fold # 3: 0.14712 (4709 rounds)
Fold # 4: 0.14810 (5869 rounds)
Fold # 5: 0.14722 (6012 rounds)
Fold # 6: 0.14949 (4655 rounds)
Fold # 7: 0.14851 (5241 rounds)
Fold # 8: 0.14600 (3985 rounds)
Fold # 9: 0.14896 (4251 rounds)

Avg score: 0.14840 +/- 0.00137
OOF score: 0.14841


[Time taken: 126.08s]

CPU times: user 1h 35min 10s, sys: 32.8 s, total: 1h 35min 43s
Wall time: 1h 23min 31s


In [17]:
%%time
# Experiment: all features, plain KFold with 15 splits.
feature_set, folds, seed = 'ALL', 15, SEED

# The config string keys the result dicts and is embedded in the file names.
config = f'feat{feature_set}_fold{folds}_seed{seed}'

cv = KFold(shuffle=True, n_splits=folds, random_state=seed)

op[config], tp[config] = run_experiment(feature_set=features, cv=cv, seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 106 -> Best value: 0.14823
Best hyperparameters:
max_depth            - 7
min_child_weight     - 28
subsample            - 0.95
colsample_bytree     - 0.8
gamma                - 0.0
alpha                - 2.5
lambda               - 31.233953204464836
max_cat_to_onehot    - 1

[Time taken: 6711.71s]

-----Cross-validation and prediction-----
Fold # 0: 0.14966 (2726 rounds)
Fold # 1: 0.15322 (1246 rounds)
Fold # 2: 0.14804 (2005 rounds)
Fold # 3: 0.14668 (1862 rounds)
Fold # 4: 0.15086 (2015 rounds)
Fold # 5: 0.14460 (1767 rounds)
Fold # 6: 0.14815 (2123 rounds)
Fold # 7: 0.14696 (1495 rounds)
Fold # 8: 0.14721 (1999 rounds)
Fold # 9: 0.14926 (1241 rounds)
Fold #10: 0.14541 (1856 rounds)
Fold #11: 0.15173 (1782 rounds)
Fold #12: 0.14389 (1580 rounds)
Fold #13: 0.14958 (1818 rounds)
Fold #14: 0.14815 (1631 rounds)

Avg score: 0.14823 +/- 0.00250
OOF score: 0.14825


[Time taken: 205.70s]

CPU times: user 2h 13min 5s, sys: 34.1 s, total

### StratifiedKFold

In [18]:
%%time
# Experiment: all features, StratifiedKFold with 10 splits.
# `stratify` is only a label for the config string; the values actually used
# for stratification are train['Sex'].
feature_set, folds, stratify, seed = 'ALL', 10, 'SEX', SEED

# The config string keys the result dicts and is embedded in the file names.
config = f'feat{feature_set}_fold{folds}_skf{stratify}_seed{seed}'

cv = StratifiedKFold(shuffle=True, n_splits=folds, random_state=seed)

op[config], tp[config] = run_experiment(
    feature_set=features, cv=cv, stratify_col=train['Sex'], seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 14 -> Best value: 0.14847
Best hyperparameters:
max_depth            - 4
min_child_weight     - 22
subsample            - 0.9
colsample_bytree     - 0.7
gamma                - 0.0
alpha                - 2.9000000000000004
lambda               - 0.6211464628769217
max_cat_to_onehot    - 4

[Time taken: 4890.50s]

-----Cross-validation and prediction-----
Fold # 0: 0.14566 (4622 rounds)
Fold # 1: 0.14855 (5550 rounds)
Fold # 2: 0.14909 (6213 rounds)
Fold # 3: 0.15212 (4125 rounds)
Fold # 4: 0.14982 (5140 rounds)
Fold # 5: 0.14868 (5535 rounds)
Fold # 6: 0.15077 (3308 rounds)
Fold # 7: 0.14843 (6437 rounds)
Fold # 8: 0.14464 (3307 rounds)
Fold # 9: 0.14698 (5032 rounds)

Avg score: 0.14847 +/- 0.00214
OOF score: 0.14849


[Time taken: 122.07s]

CPU times: user 1h 35min 5s, sys: 45 s, total: 1h 35min 50s
Wall time: 1h 23min 33s


In [19]:
%%time
# Experiment: all features, StratifiedKFold with 15 splits.
# `stratify` is only a label for the config string; the values actually used
# for stratification are train['Sex'].
feature_set, folds, stratify, seed = 'ALL', 15, 'SEX', SEED

# The config string keys the result dicts and is embedded in the file names.
config = f'feat{feature_set}_fold{folds}_skf{stratify}_seed{seed}'

cv = StratifiedKFold(shuffle=True, n_splits=folds, random_state=seed)

op[config], tp[config] = run_experiment(
    feature_set=features, cv=cv, stratify_col=train['Sex'], seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 52 -> Best value: 0.14836
Best hyperparameters:
max_depth            - 8
min_child_weight     - 61
subsample            - 1.0
colsample_bytree     - 0.6
gamma                - 0.0
alpha                - 1.8
lambda               - 58.291146234137656
max_cat_to_onehot    - 1

[Time taken: 7306.36s]

-----Cross-validation and prediction-----
Fold # 0: 0.14600 (1622 rounds)
Fold # 1: 0.14745 (1844 rounds)
Fold # 2: 0.14772 (2303 rounds)
Fold # 3: 0.15012 (1824 rounds)
Fold # 4: 0.14837 (2251 rounds)
Fold # 5: 0.15297 (2048 rounds)
Fold # 6: 0.14923 (2262 rounds)
Fold # 7: 0.15085 (2235 rounds)
Fold # 8: 0.14726 (2157 rounds)
Fold # 9: 0.15082 (1617 rounds)
Fold #10: 0.14848 (2243 rounds)
Fold #11: 0.14942 (1996 rounds)
Fold #12: 0.14621 (1981 rounds)
Fold #13: 0.14251 (2114 rounds)
Fold #14: 0.14803 (2196 rounds)

Avg score: 0.14836 +/- 0.00239
OOF score: 0.14838


[Time taken: 252.70s]

CPU times: user 2h 23min 34s, sys: 35.4 s, total: