# Setup

In [None]:
import gc
import time
import warnings

# The collector is on by default; the call just makes that explicit.
gc.enable()
# Silence every warning category before the heavy libraries are imported,
# so their import-time warnings do not clutter the notebook output.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from IPython.display import clear_output

pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

# TODO: import the competition metric here, e.g.
#   from sklearn.metrics import <metric>
# (the original line was an incomplete `from ... import` statement, which is a
# SyntaxError and prevented this cell from running at all).
from sklearn.model_selection import StratifiedKFold, KFold

import xgboost as xgb
import optuna

# Default random seed; used as the default `seed` of run_experiment.
SEED = 2024

In [None]:
# Probe for an NVIDIA GPU by running `nvidia-smi`: if the command cannot be
# started or exits with a non-zero status, fall back to the CPU.
import subprocess

try:
    subprocess.run('nvidia-smi', stdout=subprocess.PIPE, check=True)
except Exception:
    DEVICE = 'cpu'
else:
    DEVICE = 'cuda'

print(f'Available device: {DEVICE}')

In [None]:
from pathlib import Path

# Directory holding the competition CSV files. An empty string means the
# current working directory (with plain string formatting it would have
# produced '/train.csv', i.e. a path at the filesystem root).
DATA_DIR = ''

train = pd.read_csv(Path(DATA_DIR) / 'train.csv')
test = pd.read_csv(Path(DATA_DIR) / 'test.csv')
sample_sub = pd.read_csv(Path(DATA_DIR) / 'sample_submission.csv')

# Data preparation

In [None]:
# Columns removed from both frames before modelling (none listed in this template).
cols_to_drop = []

train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [None]:
# Name of the label column in `train` (left blank in this template).
TARGET = ''
# Every column present in `test` is treated as a model feature.
features = list(test.columns)
# Columns to be cast to the pandas `category` dtype (none listed in this template).
cat_features = []

In [None]:
# Cast every categorical column to ONE dtype shared by train and test.
# Casting the two frames independently lets each build its own category list,
# so the same value can end up with different integer codes in train and test;
# a model trained with `enable_categorical=True` would then misread test rows.
for col in cat_features:
    combined = pd.concat([train[col], test[col]], ignore_index=True)
    shared_dtype = pd.CategoricalDtype(combined.astype('category').cat.categories)
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

In [None]:
# feature sets


# Hyperparameter tuning

In [None]:
def objective(trial, feature_set, model, cv, stratify_col):
    """Optuna objective: cross-validated score of `model` for one sampled configuration.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    feature_set : list of column names of the global `train` frame to fit on.
    model : estimator exposing `set_params`, `fit(..., eval_set=..., verbose=...)`
        and `predict`; it is mutated in place.
    cv : splitter exposing `split(X, y)`.
    stratify_col : passed as the second argument of `cv.split`.

    Returns
    -------
    Mean of `comp_metric` over the validation folds.

    Reads the notebook-level names `train`, `TARGET` and `comp_metric`.
    """
    sampled_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 64),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1),  # complexity control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.1),  # L1 regularisation
        'lambda': trial.suggest_float('lambda', 5e-3, 5e3, log=True),  # L2 regularisation
        'max_cat_to_onehot': trial.suggest_categorical('max_cat_to_onehot', [2, 4])
    }

    # The configuration is the same for every fold, so apply it once up front.
    model.set_params(**sampled_params)

    scores = []
    for train_ids, val_ids in cv.split(train, stratify_col):
        X_train, y_train = train[feature_set].iloc[train_ids], train[TARGET].iloc[train_ids]
        X_val, y_val = train[feature_set].iloc[val_ids], train[TARGET].iloc[val_ids]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)

        val_preds = model.predict(X_val)
        scores.append(comp_metric(y_val, val_preds))

    return np.mean(scores)

In [None]:
def tune_params(feature_set, model, cv, stratify_col, seed, n_trials, direction):
    """Run an Optuna study over `objective` and return the finished study.

    The study uses a seeded multivariate/grouped TPE sampler and a Hyperband
    pruner, is optimised in the given `direction` for `n_trials` trials, and
    triggers garbage collection after every trial.

    NOTE(review): the pruner only acts if the objective reports intermediate
    values; the `objective` in this notebook does not appear to do so - confirm.
    """
    tpe_sampler = optuna.samplers.TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=seed)
    hyperband = optuna.pruners.HyperbandPruner()
    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=hyperband,
        direction=direction)

    def _evaluate(trial):
        # Bind the fixed experiment arguments; Optuna supplies only the trial.
        return objective(trial, feature_set, model, cv, stratify_col)

    study.optimize(func=_evaluate, n_trials=n_trials, gc_after_trial=True)

    return study

# Modeling framework

In [None]:
def comp_metric(y_true, y_pred):
    """Competition metric: score predictions `y_pred` against targets `y_true`.

    This is a template placeholder. It raises instead of silently returning
    None, because a None score only fails later (when fold scores are averaged
    or formatted) with an error that does not point back here.

    Raises
    ------
    NotImplementedError
        Always, until the metric is implemented.
    """
    raise NotImplementedError(
        'comp_metric is a placeholder: implement the competition metric '
        'before running tuning or cross-validation.')

In [None]:
def custom_cv(feature_set, model, cv, stratify_col, verbose=False):
    """Cross-validate `model` and collect out-of-fold and test predictions.

    Parameters
    ----------
    feature_set : list of column names used from the global `train`/`test` frames.
    model : estimator exposing `fit(..., eval_set=..., verbose=...)` and `predict`.
    cv : splitter exposing `split(X, y)`.
    stratify_col : passed as the second argument of `cv.split`.
    verbose : when True, print the score of every fold.

    Returns
    -------
    (oof_preds, test_preds)
        `oof_preds` is a Series of validation predictions indexed by row
        position in `train`; `test_preds` is a DataFrame with one column per
        fold (`fold0`, `fold1`, ...) plus their row-wise `mean`.

    Reads the notebook-level names `train`, `test`, `TARGET` and `comp_metric`.
    """
    X_test = test[feature_set]

    oof_preds, test_preds = {}, {}
    scores = []

    for fold, (train_ids, val_ids) in enumerate(cv.split(train, stratify_col)):
        X_train, y_train = train[feature_set].iloc[train_ids], train[TARGET].iloc[train_ids]
        X_val, y_val = train[feature_set].iloc[val_ids], train[TARGET].iloc[val_ids]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False)

        val_preds = model.predict(X_val)
        oof_preds.update(dict(zip(val_ids, val_preds)))
        test_preds[f'fold{fold}'] = model.predict(X_test)

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            # XGBoost exposes `best_iteration`; `best_iteration_` is the
            # LightGBM spelling. Try both so that neither estimator raises,
            # and degrade gracefully when early stopping was not used.
            rounds = getattr(model, 'best_iteration', None)
            if rounds is None:
                rounds = getattr(model, 'best_iteration_', 'n/a')
            print(f'Fold #{fold:>2}: {score:.5f} ({rounds!s:>4} rounds)')
        gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)  # mean of fold-wise predictions

    oof_preds = pd.Series(oof_preds).sort_index()
    # Score against exactly the rows that received a prediction, selected by
    # position, so the comparison stays aligned even when the splitter does
    # not cover every row of `train`.
    y_oof = train[TARGET].iloc[oof_preds.index.to_numpy()]
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(y_oof, oof_preds):.5f}\n')

    return oof_preds, test_preds

In [None]:
def run_experiment(feature_set, stratify_col=None, seed=SEED, n_trials=200,
                   cv=None, direction='minimize'):
    """Tune an XGBoost regressor with Optuna, then cross-validate and predict.

    Parameters
    ----------
    feature_set : list of column names to train on.
    stratify_col : passed as `y` to the splitter's `split`; None for no stratification.
    seed : random seed for the model, the Optuna sampler and the default splitter.
    n_trials : number of Optuna trials.
    cv : splitter exposing `split(X, y)`. When None, a notebook-level `cv`
        is used if one exists; otherwise a shuffled 5-fold splitter seeded
        with `seed` is created (`StratifiedKFold` when `stratify_col` is
        given, else `KFold`).
    direction : Optuna optimisation direction, 'minimize' or 'maximize'.

    Returns
    -------
    (oof_preds, test_preds) as returned by `custom_cv`.
    """
    if cv is None:
        # Earlier versions read a notebook-level `cv`; keep honouring it.
        cv = globals().get('cv')
    if cv is None:
        splitter = KFold if stratify_col is None else StratifiedKFold
        cv = splitter(n_splits=5, shuffle=True, random_state=seed)

    base_params = {
        'booster': 'gbtree',
        'tree_method': 'hist',
        'objective': '',  # TODO: blank in this template - set before running
        'eval_metric': '',  # TODO: blank in this template - set before running
        'learning_rate': 0.01,
        'n_estimators': 10000,
        'early_stopping_rounds': 100,
        'device': DEVICE,
        'enable_categorical': True,
        'verbosity': 0,
        'n_jobs': -1,
        'seed': seed
    }
    model = xgb.XGBRegressor(**base_params)

    start = time.time()
    # Arguments must follow tune_params' order:
    # (feature_set, model, cv, stratify_col, seed, n_trials, direction).
    study = tune_params(
        feature_set, model, cv, stratify_col, seed, n_trials, direction=direction)
    end = time.time()

    clear_output(wait=True)
    print(f'----------Hyperparameter tuning----------')
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')
    print(f'\n[Time taken: {end - start:.2f}s]\n')

    print(f'-----Cross-validation and prediction-----')
    start = time.time()

    # Re-run CV with the best configuration on the same feature set and folds.
    model.set_params(**study.best_params)
    oof_preds, test_preds = custom_cv(feature_set, model, cv, stratify_col)

    end = time.time()
    print(f'\n[Time taken: {end - start:.2f}s]\n')

    return oof_preds, test_preds

In [None]:
def create_submission_files(preds, config, notebook=''):
    """Write one submission CSV per column of `preds`.

    Parameters
    ----------
    preds : DataFrame of test predictions; every column becomes one file.
    config : experiment identifier embedded in the file name.
    notebook : notebook identifier embedded in the file name.
        TODO: the template left this default blank (a SyntaxError); set it to
        this notebook's id.

    Each file is a copy of the global `sample_sub` with its `TARGET` column
    replaced, saved as `nb{notebook}_{config}_{col}.csv` in the working directory.
    """
    for col in preds.columns:
        sub = sample_sub.copy()
        sub[TARGET] = preds[col]  # include postprocessing
        sub.to_csv(f'nb{notebook}_{config}_{col}.csv', index=False)

In [None]:
# Result registries, filled by the experiment cells.
op = dict()  # OOF preds
tp = dict()  # Test preds

# Experiments

In [None]:
optuna.logging.set_verbosity(optuna.logging.INFO)
# Quick smoke test of the whole pipeline with a handful of trials; results are discarded.
# run_experiment's parameter is named `feature_set` (passing `features=` raises TypeError).
_ , _ = run_experiment(feature_set=features, n_trials=3)

In [None]:
%%time
# Experiment template: every blank below must be filled in before this cell can run.
# Identifier of the feature set; interpolated into `config`.
feature_set =
# Fold label; interpolated into `config` only - it is not passed to run_experiment here.
folds = 
# Presumably the value to pass as `stratify_col=` below - confirm.
stratify_col = 
# Random seed; forwarded to run_experiment and interpolated into `config`.
seed = 
# `config` keys the op/tp dicts and is passed on to create_submission_files.
# NOTE(review): the double underscore before `seed` may be left over from a removed field - confirm.
config = f'feat{feature_set}_fold{folds}__seed{seed}'

# Tune, cross-validate and predict; store OOF and test predictions under this config.
op[config], tp[config] = run_experiment(
    feature_set=,  
    stratify_col=, 
    seed=seed)

# Write one CSV per column of the test-prediction frame.
create_submission_files(tp[config], config)