In [None]:
import gc
import glob
import pandas as pd
import numpy as np
from sklearn import model_selection

import xgboost as xgb

import optuna
from optuna.samplers import TPESampler

In [None]:
# Load the first-stage training frame used by every cell below.
# NOTE: unpickling can execute arbitrary code, so this path must point to a trusted file.
TRAIN_PICKLE_PATH = '../input/optiver-1st-stage-data/1st_stage_train.pkl'
train = pd.read_pickle(TRAIN_PICKLE_PATH)

In [None]:
## ---- GroupKFold ----
class GroupKFold(object):
    """
    Group-aware K-fold splitter with optional shuffling and a sklearn-like API.

    The unique values of a grouping column are partitioned with
    ``model_selection.KFold``; each row then follows its group value, so a
    given group never ends up on both the train and validation side of a fold.

    Parameters
    ----------
    n_splits : int, default 4
        Number of folds.
    shuffle : bool, default True
        Whether the unique group values are shuffled before being partitioned.
    random_state : int or None, default 42
        Seed forwarded to ``KFold``. Only used when ``shuffle`` is true.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds; the arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y, group):
        """
        Yield one ``(train_idx, val_idx)`` pair of positional row indices per fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame that contains the grouping column.
        y : object
            Unused; kept so the signature mirrors sklearn splitters.
        group : hashable
            Label of the column in ``X`` whose values define the groups.

        Yields
        ------
        tuple of numpy.ndarray
            Positional indices (suitable for ``.iloc``) of the training rows
            and of the validation rows.
        """
        # KFold raises ValueError when a random_state is supplied together with
        # shuffle=False, so only forward the seed when it is actually used.
        seed = self.random_state if self.shuffle else None
        kf = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed)

        group_values = X[group]
        unique_ids = group_values.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # Map the fold's positions in `unique_ids` back to group values,
            # then to the rows of X that carry those values.
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(group_values.isin(tr_group))[0]
            val_idx = np.where(group_values.isin(va_group))[0]
            yield train_idx, val_idx

# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """
    Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. A zero entry makes the ratio infinite or NaN.
    y_pred : array-like
        Predictions, in the same order and of the same length as ``y_true``.

    Returns
    -------
    numpy.floating
        The RMSPE over all elements.
    """
    # Coerce to plain float arrays so the comparison is positional: two pandas
    # Series with different indexes would otherwise be aligned by label, and
    # plain Python lists would not support the arithmetic at all.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    relative_error = (actual - predicted) / actual
    return np.sqrt(np.mean(np.square(relative_error)))

In [None]:
# Shared settings for the cross-validation and the hyper-parameter search below.
# Number of folds passed to GroupKFold (and shown in the per-fold log line).
NFOLDS = 5
# Seed passed to the fold shuffler (random_state) and to the Optuna TPE sampler.
SEED = 42

In [None]:
def objective(trial):
    """
    Optuna objective: mean validation RMSPE of an XGBoost model over grouped CV folds.

    For every fold produced by ``GroupKFold`` on the ``time_id`` column of the
    global ``train`` frame, a booster is trained with the hyper-parameters
    suggested by ``trial`` and scored with ``rmspe`` on the held-out rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to report intermediate
        validation scores through the pruning callback.

    Returns
    -------
    float
        Mean RMSPE across the ``NFOLDS`` validation folds (lower is better).

    Notes
    -----
    The pruning callback is expected to abort the trial by raising Optuna's
    pruning exception; it is deliberately not caught here.
    """
    # NOTE(review): 'time_id' is the CV grouping key but is not excluded here,
    # so it is fed to the model as a feature -- confirm this is intended.
    features_columns = [col for col in train.columns if col not in ['target', 'row_id', 'stock_id', 'fold']]

    param_grid = {
        'tree_method':'gpu_hist', 
        'lambda': trial.suggest_categorical('lambda', [1e-1, 1e-2, 1e-3]),
        'alpha': trial.suggest_categorical('alpha', [1e-1, 1e-2, 1e-3]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01, 0.02]),
        'n_estimators': trial.suggest_categorical('n_estimators', [1000, 3000, 5000, 6000]),
        'max_depth': trial.suggest_categorical('max_depth', [5,6,7,9,11]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [10, 30, 60, 100, 200])}

    # The native xgb.train() API takes the boosting budget via num_boost_round;
    # an 'n_estimators' entry in the params dict is not used by the booster.
    # Move the suggested value to where it takes effect so the tuned
    # hyper-parameter actually influences training.
    num_boost_round = param_grid.pop('n_estimators')

    # NOTE(review): the same callback instance is reused for every fold, so the
    # reported steps restart at 0 in each fold -- presumably only the first
    # fold's curve drives pruning; verify against the Optuna version in use.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'val-rmse')
    gsk = GroupKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    xgb_score = []
    for counter, (trn_idx, vld_idx) in enumerate(gsk.split(train, train['target'], group='time_id'), start=1):

        # train valid separation
        print(f"CV {counter}/{NFOLDS}")
        print('\n')
        trn_frame = train.iloc[trn_idx]
        vld_frame = train.iloc[vld_idx]

        # Weighting each row by 1/target^2 turns the weighted squared error
        # into the squared percentage error that rmspe() measures.
        d_train = xgb.DMatrix(trn_frame[features_columns], label=trn_frame["target"],
                              weight=1/np.square(trn_frame["target"]))
        d_val = xgb.DMatrix(vld_frame[features_columns], label=vld_frame["target"],
                            weight=1/np.square(vld_frame["target"]))

        # Fit and train xgboost
        model = xgb.train(param_grid, d_train, num_boost_round=num_boost_round, evals=[(d_val, "val")],
                          early_stopping_rounds=100, verbose_eval=50, callbacks=[pruning_callback])

        # Predictions and score on validation data: restrict prediction to the
        # best early-stopped iteration instead of also using the rounds that
        # were trained after the validation score stopped improving.
        pred_val = model.predict(d_val, iteration_range=(0, model.best_iteration + 1))

        score = rmspe(y_true=vld_frame["target"], y_pred=pred_val)
        print(f"Fold {counter} Xgboost {score}")
        xgb_score.append(score)

    return np.mean(np.array(xgb_score))

In [None]:
# Hyper-parameter search: minimise the value returned by `objective` using a
# seeded TPE sampler and a median pruner with 15 warm-up steps.
study = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=15),
    sampler=TPESampler(seed=SEED),
    study_name='XGBRegressor',
)
study.optimize(objective, n_trials=450)

# Summary of the search and of the best trial found.
trial = study.best_trial
print('Number of finished trials: ', len(study.trials))
print('Best trial:')

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))