This is a follow up notebook of [XGBoost 101 - Baseline](https://www.kaggle.com/aayush26/tps-aug-2021-xgboost-101-baseline).


The scope of this notebook is to perform hyperparameter tuning and store the best hyperparameters found for the XGB model.

The stored file can later be loaded into another notebook by using `Kaggle Add data`, or you can directly copy-paste the best params displayed in the output cell.

Check [[TPS Aug 2021] XGBoost 201 - with Optuna Part 2/2](https://www.kaggle.com/aayush26/tps-aug-2021-xgboost-201-with-optuna-part-2-2/notebook), where I am loading the best hyperparameters acquired from this notebook.

### Pre-requisite
1. [GPU version] Change the accelerator to GPU in order to be able to execute this notebook.
2. [CPU version] Delete the following hyperparameters present in `getXgbHyperparameters()`: `"tree_method": "gpu_hist"` and `"booster": 'gbtree'`

# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from optuna.samplers import TPESampler
from sklearn.model_selection import KFold
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load dataset

In [None]:
# Read the competition training set from the Kaggle input directory
# and print its (rows, columns) shape as a quick sanity check.
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')
print('train shape:',train.shape)

In [None]:
# Train data: target vector first, then the feature matrix
# (every column except the target 'loss' and the 'id' column).
y_train = train['loss'].values
X_train = train.drop(columns=['loss', 'id'])

# Setup XGB hyperparameters for experiment

In [None]:
def getXgbHyperparameters(trial):
    """Sample one XGBoost hyperparameter configuration from an Optuna trial.

    Fixed (not tuned) settings: ``tree_method="gpu_hist"``,
    ``booster="gbtree"`` and ``learning_rate=0.01``. Everything else is
    suggested by ``trial`` within the ranges written below.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float``
            (an Optuna trial).

    Returns:
        dict: Keyword arguments suitable for ``XGBRegressor(**params)``.
    """
    # NOTE: the order of the suggest_* calls is kept unchanged on purpose.
    xgb_param = {
        "tree_method": "gpu_hist",
        # `step` is passed by keyword, matching the suggest_float calls
        # below; newer Optuna releases deprecate the positional form.
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000, step=100),
        "booster": "gbtree",
        "reg_lambda": trial.suggest_int("reg_lambda", 1, 100),
        "reg_alpha": trial.suggest_int("reg_alpha", 1, 100),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0, step=0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
        "learning_rate": 0.01,
        "gamma": trial.suggest_float("gamma", 0, 20),
    }
    return xgb_param

# Define objective function

In [None]:
def objective(trial, X, y):
    """Train one XGBoost candidate and return its hold-out RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        RMSE of the fitted model on the 30% validation split
        (the study minimises this value).
    """
    # Fixed random_state: every trial is scored on the same 70/30 split,
    # so trial scores are directly comparable.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1337)

    xgb_param = getXgbHyperparameters(trial)

    eval_set = [(X_valid, y_valid)]

    # Track RMSE on the validation split and stop boosting once it has
    # not improved for 100 consecutive rounds.
    # NOTE(review): recent XGBoost releases expect `eval_metric` and
    # `early_stopping_rounds` on the XGBRegressor constructor instead of
    # fit() -- confirm against the installed version.
    fit_params = dict(eval_set=eval_set,
                      eval_metric='rmse',
                      early_stopping_rounds=100,
                      verbose=False)

    xgb_regressor = XGBRegressor(**xgb_param)

    # Fit/predict. `fit_params` must be forwarded here: without it the
    # eval set and early stopping are ignored and every trial trains the
    # full `n_estimators` trees.
    xgb_regressor = xgb_regressor.fit(X_train, y_train, **fit_params)
    preds = xgb_regressor.predict(X_valid)

    # Compute rmse
    rmse = np.sqrt(mean_squared_error(y_valid, preds))

    return rmse

# Custom logging callback function

In [None]:
# Callback function to print log messages when the best trial is updated

def logging_callback(study, frozen_trial):
    """Print a line only when the study's best value has changed.

    The last reported best value is remembered in the study's user
    attributes under ``"previous_best_value"``.

    Args:
        study: Study exposing ``best_value``, ``user_attrs`` and
            ``set_user_attr``.
        frozen_trial: The trial that just finished; its ``number`` and
            ``value`` are printed.
    """
    current_best = study.best_value

    # Nothing new to report: the best value is the one already recorded.
    if study.user_attrs.get("previous_best_value", None) == current_best:
        return

    study.set_user_attr("previous_best_value", current_best)
    message = "Trial {} finished with best value: {}. ".format(
        frozen_trial.number,
        frozen_trial.value,
    )
    print(message)

# Initiate experiment to find best hyperparameters

In [None]:
%%time

# Minimisation study named 'xgb', driven by a TPE sampler with a fixed seed.
study = optuna.create_study(
    study_name='xgb',
    direction='minimize',
    sampler=TPESampler(seed=1337),
)


def func(trial):
    """Bind the training data so Optuna can call the objective with only a trial."""
    return objective(trial, X_train, y_train)


# timeout = seconds * minutes. Longer timeout will tend to lead to better hyperparameter tuning.
study.optimize(func, callbacks=[logging_callback], timeout=60*180)

In [None]:
# Report the best score found by the study, then each hyperparameter of
# the best trial on its own line.
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

# Save the study containing the best hyperparameters for the XGB model

In [None]:
import joblib

# Serialise the whole study object (not only the best params) to
# "xgb_study.pkl" in the current working directory.
joblib.dump(study, "xgb_study.pkl")