This is a follow-up notebook to [Catboost 101 - Baseline](https://www.kaggle.com/aayush26/tps-aug-2021-catboost-101-baseline).


The scope of this notebook is to perform hyperparameter tuning and store the best hyperparameters found for the CatBoost model.

The stored file can later be loaded into another notebook by using `Kaggle Add data`, or you can directly copy-paste the best params displayed in the output cell.

### Pre-requisite
1. [GPU version] Change the accelerator to GPU in order to be able to execute this notebook.
2. [CPU version] Delete the following hyperparameter from `getCbHyperparameters()`: `'task_type':"GPU",`

# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from optuna.samplers import TPESampler
from sklearn.model_selection import KFold
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load dataset

In [None]:
# Read the competition training table and report its (rows, columns) shape.
train = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/train.csv',
)
print('train shape:', train.shape)

In [None]:
# Train data: every column except the target ('loss') and the row
# identifier ('id') is a feature; the target is kept as a plain array.
X_train = train.drop(['loss', 'id'], axis='columns')
y_train = train['loss'].values

# Setup CatBoost hyperparameters for experiment

In [None]:
def getCbHyperparameters(trial):
    """Sample one CatBoost configuration from the Optuna search space.

    Fixed settings: RMSE as both loss function and eval metric, GPU training
    and Bernoulli bootstrap. Every other entry is drawn from ``trial``.

    Args:
        trial: Optuna trial, or any object exposing ``suggest_int`` and
            ``suggest_float`` with a ``(name, low, high)`` signature.

    Returns:
        dict: keyword arguments to unpack into ``CatBoostRegressor``.
    """
    cb_params = {
        'iterations': trial.suggest_int("iterations", 1000, 5000),
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        'loss_function': 'RMSE',
        'task_type': "GPU",
        'eval_metric': 'RMSE',
        # Disabled search dimension, kept for future experiments:
        # 'leaf_estimation_method': trial.suggest_categorical("leaf_estimation_method", ["Newton", "Gradient"]),
        'bootstrap_type': 'Bernoulli',
        # suggest_float replaces the deprecated suggest_uniform; without
        # step/log arguments it samples uniformly over the same range.
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        # CatBoost documents subsample as a rate in (0, 1]; a lower bound of
        # 0 lets the sampler propose invalid or near-empty bags. 0.05 is an
        # arbitrary small floor - adjust if smaller rates are really wanted.
        'subsample': trial.suggest_float('subsample', 0.05, 1),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        # Disabled search dimension, kept for future experiments:
        # 'grow_policy': trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"])
    }
    return cb_params

# Define objective function

In [None]:
def objective(trial, X, y):
    """Train one CatBoost model for ``trial`` and return its hold-out RMSE.

    The data is split 70/30 with a fixed seed, so every trial is scored on
    the same validation rows.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: feature table.
        y: target values aligned with ``X``.

    Returns:
        RMSE of the fitted model on the 30% validation split (lower is
        better).
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=.3, random_state=1337
    )

    cb_param = getCbHyperparameters(trial)
    cb_regressor = CatBoostRegressor(**cb_param)

    # Model Fit
    # The validation split must be handed to fit(): CatBoost's overfitting
    # detector (driven by the tuned 'od_wait') needs an eval set to monitor.
    # Previously eval_set was built but never used, so 'od_wait' had no
    # effect and every trial ran all of its iterations.
    # NOTE(review): the same split now drives early stopping and the reported
    # score, which makes the score slightly optimistic - confirm this is
    # acceptable for ranking trials.
    eval_set = [(X_valid, y_valid)]
    cb_regressor.fit(X_train, y_train, eval_set=eval_set)

    # Model Prediction
    preds = cb_regressor.predict(X_valid)

    # Compute rmse
    return np.sqrt(mean_squared_error(y_valid, preds))

# Custom logging callback function

In [None]:
# Callback function to print log messages when the best trial is updated

def logging_callback(study, frozen_trial):
    """Report a trial when the study's best value has changed.

    The last best value seen is stored in the study's user attributes under
    ``"previous_best_value"``; nothing is printed while it stays the same.

    Args:
        study: study being optimized (read: ``user_attrs``, ``best_value``;
            written: ``set_user_attr``).
        frozen_trial: the trial that just finished (read: ``number``,
            ``value``).
    """
    last_seen = study.user_attrs.get("previous_best_value")
    current_best = study.best_value

    # Guard clause: the best value is unchanged, so stay silent.
    if last_seen == current_best:
        return

    study.set_user_attr("previous_best_value", current_best)
    message = "Trial {} finished with best value: {}. ".format(
        frozen_trial.number, frozen_trial.value
    )
    print(message)

# Initiate experiment to find best hyperparameters

In [None]:
%%time

# Minimise the objective with a seeded TPE sampler.
study = optuna.create_study(
    study_name='cb',
    direction='minimize',
    sampler=TPESampler(seed=1337),
)


def func(trial):
    """Bind the training data so Optuna can call the objective with one argument."""
    return objective(trial, X_train, y_train)


# timeout = seconds * minutes. Longer timeout will tend to lead to better hyperparameter tuning.
study.optimize(
    func,
    callbacks=[logging_callback],
    timeout=60*5,
)

In [None]:
# Summarise the winning trial: its score first, then each tuned parameter.
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")
best_params = study.best_params
for key in best_params:
    value = best_params[key]
    print(f"\t\t{key}: {value}")

# Save the study containing the best hyperparameters for the CatBoost model

In [None]:
import joblib

# Persist the whole study object (not just the best params) so another
# notebook can reload it and inspect every trial.
joblib.dump(value=study, filename="cb_study.pkl")