![](https://raw.githubusercontent.com/optuna/optuna/master/docs/image/optuna-logo.png)

<link rel="preconnect" href="https://fonts.gstatic.com">
<link href="https://fonts.googleapis.com/css2?family=Open+Sans&display=swap" rel="stylesheet">
<h1 style="text-align: center; font-family: 'Open Sans', sans-serif;"> A SIMPLE GUIDE TO PERFORM HYPERPARAMETER OPTIMIZATION WITH OPTUNA </h1>

<link rel="preconnect" href="https://fonts.gstatic.com">
<link href="https://fonts.googleapis.com/css2?family=Lato&family=Open+Sans&display=swap" rel="stylesheet">
<h2 style="font-family: 'Lato', sans-serif; text-align:center">Let's Begin!</h2>

In [None]:
import pandas as pd
import time
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import optuna
from optuna import Trial, visualization

In [None]:
# Load the competition tables from the Kaggle input directory: the training
# set, the test set and the sample submission, each parsed into a DataFrame.
train, test, sample = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-jan-2021/train.csv',
        '../input/tabular-playground-series-jan-2021/test.csv',
        '../input/tabular-playground-series-jan-2021/sample_submission.csv',
    ),
)

In [None]:
# Preview the first rows of the training data (DataFrame.head defaults to 5 rows).
train.head()

In [None]:
# Column bookkeeping: the label is stored under 'target', and every column
# other than the row identifier and the label is treated as a model input.
target_cols = ['target']
feature_cols = list(
    filter(lambda name: name not in ['id', 'target'], train.columns.tolist())
)

## Create Folds

In [None]:
# Split the training rows into 5 shuffled folds with a fixed seed and record
# each row's fold id in an integer 'kfold' column.
kf = KFold(n_splits=5, random_state=2021, shuffle=True)

# KFold.split yields *positional* row indices. Collect the fold ids in a
# positional NumPy array and attach it in a single assignment: writing through
# `train.loc[...]` would interpret those positions as index labels (only
# correct for a default RangeIndex) and would first create a float column
# filled with NaN that then has to be cast back to int.
fold_ids = np.full(len(train), -1, dtype=int)
for fold, (_, val_positions) in enumerate(kf.split(train)):
    fold_ids[val_positions] = fold
train['kfold'] = fold_ids

## Training Function

1. trial.suggest_categorical: Suggest a value for the categorical parameter. The value is sampled from the `choices`.
2. trial.suggest_discrete_uniform: Suggest a value for the discrete parameter. The value is sampled uniformly from the range `[low,high]`, with some step of discretization.
3. trial.suggest_loguniform: Suggest a value for the continuous parameter. The value is sampled from the range `[low,high)` in the log domain.
4. trial.suggest_int: Suggest a value for the integer parameter. The value is sampled from the integers in `[low,high]`.

[Visit this site to learn more](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html#optuna.trial.Trial)

In [None]:
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between two arrays of targets."""
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated and later removed in newer scikit-learn
    # releases, whereas sqrt(MSE) works on every version.
    return float(np.sqrt(mean_squared_error(np.ravel(y_true), np.ravel(y_pred))))


def fit_xgb(trial, xtr, ytr, xval, yval):
    """Train one XGBoost regressor with hyperparameters suggested by a trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        xtr: Training feature matrix.
        ytr: Training targets; flattened to 1-D before fitting.
        xval: Validation feature matrix.
        yval: Validation targets.

    Returns:
        A ``(model, log)`` tuple: the fitted ``xgb.XGBRegressor`` and a dict
        holding the RMSE under the keys ``"train rmse"`` and ``"valid rmse"``.
    """
    # `suggest_float(..., step=...)` and `suggest_float(..., log=True)` replace
    # `suggest_discrete_uniform` and `suggest_loguniform`, which have been
    # deprecated since Optuna 3.0. Parameter names and ranges are unchanged, so
    # `study.best_params` keeps the same keys.
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [150, 200, 250, 300]),
        "subsample": trial.suggest_float("subsample", 0.6, 1, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1, step=0.1),
        "eta": trial.suggest_float("eta", 1e-2, 0.1, log=True),
        # Optional extra dimension, disabled by default:
        # "gamma": trial.suggest_float("gamma", 0.05, 1, log=True),
        "max_depth": trial.suggest_categorical("max_depth", [5, 7, 9, 11, 13]),
        "min_child_weight": trial.suggest_int("min_child_weight", 5, 11),
        # Fixed (not tuned) so that trials differ only by their hyperparameters.
        "random_state": 2021,
    }

    model = xgb.XGBRegressor(**params)
    model.fit(xtr, np.ravel(ytr))

    log = {
        "train rmse": _rmse(ytr, model.predict(xtr)),
        "valid rmse": _rmse(yval, model.predict(xval)),
    }

    return model, log

## Objective Function

In [None]:
def objective(trial):
    """Return the mean validation RMSE of one trial over all CV folds.

    Reads the notebook-level ``train`` frame (including its ``kfold``
    column), ``feature_cols`` and ``target_cols``. For each fold, the rows of
    that fold are held out for validation and the remaining rows are used for
    training via ``fit_xgb``.

    Args:
        trial: Optuna trial that supplies the hyperparameters.

    Returns:
        The validation RMSE averaged across the folds; this is the value the
        study optimizes.
    """
    # Read the fold ids from the data rather than hard-coding the fold count,
    # so the average stays correct if the number of splits is changed.
    fold_ids = sorted(train['kfold'].unique())

    fold_scores = []
    for fold in fold_ids:
        held_out = train['kfold'] == fold
        trn = train.loc[~held_out, :]
        val = train.loc[held_out, :]

        xtr, ytr = trn[feature_cols].values, trn[target_cols].values
        xval, yval = val[feature_cols].values, val[target_cols].values

        # Within a single trial Optuna returns the already-sampled value when
        # the same parameter name is suggested again, so every fold is trained
        # with identical hyperparameters.
        _, log = fit_xgb(trial, xtr, ytr, xval, yval)
        fold_scores.append(log['valid rmse'])

    return sum(fold_scores) / len(fold_scores)

### **NOTE** : The Objective Function should return the metric to be minimized or maximized 

## CREATE STUDY
### Specify whether to maximize or minimize your metric (which is returned by the Objective Function)

In [None]:
# Create a study that minimizes the value returned by `objective` and run 20
# trials. The sampler is seeded so that repeated runs propose the same sequence
# of hyperparameters, in line with the fixed seeds used for the folds and the
# model; TPE is the sampler Optuna would pick by default for this study.
study = optuna.create_study(
    direction="minimize",
    study_name='Xgboost optimization',
    sampler=optuna.samplers.TPESampler(seed=2021),
)
study.optimize(objective, n_trials=20)

In [None]:
# Tabulate every trial and display them best-first: the study minimizes its
# objective, so the default ascending sort puts the lowest value on top.
history = study.trials_dataframe()
history.sort_values("value")

In [None]:
# Hyperparameters of the best trial found by the study (lowest objective value).
study.best_params

## Visualization

### The `visualization` module provides utility functions for plotting the optimization process using plotly and matplotlib
[https://optuna.readthedocs.io/en/stable/reference/visualization/index.html](https://optuna.readthedocs.io/en/stable/reference/visualization/index.html)

In [None]:
# Objective value of each trial, to see how the search progressed.
visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate view relating each trial's parameter values to its objective value.
visualization.plot_parallel_coordinate(study)

In [None]:
# Estimated importance of each hyperparameter for the objective value.
visualization.plot_param_importances(study)

## Retraining on the Entire Data

In [None]:
# Rebuild the model from the winning hyperparameters. `best_params` only holds
# the values suggested through the trial, so the fixed seed used for the
# cross-validated models is passed again explicitly to keep the final fit
# consistent with the models that were actually tuned.
clf = xgb.XGBRegressor(**study.best_params, random_state=2021)

In [None]:
# Fit the final model on every training row (no fold is held out here).
clf.fit(train[feature_cols], train[target_cols])

## Creating Submission File

In [None]:
# Predict the test set and wrap the result in a Series named like the submission column.
preds = pd.Series(clf.predict(test[feature_cols]), name='target')

In [None]:
# Put the ids and predictions side by side. concat aligns on the index; both
# objects carry the default 0..n-1 index here, so rows pair up by position.
preds = pd.concat([test['id'], preds], axis=1)

In [None]:
# Quick look at the first rows of the submission frame before writing it out.
preds.head()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
preds.to_csv("submission.csv", index=False)

Public Score: 0.69986

## If you learnt something new, consider upvoting my kernel :)