In [None]:
import pandas as pd
import numpy as np
import random
import time
import os
from os import path

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from tqdm import tqdm

from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import StackingRegressor, StackingClassifier

import optuna
from optuna.samplers import TPESampler

from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

import gc

def seed_everything(seed=2021):
    """Apply one seed to every source of randomness this notebook controls.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 2021
        Value handed to each seeding call.
    """
    # Environment variables only hold text, hence the str() conversion.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Seed once, right after the imports, with the default value.
seed_everything()

<h1 id="basics" style="font-family:verdana;"> 
Visualizing optuna results
</h1>

<div style="font-size:15px; font-family:verdana;">

I'm starting my journey in Kaggle competitions and I'm beginning to realize the importance of hyperparameter tuning.
    
    
So far, I find it kind of hard to get a feel for the different parameters in boosting models (e.g. catboost, xgboost...), so I have been relying on the optuna library, which helps you maximize/minimize an objective function over a user-defined search space.
    
    
I've spent quite some (computer-)time tuning parameters with Optuna, and today I'm investigating the plotting utilities that are available (relying on plotly).

Here is the code of the objective function I'm maximizing:
 
    
 <code>def objective(trial):

    param_grid = {
        'iterations': trial.suggest_categorical('iterations',[N_ESTIMATORS]),
        'learning_rate' : trial.suggest_uniform('learning_rate', 0.005,0.1),
        'depth': trial.suggest_int('depth', 3, 12),
        'l2_leaf_reg': trial.suggest_uniform('l2_leaf_reg', 1e-5,100),
        'subsample': trial.suggest_uniform('subsample',0,1),
        'random_strength' : trial.suggest_uniform('random_strength', 1, 50),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15)
     } 

    display(param_grid)
    
    model = CatBoostClassifier(
        grow_policy='Depthwise',
        leaf_estimation_method='Newton', 
        bootstrap_type='Bernoulli',
        loss_function= LOSS,
        eval_metric= EVAL_METRIC,
        task_type='GPU',
        silent=True,
        random_seed = SEED,
        **param_grid
    )
     
    scores,_,_ = run_kfold(model, N_SPLITS)

    return scores.mean()
</code>
    
</div>

<h1 id="basics" style="font-family:verdana;"> 
Importing the study (study.db)
</h1>

<div style="font-size:15px; font-family:verdana;">
    <p>
        <ul>
            <li> Copying the .db file from the input directory (which is read-only) to the output directory to make it work </li>
            <li> The same trick can be used to continue optimizing a study using the same notebook and get over the 9-hour notebook execution time limit </li>
        </ul>
    </p>
</div>

In [None]:
STUDY_PATH = r"../input/tps-10-21-catboost-optuna-baseline/study.db"

if path.exists(STUDY_PATH):
    print("Found existing study")
    !cp "../input/tps-10-21-catboost-optuna-baseline/study.db" "study.db"
    !chmod +rwx "study.db"
    
study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name='CatClassifier', storage=r"sqlite:///study.db", load_if_exists=True)

<h1 id="basics" style="font-family:verdana;"> 
List all the trials
</h1>

<div style="font-size:15px; font-family:verdana;">
    <p>
        <ul>
            <li> List of all trials (sets of parameters) that have been tested and the corresponding objective values </li>
        </ul>
    </p>
</div>

In [None]:
# Trials as a dataframe restricted to trial number, objective value and parameters.
res = study.trials_dataframe(attrs=('number', 'value', 'params'))
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
res.info()
res.tail(10)

In [None]:
# Name of the objective column requested from the trials dataframe.
VALUE = "value"

# Hyperparameter names, spelled exactly like the keys of the objective's
# search space, so the plotting cells do not repeat raw strings.
DEPTH = "depth"
LEARNING_RATE = "learning_rate"
L2_LEAF_REG = "l2_leaf_reg"
LEAF_EST = "leaf_estimation_iterations"
MIN_DATA_IN_LEAF = "min_data_in_leaf"
RAND = "random_strength"
SUB = "subsample"

<h1 id="history_1" style="font-family:verdana;"> 
History of optimization
</h1>

In [None]:
# Optimization-history figure for the full study loaded from study.db.
plot_optimization_history(study)


<h1 id="importance_1" style="font-family:verdana;"> 
Hyperparameters importance
</h1>

In [None]:
# Hyperparameter-importance figure for the full study loaded from study.db.
plot_param_importances(study)

<h1 id="contour" style="font-family:verdana;"> 
Contour Plot
</h1>

<div style="font-size:15px; font-family:verdana;">
    <p>
        <ul>
            <li> Plotting the 5 most important parameters </li>
        </ul>
    </p>
</div>

In [None]:
def update_colorscale(trace):
    """Switch a trace to the non-reversed "Tealrose" colorscale.

    Traces whose type rejects the ``colorscale`` property are left untouched.
    """
    try:
        trace["colorscale"] = "Tealrose"
        trace["reversescale"] = False
    except ValueError:
        # plotly raises ValueError when a property is not valid for the trace
        # type; any other exception is a real problem and should surface
        # instead of being silently swallowed.
        pass


# Pairwise contour plots over five of the tuned hyperparameters.
fig = plot_contour(study, params=[DEPTH, RAND, SUB, L2_LEAF_REG, LEAF_EST])
fig.update_layout({"width": 1200, "height": 1200})
# for_each_trace returns the figure, so it is rendered as the cell output.
fig.for_each_trace(update_colorscale)

<h1 id="select" style="font-family:verdana;"> 
Selecting only the N last trials
</h1>

<div style="font-size:15px; font-family:verdana;">
    <p>
        <ul>
            <li> The first trials had quite low objective values, which makes the plots difficult to analyze </li>
            <li> Starting again using only the N last trials </li>
        </ul>
    </p>
</div>

In [None]:
N_trials = 60

n_last_name = 'N_last'
n_last_storage = r"sqlite:///study2.db"

# study.trials reads (and copies) every trial from storage on each access, so
# fetch the list once instead of once per selected index.
all_trials = study.trials
last_trials = all_trials[-N_trials:]
# Positions of the selected trials within the full trial list.
N_last_ls = list(range(len(all_trials)))[-N_trials:]

# study2.db persists between runs and load_if_exists=True would reopen the
# previous 'N_last' study, so re-running this cell used to append the same
# trials again. Drop any earlier copy first to keep the cell idempotent.
try:
    optuna.delete_study(study_name=n_last_name, storage=n_last_storage)
except KeyError:
    # No such study stored yet (first run): nothing to delete.
    pass

study_N_last = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(),
    study_name=n_last_name,
    storage=n_last_storage,
    load_if_exists=True,
)
study_N_last.add_trials(last_trials)

In [None]:
# Optimization-history figure restricted to the N_last sub-study.
plot_optimization_history(study_N_last)

In [None]:
# Hyperparameter-importance figure restricted to the N_last sub-study.
plot_param_importances(study_N_last)

In [None]:
def update_colorscale(trace):
    """Switch a trace to the "Tealrose" colorscale.

    Traces whose type rejects the ``colorscale`` property are left untouched.
    This version only sets ``colorscale``; ``reversescale`` keeps whatever
    value the trace already has.
    """
    try:
        trace["colorscale"] = "Tealrose"
    except ValueError:
        # plotly raises ValueError when a property is not valid for the trace
        # type; any other exception is a real problem and should surface
        # instead of being silently swallowed.
        pass


# Pairwise contour plots for the N_last sub-study.
fig = plot_contour(study_N_last, params=[DEPTH, LEARNING_RATE, SUB, L2_LEAF_REG, RAND])
fig.update_layout({"width": 1200, "height": 1200})
# for_each_trace returns the figure, so it is rendered as the cell output.
fig.for_each_trace(update_colorscale)

   
<h1 id="conclusion" style="font-family:verdana;"> 
Conclusion
</h1>

<div style="font-size:15px; font-family:verdana;">
    <p> This last graph gives me some intuition on how to continue optimizing: </p>
        <ul>
            <li> the "right"  tree-depth for this problem might be 3 !! so I will freeze this parameter </li>
            <li> best results are obtained with small learning rate, I should allow this parameter to go further down </li>
            <li> try to focus the subsample parameter on smaller value</li>
    </ul>
    <code> param_grid = {
        'iterations': trial.suggest_categorical('iterations',[N_ESTIMATORS]),
        'learning_rate' : trial.suggest_uniform('learning_rate', 5e-4, 5e-3),
        'depth': 3,
        'l2_leaf_reg': trial.suggest_uniform('l2_leaf_reg', 1e-5,100),
        'subsample': trial.suggest_uniform('subsample',0,0.5),
        'random_strength' : trial.suggest_uniform('random_strength', 1, 50),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15)
        }</code> 
     
</div>

<div style="font-size:15px; font-family:verdana;">
    <p> Happy tuning! Please upvote if you find this notebook useful </p>
</div> 

<div style="font-size:15px; font-family:verdana;">