In [12]:
import optuna
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import catboost
from catboost import CatBoostRegressor
from sklearn.metrics import (mean_absolute_error,mean_squared_error,
                            mean_squared_log_error,median_absolute_error)
import numpy as np

In [31]:
study_name = 'price-recommendation_v1'  # Unique identifier of the study.
# The study is persisted in a local SQLite file, so it outlives the kernel.
# load_if_exists=True lets this cell be re-run (e.g. Restart & Run All): the
# stored study is reopened and resumed instead of raising a duplicate-study error.
# The objective returns an error metric (MAE), hence the explicit 'minimize'.
study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///hyperparameter.db',
    direction='minimize',
    load_if_exists=True,
)

[I 2020-01-02 20:21:07,483] A new study created with name: price-recommendation_v1


In [24]:
# Path of the listing-price CSV (note: despite its name, DIR points at a file).
DIR = Path('../listing_price_suggest.csv')

# Load the full table and preview the first rows.
df = pd.read_csv(DIR)
df.head()

Unnamed: 0,primary_key,y_var,x1var_cat,x2var_cat,x3var_cont,x4var_cont,x5var_cont,x6var_cont,x7var_cont,x8var_cont,x9var_cat,x10var_cont,x11var_cont,x12var_cat,x13var_cont,x14var_cont,x15var_cont
0,9,80,1,2,15.9,1.3043,1.13,15.0787,0.0,0.0,0,0.5479,0.0,0,0.0,1.53,0.0
1,57,0,0,1,58.3,0.0,0.0,0.075,0.0,0.0,1,0.6849,1.44,0,0.0,1.53,0.0533
2,105,0,3,1,143.1,0.0,0.0,0.075,0.0,0.0,0,0.1369,0.0,0,0.0,1.53,0.0
3,153,37,3,1,169.6,4.3478,5.65,27.0817,76.59,0.0,0,44.5205,11.1542,0,0.0,0.3642,0.0
4,201,0,0,2,148.4,0.0,0.0,0.075,0.0,0.0,0,0.0,0.0,0,0.0,1.53,0.0


In [37]:
def objective(trial):
    """Optuna objective: train one CatBoost regressor and score it on a hold-out.

    Hyperparameters are sampled from ``trial``; the model is fitted on 70 % of
    the rows of the module-level ``df`` and evaluated on the remaining 30 %.

    Parameters
    ----------
    trial
        The Optuna trial used to sample this run's hyperparameters.

    Returns
    -------
    float
        Mean absolute error on the hold-out split (lower is better).
    """
    # Select features by name instead of by position so the target and the row
    # identifier can never end up among the model inputs if the column layout
    # changes. 'Unnamed: 0' is listed in case the CSV carries a saved index
    # column (errors='ignore' makes its absence harmless) -- TODO confirm.
    non_feature_cols = ['Unnamed: 0', 'primary_key', 'y_var']
    features = df.drop(columns=non_feature_cols, errors='ignore')

    # Fixed seed: every trial is scored on the same split, so differences in
    # the returned value come from the hyperparameters, not from split noise.
    train_x, test_x, train_y, test_y = train_test_split(
        features, df['y_var'], test_size=0.3, random_state=42)

    param = {
        'objective': trial.suggest_categorical('objective', ['MAE','RMSE']),
        'depth': trial.suggest_int('depth', 1, 12),
        'learning_rate':trial.suggest_uniform('learning_rate',0.01,0.9),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type',
                                                    ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit': '3gb'
    }

    # These two knobs are only sampled for the bootstrap type they belong to.
    if param['bootstrap_type'] == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_uniform('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] == 'Bernoulli':
        param['subsample'] = trial.suggest_uniform('subsample', 0.1, 1)

    gbm = CatBoostRegressor(**param)

    # NOTE(review): the same hold-out drives early stopping and the reported
    # score, so the returned MAE is likely optimistic; a separate validation
    # split would give a cleaner estimate.
    gbm.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=0, early_stopping_rounds=100)

    preds = gbm.predict(test_x)
    # The study value is an error (MAE), not an accuracy.
    mae = mean_absolute_error(test_y, preds)
    return mae



In [38]:
# Number of additional trials to run; results accumulate in the persisted study.
N_TRIALS = 15
study.optimize(func=objective, n_trials=N_TRIALS)


[I 2020-01-02 20:22:39,211] Finished trial#3 resulted in value: 42.72380244969867. Current best value is 42.72380244969867 with parameters: {'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'depth': 11, 'learning_rate': 0.4890426144226916, 'objective': 'MAE'}.
[I 2020-01-02 20:22:42,960] Finished trial#4 resulted in value: 50.59155655991029. Current best value is 42.72380244969867 with parameters: {'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'depth': 11, 'learning_rate': 0.4890426144226916, 'objective': 'MAE'}.
[I 2020-01-02 20:23:43,307] Finished trial#5 resulted in value: 42.58572726072716. Current best value is 42.58572726072716 with parameters: {'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'depth': 3, 'learning_rate': 0.6835971672700377, 'objective': 'MAE', 'subsample': 0.37490363384070136}.
[I 2020-01-02 20:24:41,426] Finished trial#6 resulted in value: 46.952868241144465. Current best value is 42.58572726072716 with parameters: {'boosting_type': 'Plain', 'boo

In [39]:
# Export every trial recorded in the study (state, value, timestamps, sampled
# params) as a DataFrame and display it.
# NOTE(review): this rebinds `df`, the name `objective` reads as the listing
# data. Calling study.optimize again after this cell would hand `objective`
# the trial table instead -- reload the CSV first, or give this frame its own
# name (and update the cells below that read `df`).
df = study.trials_dataframe()
df

Unnamed: 0_level_0,number,state,value,datetime_start,datetime_complete,params,params,params,params,params,params,params,system_attrs,system_attrs
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,bagging_temperature,boosting_type,bootstrap_type,depth,learning_rate,objective,subsample,_number,fail_reason
0,0,TrialState.FAIL,,2020-01-02 20:21:11.863237,2020-01-02 20:21:12.418973,,,,6,,RMSE,,0,Setting status of trial#0 as TrialState.FAIL b...
1,1,TrialState.FAIL,,2020-01-02 20:21:38.181381,2020-01-02 20:21:38.818264,,,,7,,MAE,,1,Setting status of trial#1 as TrialState.FAIL b...
2,2,TrialState.FAIL,,2020-01-02 20:22:14.453568,2020-01-02 20:22:15.072724,,,,2,,RMSE,,2,Setting status of trial#2 as TrialState.FAIL b...
3,3,TrialState.COMPLETE,42.723802,2020-01-02 20:22:23.798479,2020-01-02 20:22:39.130715,,Plain,MVS,11,0.489043,MAE,,3,
4,4,TrialState.COMPLETE,50.591557,2020-01-02 20:22:39.215907,2020-01-02 20:22:42.882038,,Plain,Bernoulli,5,0.529281,RMSE,0.607423,4,
5,5,TrialState.COMPLETE,42.585727,2020-01-02 20:22:42.963390,2020-01-02 20:23:43.219132,,Plain,Bernoulli,3,0.683597,MAE,0.374904,5,
6,6,TrialState.COMPLETE,46.952868,2020-01-02 20:23:43.311247,2020-01-02 20:24:41.338719,,Plain,MVS,1,0.013789,MAE,,6,
7,7,TrialState.COMPLETE,44.548957,2020-01-02 20:24:41.431578,2020-01-02 20:25:29.054821,1.913685,Ordered,Bayesian,1,0.384213,MAE,,7,
8,8,TrialState.COMPLETE,41.916973,2020-01-02 20:25:29.147049,2020-01-02 20:26:22.714733,,Ordered,MVS,5,0.226351,MAE,,8,
9,9,TrialState.COMPLETE,50.108205,2020-01-02 20:26:22.825344,2020-01-02 20:26:34.244275,,Plain,MVS,12,0.547259,RMSE,,9,


In [40]:
from pandas import DataFrame
from tabulate import tabulate


# Render the trial table as a pipe-style (Markdown) table.
# When the frame has two-level column labels, tabulate prints each header as a
# raw tuple such as ('params', 'depth'). Join the non-empty levels into a
# single label ("params_depth", "number", ...) on a copy, leaving `df` untouched.
_printable = df.copy()
_printable.columns = [
    '_'.join(str(level) for level in col if level != '') if isinstance(col, tuple) else col
    for col in df.columns
]
print(tabulate(_printable, tablefmt="pipe", headers="keys"))

|    |   ('number', '') | ('state', '')       |   ('value', '') | ('datetime_start', '')     | ('datetime_complete', '')   |   ('params', 'bagging_temperature') | ('params', 'boosting_type')   | ('params', 'bootstrap_type')   |   ('params', 'depth') |   ('params', 'learning_rate') | ('params', 'objective')   |   ('params', 'subsample') |   ('system_attrs', '_number') | ('system_attrs', 'fail_reason')                                                                                                                            |
|---:|-----------------:|:--------------------|----------------:|:---------------------------|:----------------------------|------------------------------------:|:------------------------------|:-------------------------------|----------------------:|------------------------------:|:--------------------------|--------------------------:|------------------------------:|:---------------------------------------------------------------------------------------------------

| 17 |               17 | TrialState.COMPLETE |         42.9598 | 2020-01-02 20:28:46.558330 | 2020-01-02 20:30:10.208094  |                           nan       | Ordered                       | MVS                            |                    10 |                     0.0427293 | MAE                       |                nan        |                            17 | nan                                                                                                                                                        |
