# How To Use Bayesian Optimization
In this kernel I'll demonstrate how easy it is to use `hyperopt` for hyperparameter search with Bayesian optimization.
This is not meant to be an in-depth tutorial, but rather a simple notebook that shows how to use this great search method.


I intentionally won't do any EDA or feature extraction on the data.
I'll just one-hot encode the categorical features and run a model!


#### We actually need only 2 things:
1. The parameters' values space to search
2. An objective function to minimize

In [45]:
# Install an "ignore" filter so that no warning is shown for the rest of the kernel.
import warnings
warnings.filterwarnings('ignore')

# Data handling, categorical encoding, model selection and hyper-parameter search.
import numpy as np
import pandas as pd
import category_encoders
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn import metrics
from hyperopt import hp, tpe, fmin, space_eval
import os

# Candidate pipeline steps: feature reduction, the regressor and the scalers.
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.cluster import FeatureAgglomeration

# Seed numpy's global random state once, up front.
np.random.seed(123)


### Load the train data and the test data:

In [46]:
# Read the training set, using the 'ID' column as the row index,
# then preview its first rows.
train = pd.read_csv(
    os.path.join('..', 'input', 'train.csv'),
    index_col='ID',
)
train.head()

Unnamed: 0_level_0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41,...,X345,X346,X347,X348,X349,X350,X351,X352,X353,X354,X355,X356,X357,X358,X359,X360,X361,X362,X363,X364,X365,X366,X367,X368,X369,X370,X371,X372,X373,X374,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,130.81,k,v,at,a,d,u,j,o,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6,88.53,k,t,av,e,d,y,l,o,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
7,76.26,az,w,n,c,d,x,j,x,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
9,80.62,az,t,n,f,d,x,l,e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
13,78.02,az,v,n,f,d,h,d,n,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
# Read the competition test set the same way (indexed by 'ID')
# and preview its first rows.
test = pd.read_csv(
    os.path.join('..', 'input', 'test.csv'),
    index_col='ID',
)
test.head()

Unnamed: 0_level_0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41,X42,...,X345,X346,X347,X348,X349,X350,X351,X352,X353,X354,X355,X356,X357,X358,X359,X360,X361,X362,X363,X364,X365,X366,X367,X368,X369,X370,X371,X372,X373,X374,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,az,v,n,f,d,t,a,w,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,t,b,ai,a,d,b,g,y,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,az,v,as,f,d,a,j,j,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,az,l,n,f,d,z,l,n,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5,w,s,as,c,d,y,i,m,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [48]:
# Summarise the training frame: index range, number of columns, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 0 to 8417
Columns: 377 entries, y to X385
dtypes: float64(1), int64(368), object(8)
memory usage: 12.1+ MB


In [49]:
def test_model(x_train, x_test, y_train, y_test, model):
    """ fit the model and print the train and test result """
    np.random.seed(1)
    model.fit(x_train, y_train)
    print('train score: ', model.score(x_train, y_train))
    print('test score: ', model.score(x_test, y_test))

In [50]:
# Split into X and y, and then into train and test sets.
# random_state is passed explicitly (same value as the notebook's numpy seed) so the
# partition does not depend on how much of numpy's global random stream earlier
# cells have consumed; re-running this cell alone always yields the same split.
X = train.drop('y', axis=1)
y = train['y']
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=123)

One-hot encode the categorical columns in the data:

In [None]:
# One-hot encode the eight categorical columns. The encoder is fitted on the
# training split only and then applied, unchanged, to the held-out split.
one_hot = category_encoders.OneHotEncoder(
    cols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'],
    drop_invariant=True,
    use_cat_names=True,
)
x_train_one_hot = one_hot.fit_transform(x_train)
x_test_one_hot = one_hot.transform(x_test)

#### Test a default-initialized model as a baseline

In [None]:
# Baseline: an SVR with default hyper-parameters, scored on both splits.
test_model(
    x_train=x_train_one_hot,
    x_test=x_test_one_hot,
    y_train=y_train,
    y_test=y_test,
    model=SVR(),
)

Our initial model is not bad; let's do a simple hyperparameter search.

In [None]:
def get_model(args):
    """Build the pipeline described by one point sampled from the search space.

    ``args`` must provide:
      * ``'scaler'`` - a callable that is invoked without arguments;
      * ``'selection'`` - a dict holding ``'selection_algo'`` (a callable) and
        ``'selection_params'`` (its keyword arguments);
      * ``'clf'`` / ``'clf_params'`` - the estimator callable and its keyword arguments.

    Returns a ``Pipeline`` whose steps are named ``scaler``, ``selection`` and
    ``clf``, in that order.
    """
    selection_cfg = args['selection']

    scaler_step = ('scaler', args['scaler']())
    selection_step = (
        'selection',
        selection_cfg['selection_algo'](**selection_cfg['selection_params']),
    )
    estimator_step = ('clf', args['clf'](**args['clf_params']))

    return Pipeline([scaler_step, selection_step, estimator_step])

In [None]:
def objective_func(args, x_train=x_train_one_hot, y_train=y_train):
    """
    Loss handed to the bayesian optimization: the negated mean cross-validated R^2.

    The candidate pipeline described by `args` is scored with a 4-fold cross
    validation on the training data. The optimizer minimizes its objective,
    so the mean test score is returned with its sign flipped.
    """
    np.random.seed(123)
    candidate = get_model(args)
    folds = KFold(n_splits=4)

    scores = cross_validate(
        estimator=candidate,
        X=x_train,
        y=y_train,
        scoring='r2',
        cv=folds,
        n_jobs=-1,
    )
    mean_r2 = scores['test_score'].mean()
    return -mean_r2  # negated: higher R^2 must mean lower loss

#### A few notes about the search space:
- You need to specify a distribution for each parameter.<br/>I often use a uniform distribution when I'm not sure which distribution is the right one (**Do you know a better way? I'll be happy to learn, please leave a comment**).
- I treat the choice of data scaler as a hyperparameter.
- I assume I need some feature selection, but I'm not sure which method will be best.<br/>So I include three different methods, each with its own parameters, and all of this is treated as hyperparameters as well.
- There are more options, and maybe better models, to use.

In [None]:
# Search space handed to hyperopt. Every `hp.*` node is one hyper-parameter; the
# plain values ('clf', the algorithm classes) are passed through unchanged.
search_space = {
    # Which scaler class goes first in the pipeline.
    'scaler': hp.choice('scaler', [StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler]),

    # Feature-reduction step: one of three algorithms, each with its own parameters.
    'selection': hp.choice('selection', [
        {
            'selection_algo': SelectKBest,
            'selection_params': {
                # k is an ordered integer quantity, so it is sampled with uniformint
                # (like n_components / n_clusters below) instead of as an unordered
                # categorical choice. k == number of columns keeps every feature,
                # which is equivalent to k='all'.
                'k': hp.uniformint('k', 1, x_train_one_hot.shape[1]),
                'score_func': hp.choice('score_func', [f_regression, mutual_info_regression]),
            },
        },
        {
            'selection_algo': PCA,
            'selection_params': {'n_components': hp.uniformint('n_components', 1, x_train_one_hot.shape[1])},
        },
        {
            'selection_algo': FeatureAgglomeration,
            'selection_params': {'n_clusters': hp.uniformint('n_clusters', 1, x_train_one_hot.shape[1])},
        },
    ]),

    # The regressor is fixed; only its kernel and regularisation strength are searched.
    'clf': SVR,
    'clf_params': {
        'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear']),
        'C': hp.uniform('C', 0.0001, 30),
    },
}

In [None]:
# hyperopt samples from its own random state rather than numpy's global one, so
# np.random.seed alone does not make the search repeatable. When no `rstate` is
# passed, fmin seeds itself from the HYPEROPT_FMIN_SEED environment variable, which
# avoids depending on whether the installed version expects a RandomState or a Generator.
os.environ['HYPEROPT_FMIN_SEED'] = '123'
np.random.seed(123)

# Run 100 TPE-guided evaluations of the objective, then rebuild the pipeline
# from the best point found.
best_space = fmin(objective_func, space=search_space, algo=tpe.suggest, max_evals=100)
best_model = get_model(space_eval(search_space, best_space))
print(best_model)

In [None]:
# Display the best point found, evaluated against the search space definition.
space_eval(search_space, best_space)

In [None]:
# Refit the tuned pipeline on the training split and score it on both splits.
test_model(
    x_train=x_train_one_hot,
    x_test=x_test_one_hot,
    y_train=y_train,
    y_test=y_test,
    model=best_model,
)

**Great** improvement just by searching over some hyperparameters (100 evaluations, which in my opinion is a low number).

Of course, a simple grid search would find the same parameters as well, and if you are lucky even a random search would; the difference is a question of running time.<br/>
I believe that the Bayesian approach improves on random search and offers a somewhat better search method.

In [None]:
# Run on the real test
# X_one_hot = one_hot.fit_transform(X)
# test_one_hot = one_hot.transform(test)

# best_model.fit(X_one_hot, y)
# pd.DataFrame({'ID':test.index, 'y': best_model.predict(test_one_hot)}).to_csv(r'subs.csv', index=False)