# Hyper-parameter Optimization
This notebook uses **HyperParameterOpt**, a wrapper around __[hyperopt](https://hyperopt.github.io/hyperopt/)__, to run hyper-parameter optimization. It uses random search and logs the parameters that have already been searched.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK
# from schema.columns_added_filled import LABEL_COL, NUMERICAL_COLS, CATEGORICAL_COLS, LOG_COLS
from schema.columns_added import LABEL_COL, NUMERICAL_COLS, CATEGORICAL_COLS, LOG_COLS
from data_process.data_transform_processor import DataTransformProcessor
from models.nn_models.dnn import DNN
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
from models.tree_models.lgbm import LGBM
from models.hyperparameter_opt import HyperParameterOpt
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


# Prepare data

In [2]:
# Load the merged dataset (the file name carries the date stamp 20170923) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
merged_csv_path = '/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged_20170923.csv'
df_all = pd.read_csv(merged_csv_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Column roles shared by both processors (the names are imported from schema.columns_added).
column_roles = dict(
    numerical_cols=NUMERICAL_COLS,
    categorical_cols=CATEGORICAL_COLS,
    log_cols=LOG_COLS,
    label_col=LABEL_COL,
)

# Processor with the use_dummy and use_scale flags switched on.
data_processor_dummy = DataTransformProcessor(use_dummy=True, use_scale=True, **column_roles)

# Processor that leaves use_dummy / use_scale at whatever the class defaults are.
data_processor_tree = DataTransformProcessor(**column_roles)

In [4]:
# Features: run the full frame through the tree-model processor.
X_all = data_processor_tree.pre_process(df_all)

# Labels: the `logerror` column as a NumPy array.
# NOTE(review): presumably the same column as LABEL_COL — confirm against the schema module.
target_series = df_all['logerror']
y_all = target_series.values

# Define search space

## Search space for DNN

In [7]:
# Search space for the DNN model.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
# are given in natural-log units and must satisfy low <= high.
# NOTE(review): this cell does not define `fixed_params`, which the optimizer
# cell passes to HyperParameterOpt — make sure it is set before using this space.
space = {
    'learning_rate': hp.loguniform('learning_rate', -4*np.log(10), -1*np.log(10)),  # 1e-4 ~ 1e-1
    'reg': hp.loguniform('reg', -4*np.log(10), -1*np.log(10)),  # 1e-4 ~ 1e-1
    # Bounds ordered as (low, high); they were previously passed as (high, low).
    'decay': hp.loguniform('decay', -3*np.log(10), -1*np.log(10)),  # 1e-3 ~ 1e-1
#     'dim_hidden_lst': hp.choice('dim_hidden_lst', [(20,), (30,)]),
}

## Search space for LightGBM

In [6]:
# Search space for the LightGBM model.
# hp.loguniform bounds are in natural-log units; hp.randint(label, n) draws an
# integer in [0, n), which is then shifted and scaled onto a discrete grid.
# The trailing comment on each line gives the resulting range of the parameter.
space = {
    'learning_rate': hp.loguniform('learning_rate', -2*np.log(10), -1*np.log(10)),  # 1e-2 ~ 1e-1
    'n_estimators': 50 * (4 + hp.randint('n_estimators', 7)),  # 200 ~ 500, step 50
    'num_leaves': 10 * (5 + hp.randint('num_leaves', 16)),  # 50 ~ 200, step 10
    'lambda_l2': hp.loguniform('lambda_l2', -3*np.log(10), 2*np.log(10)),  # 1e-3 ~ 1e2
    'feature_fraction': hp.uniform('feature_fraction', 0.8, 1.0),  # 0.8 ~ 1.0
    'bagging_freq': 10 * (2 + hp.randint('bagging_freq', 7)),  # 20 ~ 80, step 10
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),  # 0.7 ~ 1.0
    'max_bin': 40 * (1 + hp.randint('max_bin', 5)),  # 40 ~ 200, step 40
}
                              
# Parameters that are not searched; they are handed to HyperParameterOpt as `fixed_params`.
fixed_params = {
    'objective': 'regression_l1',
    # presumably the positions of the categorical columns in the processed matrix — TODO confirm
    'categorical_feature': data_processor_tree.categorical_col_idx,
}

## Search space for Linear Regression ElasticNet

In [10]:
# Search space for ElasticNet; loguniform bounds are expressed in natural-log units.
log_of_ten = np.log(10)
space = {
    'alpha': hp.loguniform('alpha', -2*log_of_ten, 2*log_of_ten),  # 1e-2 ~ 1e2
    'l1_ratio': hp.uniform('l1_ratio', 0.0, 1.0),  # 0 ~ 1
}

# No fixed (non-searched) parameters for this model.
fixed_params = dict()

# Optimize for hyper-parameters

In [8]:
# Collect the optimizer settings in one place, then build the optimizer.
# `space` and `fixed_params` are whichever search-space cell was executed last.
optimizer_settings = dict(
    model_class=LGBM,
    data_processor=data_processor_tree,
    search_space=space,
    max_evals=100,
    fixed_params=fixed_params,
)
hyper_opt = HyperParameterOpt(**optimizer_settings)

In [9]:
# Run the search over the full feature matrix and label vector.
# NOTE(review): the meaning of the third positional argument (52) is not visible
# in this notebook — confirm against HyperParameterOpt.optimize and consider
# passing it by keyword.
hyper_opt.optimize(X_all, y_all, 52)

In [10]:
# Show the logged trials; as the cell's last expression it is rendered as a table
# (sampled parameters plus `loss` and `status` columns in the captured output).
hyper_opt.trial_results

Unnamed: 0,bagging_fraction,bagging_freq,feature_fraction,lambda_l2,learning_rate,max_bin,n_estimators,num_leaves,loss,status
0,0.902728,70,0.911443,39.121277,0.014884,40,400,150,0.067017,ok
1,0.932207,70,0.865260,78.124466,0.015197,160,400,170,0.067019,ok
2,0.941794,30,0.868556,69.711741,0.015732,160,400,200,0.067021,ok
3,0.924219,50,0.832095,65.357793,0.015774,160,400,170,0.067023,ok
4,0.991991,30,0.806619,74.221614,0.019042,80,400,110,0.067027,ok
5,0.884564,50,0.836755,66.846924,0.013976,160,350,200,0.067029,ok
6,0.923154,50,0.841135,63.834266,0.016015,160,400,170,0.067033,ok
7,0.999042,20,0.851968,43.220763,0.016536,160,400,170,0.067035,ok
8,0.889429,70,0.971455,97.302619,0.030615,120,450,80,0.067041,ok
9,0.976324,50,0.821371,34.570797,0.016978,160,400,170,0.067042,ok


# Save opt log history

In [11]:
# Persist the trial log as CSV using to_csv's default options.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable output directory.
trial_log_path = '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt/LGBM_added_features_20170923.csv'
hyper_opt.trial_results.to_csv(trial_log_path)