# Hyper-parameter Optimization
This notebook wraps __[hyperopt](https://hyperopt.github.io/hyperopt/)__ in **HyperParameterOpt** to run hyper-parameter optimization. It uses random search and logs every parameter set that has been searched.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK
# from schema.columns_added_filled import LABEL_COL, NUMERICAL_COLS, CATEGORICAL_COLS, LOG_COLS
from schema.columns_added import LABEL_COL, NUMERICAL_COLS, CATEGORICAL_COLS, LOG_COLS
from data_process.data_transform_processor import DataTransformProcessor
from models.nn_models.dnn import DNN
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
from models.tree_models.lgbm import LGBM
from models.hyperparameter_opt import HyperParameterOpt
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


# Prepare data

In [2]:
# Location of the merged training data. This is an absolute path on the
# author's machine; change this one constant when running elsewhere.
MERGED_DATA_CSV = '/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged_20170924.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of
# numbers and strings.
# NOTE(review): the saved output of this cell is the tail of a warning,
# presumably pandas' mixed-type DtypeWarning - confirm it no longer appears.
df_all = pd.read_csv(MERGED_DATA_CSV, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Both processors are configured with the same column roles, so spell them
# out once and unpack them into each constructor call.
column_roles = dict(
    numerical_cols=NUMERICAL_COLS,
    categorical_cols=CATEGORICAL_COLS,
    log_cols=LOG_COLS,
    label_col=LABEL_COL,
)

# Processor with the use_dummy and use_scale options switched on.
data_processor_dummy = DataTransformProcessor(
    use_dummy=True,
    use_scale=True,
    **column_roles
)

# Processor that leaves every other option at the class default; this is the
# one handed to the LightGBM search below.
data_processor_tree = DataTransformProcessor(**column_roles)

In [4]:
# Features: the raw frame passed through the tree-model processor.
X_all = data_processor_tree.pre_process(df_all)

# Target: the 'logerror' column as a plain array.
# NOTE(review): LABEL_COL is imported at the top and presumably names this
# same column - confirm before replacing the literal with it.
y_all = df_all.loc[:, 'logerror'].values

# Define search space

## Search space for DNN

In [7]:
# Search space for the DNN. hp.loguniform(label, low, high) takes its bounds
# in natural-log space with the lower bound first, so k*np.log(10)
# corresponds to 10**k.
space = {
    'learning_rate': hp.loguniform('learning_rate', -4*np.log(10), -1*np.log(10)),  # 1e-4 ~ 1e-1
    'reg': hp.loguniform('reg', -4*np.log(10), -1*np.log(10)),  # 1e-4 ~ 1e-1
    # The bounds used to be passed high-first (-1, -3); they are now ordered
    # low-first like every other entry. The sampled interval is unchanged.
    'decay': hp.loguniform('decay', -3*np.log(10), -1*np.log(10)),  # 1e-3 ~ 1e-1
#     'dim_hidden_lst': hp.choice('dim_hidden_lst', [(20,), (30,)]),
}

## Search space for LightGBM

In [5]:
# Search space for LightGBM.
# - Log-uniform bounds are given in natural-log space: k * ln10 <-> 10**k.
# - hp.randint(label, n) draws an integer in [0, n); the surrounding
#   arithmetic maps it onto an evenly spaced grid.
ln10 = np.log(10)

space = {
    # 1e-2 ~ 1e-1
    'learning_rate': hp.loguniform('learning_rate', -2 * ln10, -1 * ln10),
    # 200 ~ 500 in steps of 50
    'n_estimators': 50 * (4 + hp.randint('n_estimators', 7)),
    # 50 ~ 200 in steps of 10
    'num_leaves': 10 * (5 + hp.randint('num_leaves', 16)),
    # 1e-3 ~ 1e2
    'lambda_l2': hp.loguniform('lambda_l2', -3 * ln10, 2 * ln10),
    # 0.8 ~ 1.0
    'feature_fraction': hp.uniform('feature_fraction', 0.8, 1.0),
    # 20 ~ 80 in steps of 10
    'bagging_freq': 10 * (2 + hp.randint('bagging_freq', 7)),
    # 0.7 ~ 1.0
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),
    # 40 ~ 200 in steps of 40
    'max_bin': 40 * (1 + hp.randint('max_bin', 5)),
}

# Settings held constant across all trials rather than searched.
fixed_params = {
    'objective': 'regression_l1',
    'categorical_feature': data_processor_tree.categorical_col_idx,
}

## Search space for Linear Regression ElasticNet

In [10]:
# Search space for ElasticNet.
ln10 = np.log(10)

space = dict(
    # 1e-2 ~ 1e2, sampled log-uniformly
    alpha=hp.loguniform('alpha', -2 * ln10, 2 * ln10),
    # 0.0 ~ 1.0, sampled uniformly
    l1_ratio=hp.uniform('l1_ratio', 0.0, 1.0),
)

# Nothing is held fixed for this model.
fixed_params = dict()

# Optimize for hyper-parameters

In [8]:
# Every search-space cell above rebinds the same `space` name, so whichever
# of them ran last is what reaches the optimizer. On a plain top-to-bottom
# run that is the ElasticNet space, which does not belong with LGBM.
# Fail fast instead of silently searching the wrong space.
lgbm_only_keys = {'n_estimators', 'num_leaves'}
missing_keys = lgbm_only_keys - set(space)
if missing_keys:
    raise ValueError(
        'search space is missing LightGBM keys {}; re-run the LightGBM '
        'search-space cell before building the optimizer'.format(sorted(missing_keys))
    )

# Optimizer for the LightGBM wrapper: 50 evaluations over `space`, with
# `fixed_params` passed alongside the sampled parameters.
hyper_opt = HyperParameterOpt(
    model_class=LGBM,
    data_processor=data_processor_tree,
    search_space=space,
    max_evals=50,
    fixed_params=fixed_params
)

In [9]:
# Start the optimization on the pre-processed features and the target array.
# NOTE(review): the meaning of the third positional argument (52) is not
# visible in this notebook - confirm it against HyperParameterOpt.optimize and
# pass it by keyword so the intent is readable here.
hyper_opt.optimize(X_all, y_all, 52)

In [10]:
# Display the optimizer's trial table. In the saved output each row is one
# trial: the sampled parameter values followed by its loss and status.
hyper_opt.trial_results

Unnamed: 0,bagging_fraction,bagging_freq,feature_fraction,lambda_l2,learning_rate,max_bin,n_estimators,num_leaves,loss,status
0,0.918951,60,0.936023,81.619435,0.010137,200,200,50,0.067807,ok
1,0.805606,40,0.802998,25.213472,0.014757,160,250,60,0.067878,ok
2,0.841711,40,0.936454,69.623965,0.013682,160,250,50,0.067903,ok
3,0.859552,40,0.937134,48.131611,0.01321,160,200,130,0.067919,ok
4,0.812652,80,0.802884,33.742274,0.014646,40,300,60,0.06792,ok
5,0.915849,60,0.954988,16.444967,0.010136,80,200,130,0.067921,ok
6,0.890311,60,0.981622,98.189493,0.010082,200,400,110,0.067959,ok
7,0.996732,60,0.961364,14.236041,0.012088,80,200,120,0.067974,ok
8,0.997156,60,0.997117,14.121415,0.011973,80,200,120,0.06802,ok
9,0.825608,20,0.878792,8.840671,0.01189,200,250,190,0.068044,ok


# Save opt log history

In [11]:
# Write the trial table to CSV so the search history survives the kernel.
# NOTE(review): to_csv also writes the index by default, and the table shown
# above already carries an 'Unnamed: 0' column - presumably from an earlier
# save/load round trip. Confirm whether index=False is wanted here.
trial_log_csv = '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt/LGBM_added_features_20170923.csv'
hyper_opt.trial_results.to_csv(trial_log_csv)