In [42]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt

from data_process.column_schema import (PROPERTIES_RENAME_DICT, TRANSACTION_RENAME_DICT,
                                        NUMERICAL_COLS, CATEGORICAL_COLS)
from data_process.data_process_pipeline import DataProcessPipeline
from models.hyperparameter_opt import HyperParameterOpt
from models.nn_models.dnn import DNN
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prepare data

In [2]:
# Directory holding df_merged.csv. Set the ZILLOW_DATA_DIR environment variable to
# run the notebook on another machine; the default is the original local location.
DATA_DIR = os.environ.get('ZILLOW_DATA_DIR', '/Users/shuyangdu/Desktop/ZillowChallenge/data')
df_all = pd.read_csv(os.path.join(DATA_DIR, 'df_merged.csv'))

In [3]:
# Build the project's data-processing pipeline with encode_mode='label'.
# The same pipeline object is later handed to HyperParameterOpt.
# NOTE(review): presumably 'label' selects label-encoding of categorical columns —
# confirm against DataProcessPipeline.
data_pipeline = DataProcessPipeline(encode_mode='label')

In [4]:
# Run the pipeline's pre_process step on the raw merged frame; `df` is the frame
# that the hyper-parameter search receives.
df = data_pipeline.pre_process(df_all)

# Define search space

## Search space for DNN

In [None]:
# Search space for the DNN model: each entry is a hyperopt sampling expression.
# hp.loguniform(label, low, high) takes its bounds in log space, so k*np.log(10)
# corresponds to a value of 10**k.
# NOTE: the LightGBM cell below rebinds `space`; run only the search-space cell for
# the model that is being tuned.
space = dict()
# 1e-4 .. 1e-1
space['learning_rate'] = hp.loguniform('learning_rate', -4*np.log(10), -1*np.log(10))
# 1e-4 .. 1e-1
space['reg'] = hp.loguniform('reg', -4*np.log(10), -1*np.log(10))
# Hidden-layer layout is currently excluded from the search.
# space['dim_hidden_lst'] = hp.choice('dim_hidden_lst', [(20,), (30,)])
# 1e-3 .. 1e-1 — bounds are ordered (low, high) as hp.loguniform requires.
space['decay'] = hp.loguniform('decay', -3*np.log(10), -1*np.log(10))

## Search space for LightGBM

In [12]:
# Search space for LightGBM, keyed by the parameter name handed to the model.
# Log-uniform bounds are given in log space (k*np.log(10) <-> 10**k);
# hp.randint(label, n) draws an integer in [0, n), which is then shifted/scaled.
space = {
    # 1e-3 .. 1e-1
    'learning_rate': hp.loguniform('learning_rate', -3*np.log(10), -1*np.log(10)),
    # 10, 20, ..., 50
    'n_estimators': 10 * (1 + hp.randint('n_estimators', 5)),
    # 20, 25, ..., 65
    'num_leaves': 5 * (4 + hp.randint('num_leaves', 10)),
    # 1e-3 .. 1e2
    'lambda_l2': hp.loguniform('lambda_l2', -3*np.log(10), 2*np.log(10)),
    'feature_fraction': hp.uniform('feature_fraction', 0.7, 1.0),
    # 3, 6, ..., 15
    'bagging_freq': 3 * (1 + hp.randint('bagging_freq', 5)),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),
}

# Parameters that are held constant rather than searched over.
fixed_params = dict(objective='regression_l1')

# Optimize for hyper-parameters

In [43]:
# Set up the hyper-parameter search for LGBMRegressor over `space`, with
# `fixed_params` held constant and the pre-processed frame as input.
# NOTE(review): max_evals=2 matches the two trials shown in the results table,
# so this looks like a quick smoke run — raise it for a real search.
hyper_opt = HyperParameterOpt(
    model_class=LGBMRegressor,
    data_process_pipeline=data_pipeline,
    df=df,
    search_space=space,
    fixed_params=fixed_params,
    max_evals=2,
)

In [46]:
# Run the search.
# NOTE(review): the meaning of the positional argument 52 is not visible in this
# notebook (random seed? something else?) — confirm against
# HyperParameterOpt.optimize and prefer a named constant / keyword argument.
hyper_opt.optimize(52)

In [47]:
# Display the per-trial results table: one row per trial with the sampled
# parameters plus loss, loss_std and status columns.
hyper_opt.trial_results

Unnamed: 0,bagging_fraction,bagging_freq,feature_fraction,lambda_l2,learning_rate,n_estimators,num_leaves,loss,loss_std,status
0,0.941095,6,0.940539,0.020274,0.024974,20,55,0.067906,0.000715,ok
1,0.999801,6,0.986725,0.003873,0.002579,50,35,0.06826,0.000708,ok


In [11]:
# NOTE(review): this cell duplicates the display cell above, and its saved output
# comes from an earlier execution (In [11], before the LightGBM search-space cell
# at In [12] was run). The n_estimators / num_leaves / bagging_freq values in that
# output look like unscaled draws from an older search space — re-run or remove
# this cell so the notebook reproduces under Restart & Run All.
hyper_opt.trial_results

Unnamed: 0,bagging_fraction,bagging_freq,feature_fraction,lambda_l2,learning_rate,n_estimators,num_leaves,loss,loss_std,status
0,0.702523,1,0.975571,27.069019,0.053317,4,7,0.067482,0.000701,ok
0,0.700818,1,0.999777,85.617386,0.05511,4,7,0.067485,0.000714,ok
0,0.7022,1,0.993072,59.702786,0.091936,4,7,0.067488,0.000726,ok
0,0.706881,1,0.988042,73.867873,0.089171,4,0,0.067495,0.000722,ok
0,0.727174,1,0.919586,24.224228,0.049505,4,7,0.067497,0.000708,ok
0,0.737501,1,0.925122,20.592141,0.057275,4,7,0.067499,0.000711,ok
0,0.739529,1,0.945776,6.106629,0.090556,4,4,0.067511,0.000744,ok
0,0.71405,1,0.998667,54.440891,0.089012,4,0,0.067512,0.000728,ok
0,0.811183,0,0.902755,1.635136,0.028619,4,9,0.067581,0.000727,ok
0,0.987912,1,0.810555,0.042853,0.03479,4,0,0.067644,0.000728,ok
