In [1]:
import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK
from data_process.column_schema import (PROPERTIES_RENAME_DICT, TRANSACTION_RENAME_DICT,
                                        NUMERICAL_COLS, CATEGORICAL_COLS)
from data_process.data_process_pipeline import DataProcessPipeline
from models.nn_models.dnn import DNN
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
from models.tree_models.lgbm import LGBM
from models.hyperparameter_opt import HyperParameterOpt
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


In [1]:
import lightgbm

In [2]:
# Display the installed LightGBM version (the recorded output for this cell is '2.0.1').
lightgbm.__version__

'2.0.1'

# Prepare data

In [2]:
# Load df_merged.csv into a DataFrame.
# The directory defaults to the original author's local folder; set the
# ZILLOW_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get('ZILLOW_DATA_DIR', '/Users/shuyangdu/Desktop/ZillowChallenge/data')
df_all = pd.read_csv(os.path.join(DATA_DIR, 'df_merged.csv'))

In [3]:
# Build the project's data-processing pipeline. The same instance is later passed to
# HyperParameterOpt and supplies the feature / categorical column lists used in the
# LightGBM fixed parameters.
# NOTE(review): encode_mode='label' presumably selects label encoding of categorical
# columns — confirm in DataProcessPipeline.
data_pipeline = DataProcessPipeline(encode_mode='label')

In [4]:
# Run the pipeline's pre_process step on the raw frame; `df` is the frame handed to
# hyper_opt.optimize further down.
df = data_pipeline.pre_process(df_all)

# Define search space

## Search space for DNN

In [None]:
# hyperopt search space for the DNN model.
# hp.loguniform takes (label, low, high) on the natural-log scale, so k*np.log(10)
# corresponds to a bound of 10**k on the sampled value.
space = {
    'learning_rate': hp.loguniform('learning_rate', -4*np.log(10), -1*np.log(10)),  # 1e-4 ~ 1e-1
    'reg': hp.loguniform('reg', -4*np.log(10), -1*np.log(10)),  # 1e-4 ~ 1e-1
    # The bounds for 'decay' were previously given high-first (-1, -3); they are now
    # ordered low, high like the other entries.
    'decay': hp.loguniform('decay', -3*np.log(10), -1*np.log(10)),  # 1e-3 ~ 1e-1
#     'dim_hidden_lst': hp.choice('dim_hidden_lst', [(20,), (30,)]),
}

## Search space for LightGBM

In [21]:
# hyperopt search space for LightGBM.
# Integer-valued parameters are built by shifting and scaling hp.randint(label, n),
# which draws an integer from 0 to n-1; the trailing comment on each line gives the
# resulting range of the parameter.
space = {
    'learning_rate': hp.loguniform('learning_rate', -2*np.log(10), -1*np.log(10)),  # 1e-2 ~ 1e-1
    'n_estimators': 50 * (4 + hp.randint('n_estimators', 7)),  # 200 ~ 500
    'num_leaves': 10 * (5 + hp.randint('num_leaves', 16)),  # 50 ~ 200
    'lambda_l2': hp.loguniform('lambda_l2', -3*np.log(10), 2*np.log(10)),  # 1e-3 ~ 1e2
    'feature_fraction': hp.uniform('feature_fraction', 0.8, 1.0),  # 0.8 ~ 1.0
    'bagging_freq': 10 * (2 + hp.randint('bagging_freq', 7)),  # 20 ~ 80
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),  # 0.7 ~ 1.0
    'max_bin': 40 * (1 + hp.randint('max_bin', 5)),  # 40 ~ 200
}
                              
# Parameters that are held constant rather than searched over; this dict is what the
# optimizer cell passes as `fixed_params`. The feature names and the categorical subset
# are read from the data pipeline.
# NOTE(review): 'regression_l1' is presumably LightGBM's L1 (absolute error) objective —
# confirm against the LightGBM parameter docs for the installed version.
fixed_params = {
    'objective': 'regression_l1',
    'feature_name': data_pipeline.original_feature_cols,
    'categorical_feature': data_pipeline.categorical_cols,
}

## Search space for Linear Regression ElasticNet

In [5]:
# hyperopt search space for sklearn's ElasticNet; no fixed parameters are needed.
# NOTE(review): in the recorded ElasticNet results the ten listed trials all have
# alpha between ~27 and ~98 with nearly identical loss — worth checking whether the
# model is simply being regularised towards a constant prediction.
space = dict(
    alpha=hp.loguniform('alpha', -2 * np.log(10), 2 * np.log(10)),  # 1e-2 ~ 1e2
    l1_ratio=hp.uniform('l1_ratio', 0.0, 1.0),  # 0.0 ~ 1.0
)
fixed_params = dict()

# Optimize for hyper-parameters

In [11]:
# Wire the model class, the data pipeline and the search space into the optimizer.
# `space` and `fixed_params` are whatever the most recently executed search-space
# cell assigned, so they must match `model_class`.
hyper_opt = HyperParameterOpt(
    model_class=ElasticNet,
    search_space=space,
    fixed_params=fixed_params,
    data_process_pipeline=data_pipeline,
    max_evals=50,
)

In [12]:
# Run the hyper-parameter search on the pre-processed frame.
# NOTE(review): the meaning of the positional argument 52 is not visible in this
# notebook — confirm against HyperParameterOpt.optimize and consider passing it by keyword.
hyper_opt.optimize(df, 52)

In [13]:
# Display the per-trial results table. The recorded output has the columns
# alpha, l1_ratio, loss and status, i.e. it comes from the ElasticNet search.
hyper_opt.trial_results

Unnamed: 0,alpha,l1_ratio,loss,status
0,94.729727,0.867189,0.068467,ok
1,97.822821,0.746634,0.068467,ok
2,82.02739,0.861083,0.068467,ok
3,75.649159,0.87406,0.068468,ok
4,84.911671,0.768019,0.068468,ok
5,54.392289,0.998152,0.068469,ok
6,93.723304,0.486951,0.068469,ok
7,49.928764,0.832652,0.06847,ok
8,56.17391,0.521782,0.068471,ok
9,27.248091,0.918379,0.068471,ok


In [24]:
# Display the per-trial results table. The recorded output has LightGBM columns
# (bagging_fraction, num_leaves, ...), so it comes from a run that used the LightGBM
# search space, not from the ElasticNet run displayed in the preceding cell.
hyper_opt.trial_results

Unnamed: 0,bagging_fraction,bagging_freq,feature_fraction,lambda_l2,learning_rate,max_bin,n_estimators,num_leaves,loss,loss_std,status
0,0.845993,80,0.936884,86.858573,0.011628,80,450,110,0.067135,0.000733,ok
1,0.879819,30,0.879025,1.02361,0.038622,40,450,50,0.067258,0.000737,ok


In [11]:
# Display the per-trial results table (LightGBM columns in the recorded output).
# NOTE(review): the recorded output contains values such as bagging_freq 15 and
# n_estimators 150, which lie outside the LightGBM search space defined in this
# notebook (20 ~ 80 and 200 ~ 500); it is presumably left over from an earlier
# version of that space — re-run or clear this output.
hyper_opt.trial_results

Unnamed: 0,bagging_fraction,bagging_freq,feature_fraction,lambda_l2,learning_rate,max_bin,n_estimators,num_leaves,loss,loss_std,status
0,0.991488,30,0.875060,0.045371,0.021406,40,220,70,0.067425,0.000699,ok
1,0.962986,25,0.936228,13.857093,0.024187,200,220,105,0.067425,0.000727,ok
2,0.959036,40,0.942805,0.003406,0.014963,40,220,90,0.067429,0.000706,ok
3,0.975439,40,0.991875,0.004833,0.016095,40,240,100,0.067431,0.000699,ok
4,0.867274,35,0.932289,0.014961,0.018952,50,220,65,0.067438,0.000695,ok
5,0.960383,35,0.908210,0.136476,0.031466,110,220,65,0.067442,0.000714,ok
6,0.943880,30,0.868552,0.014900,0.027637,30,220,55,0.067444,0.000705,ok
7,0.917334,30,0.885922,0.013964,0.026668,30,220,55,0.067445,0.000710,ok
8,0.885644,15,0.904018,0.259020,0.019683,50,150,95,0.067445,0.000716,ok
9,0.731328,25,0.867275,5.697474,0.039177,80,170,55,0.067451,0.000752,ok


In [10]:
# Persist the trial table as CSV.
# The output directory defaults to the original author's local folder; set the
# ZILLOW_HYPEROPT_DIR environment variable to write somewhere else.
# NOTE(review): the file name says LGBM, but when this notebook is run top to bottom
# hyper_opt is built with model_class=ElasticNet — confirm which model's results are
# being saved before relying on this file.
OUTPUT_DIR = os.environ.get(
    'ZILLOW_HYPEROPT_DIR',
    '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt',
)
hyper_opt.trial_results.to_csv(os.path.join(OUTPUT_DIR, 'LGBM_raw.csv'))