# Final Back Test
Full back test: 5-fold cross-validation repeated for 10 different random seeds, used as the final performance evaluation after model selection.

In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
from sklearn.linear_model import ElasticNet

from data_process.data_transform_processor import DataTransformProcessor
from models.backtest import BackTest
from models.model_flow import ModelFlow
from models.nn_models.dnn import DNN
from models.tree_models.lgbm import LGBM
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


# Prepare data

In [2]:
# Location of the merged training data. Set the ZILLOW_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'ZILLOW_DATA_PATH',
    '/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged.csv',
)
df_all = pd.read_csv(DATA_PATH)

# Candidate Data Processors

In [3]:
# Processor with scaling and dummy encoding switched on (PCA off).
data_processor_dummy = DataTransformProcessor(
    use_scale=True,
    use_pca=False,
    use_dummy=True,
)

# Processor with scaling, PCA and dummy encoding all switched off.
data_processor_tree = DataTransformProcessor(
    use_scale=False,
    use_pca=False,
    use_dummy=False,
)

## Data preprocessing

In [5]:
# Run the tree processor's pre-processing step on the merged frame to obtain
# the model inputs.
# NOTE(review): X_all is produced by `data_processor_tree` only; confirm the
# same pre-processed frame is also the right input for models that are paired
# with `data_processor_dummy`.
X_all = data_processor_tree.pre_process(df_all)
# Regression target: the 'logerror' column as an array.
y_all = df_all['logerror'].values

# Candidate Models

## ElasticNet

In [6]:
# Linear candidate: scikit-learn ElasticNet with fixed regularisation settings.
model_elt = ElasticNet(
    alpha=90,
    l1_ratio=0.85,
)

## LightGBM

In [8]:
# Hyper-parameters passed through to the LightGBM candidate model.
params = dict(
    max_bin=80,
    learning_rate=0.0116,
    boosting_type='gbdt',
    objective='regression_l1',
    feature_fraction=0.94,
    bagging_fraction=0.85,
    bagging_freq=80,
    num_leaves=110,
    lambda_l2=86.9,
    n_estimators=450,
)

In [10]:
# Tree candidate: the project's LGBM wrapper, given the categorical column
# indices exposed by the tree data processor plus the hyper-parameters in `params`.
model_lgbm = LGBM(categorical_feature=data_processor_tree.categorical_col_idx, **params)

## NeuralNetwork

In [18]:
# Neural-network candidate: the project's DNN wrapper with one entry (30) in
# the hidden-dimension list and verbose output turned off.
model_dnn = DNN(
    dim_hidden_lst=[30],
    learning_rate=0.01, decay=0.0001,
    batch_size=128, epochs=5,
    verbose=0,
)

# Back test

In [11]:
# Pair the LightGBM candidate with the tree data processor.
# NOTE(review): this flow is bound to `model_lgbm`; any later cell that reuses
# it (or the back test built from it) evaluates LightGBM, not another candidate.
model_flow = ModelFlow(model=model_lgbm, data_processor=data_processor_tree)

In [12]:
# Back-test harness wrapping the model flow defined above.
backtest = BackTest(model_flow=model_flow)

## LightGBM

In [17]:
# Run the full back test for the LightGBM flow; the returned score is shown as
# the cell output. Presumably this is the 5-fold, 10-seed cross-validation
# described in the notebook introduction — confirm in `BackTest.full_cv`.
backtest.full_cv(X_all, y_all)

0.067105600663580456

## ElasticNet

In [10]:
# Build a dedicated back test for ElasticNet. The shared `backtest` object was
# constructed around `model_lgbm`, so calling it here on a fresh
# top-to-bottom run would silently re-score LightGBM under this heading.
# NOTE(review): assumes the linear model should be paired with the scaled,
# dummy-encoded processor (`data_processor_dummy`) — confirm this matches the
# run that produced the recorded score.
backtest_elt = BackTest(
    model_flow=ModelFlow(model=model_elt, data_processor=data_processor_dummy)
)
backtest_elt.full_cv(X_all, y_all)

0.068458717051321921

## NeuralNetwork

In [9]:
# Build a dedicated back test for the neural network. The shared `backtest`
# object was constructed around `model_lgbm`, so calling it here on a fresh
# top-to-bottom run would silently re-score LightGBM under this heading.
# NOTE(review): assumes the network should be paired with the scaled,
# dummy-encoded processor (`data_processor_dummy`) — confirm this matches the
# run that produced the recorded score.
backtest_dnn = BackTest(
    model_flow=ModelFlow(model=model_dnn, data_processor=data_processor_dummy)
)
backtest_dnn.full_cv(X_all, y_all)

0.06844671392965937