# Submission
Make final predictions for submission using the best model.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK
from data_process.data_transform_processor import DataTransformProcessor
from models.model_flow import ModelFlow
from schema import columns_added_filled
from schema import columns_added
from models.nn_models.dnn import DNN
from lightgbm import LGBMRegressor
import lightgbm
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge
from models.tree_models.lgbm import LGBM
from models.backtest import BackTest
from mlens.ensemble import SuperLearner
from sklearn.metrics import mean_absolute_error
from models.util import get_hyper_params
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


# Load data

In [2]:
# Merged training table: the feature columns plus the label column that is read
# from it when X_all / y_all are built.
df_all = pd.read_csv('/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged_20171008.csv')

In [3]:
# Engineered property table; every row is scored once per prediction month.
# NOTE(review): the saved output of this cell is the tail of a warning raised during
# the read - presumably a pandas DtypeWarning about mixed-type columns; confirm and
# consider passing explicit dtypes.
df_properties = pd.read_csv('/Users/shuyangdu/Desktop/ZillowChallenge/data/properties_2017_engineered_20171008.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Add a placeholder label column (all zeros) so the properties frame can be passed
# through the data processor; these values are not real targets.
df_properties['logerror'] = 0

In [5]:
# Submission template: the prediction loop fills every column after the first one,
# matching rows on the 'ParcelId' column.
df_submission = pd.read_csv('/Users/shuyangdu/Desktop/ZillowChallenge/data/raw_data/sample_submission.csv')

In [17]:
# Sanity check: (rows, columns) of the submission template.
df_submission.shape

(2985217, 7)

# Pre-process training data

## Candidate Data Processors

In [7]:
# Two pre-processing variants built over the same column schema:
#   'dummy' - switches on use_dummy and use_scale
#   'tree'  - keeps the processor's default options
data_processor_dct = {
    name: DataTransformProcessor(
        numerical_cols=columns_added.NUMERICAL_COLS,
        categorical_cols=columns_added.CATEGORICAL_COLS,
        log_cols=columns_added.LOG_COLS,
        label_col=columns_added.LABEL_COL,
        **extra_options
    )
    for name, extra_options in (
        ('dummy', {'use_dummy': True, 'use_scale': True}),
        ('tree', {}),
    )
}

In [8]:
# Build the training matrix with the 'tree' processor, then take the label values
# straight from the merged frame.
# NOTE(review): statement order kept as-is in case pre_process touches df_all - confirm.
X_all = data_processor_dct['tree'].pre_process(df_all)
y_all = df_all[columns_added.LABEL_COL].values

In [9]:
# Sanity check: (rows, columns) of the processed training matrix.
X_all.shape

(167888, 81)

# Train model

## Candidate Models

## ElasticNet

In [10]:
# Read n=1 ElasticNet parameter set from the saved hyper-parameter search results
# and build one estimator per returned set.
eln_params_lst = get_hyper_params(
    '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt/eln_added_features_201701010.csv',
    n=1,
)

model_eln_lst = []
for params in eln_params_lst:
    model_eln_lst.append(ElasticNet(**params))

## LightGBM

In [11]:
# Read n=5 GBDT parameter sets from the saved search results. The boosting type,
# the L1 objective and the categorical column indices of the 'tree' processor are
# passed as fixed parameters.
gbdt_params_lst = get_hyper_params(
    '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt/gbdt_l1_added_features_201701014.csv',
    n=5,
    fixed_params=dict(
        boosting_type='gbdt',
        objective='regression_l1',
        categorical_feature=data_processor_dct['tree'].categorical_col_idx,
    ),
)

# One LGBM wrapper per parameter set.
model_gbdt_lst = []
for params in gbdt_params_lst:
    model_gbdt_lst.append(LGBM(**params))

In [12]:
# Read n=2 DART parameter sets from the saved search results. The boosting type,
# the L1 objective and the categorical column indices of the 'tree' processor are
# passed as fixed parameters.
dart_params_lst = get_hyper_params(
    '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt/dart_l1_added_features_201701009.csv',
    n=2,
    fixed_params=dict(
        boosting_type='dart',
        objective='regression_l1',
        categorical_feature=data_processor_dct['tree'].categorical_col_idx,
    ),
)

# One LGBM wrapper per parameter set.
model_dart_lst = []
for params in dart_params_lst:
    model_dart_lst.append(LGBM(**params))

## Stacking (Super Learner)

In [14]:
# Stacked ensemble: the 'tree' group (GBDT + DART models) and the 'numeric' group
# (ElasticNet) each get the data processor listed under the same key, and a Ridge
# regression is added as the meta learner.
preprocessing_dict = dict(
    tree=[data_processor_dct['tree']],
    numeric=[data_processor_dct['dummy']],
)
estimator_dict = dict(
    tree=model_gbdt_lst + model_dart_lst,
    numeric=model_eln_lst,
)

ens = SuperLearner(
    folds=2,
    verbose=1,
    backend='threading',
    scorer=mean_absolute_error,
)
ens.add(estimators=estimator_dict, preprocessing=preprocessing_dict)

# Kept as the last expression so the cell still displays the assembled ensemble.
ens.add_meta(Ridge())

SuperLearner(array_check=2, backend='threading', folds=2,
       layers=LayerContainer(backend='threading',
        layers=OrderedDict([('layer-1', Layer(cls='stack', cls_kwargs=None, dtype=<type 'numpy.float32'>,
   estimators={'tree': [('lgbm-1', <models.tree_models.lgbm.LGBM object at 0x1267232d0>), ('lgbm-2', <models.tree_models.lgbm.LGBM object at 0x1267235d...bsolute_error at 0x11817e758>, verbose=1))]),
        n_jobs=-1, raise_on_exception=True, verbose=1),
       n_jobs=-1, raise_on_exception=True, random_state=None,
       scorer=<function mean_absolute_error at 0x11817e758>, shuffle=False,
       verbose=1)

In [None]:
%%time
# Fit the whole stack on the full training set. The saved output of this cell
# reports roughly 29.5 minutes of wall time.
ens.fit(X_all, y_all)


Fitting 2 layers

[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    8.4s finished
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 28.7min finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.2s finished
Fit complete | 00:29:31



CPU times: user 2h 44min 16s, sys: 1h 3min, total: 3h 47min 16s
Wall time: 29min 32s


SuperLearner(array_check=2, backend='threading', folds=2,
       layers=LayerContainer(backend='threading',
        layers=OrderedDict([('layer-1', Layer(cls='stack', cls_kwargs=None, dtype=<type 'numpy.float32'>,
   estimators={'tree': [('lgbm-1', <models.tree_models.lgbm.LGBM object at 0x1267232d0>), ('lgbm-2', <models.tree_models.lgbm.LGBM object at 0x1267235d...bsolute_error at 0x11817e758>, verbose=1))]),
        n_jobs=-1, raise_on_exception=True, verbose=1),
       n_jobs=-1, raise_on_exception=True, random_state=None,
       scorer=<function mean_absolute_error at 0x11817e758>, shuffle=False,
       verbose=1)

In [18]:
# Use the fitted ensemble as the predictor consumed by the prediction loop.
model_flow = ens

## Single Model (Model Flow)

In [22]:
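# Alternative to the stacked ensemble: wrap a single model in ModelFlow (disabled).
# NOTE: `model_lgbm` and `data_processor` are not defined in any cell shown above;
# define them before re-enabling this line.
# model_flow = ModelFlow(model=model_lgbm, data_processor=data_processor)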

In [23]:
# model_flow.fit(X_all, y_all)

# Make prediction

In [19]:
# Preview the template before prediction: the month columns still hold placeholder values.
df_submission.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


In [None]:
# Fill every prediction column of the submission frame, one target month at a time.
# Column labels are 'YYYYMM' strings (e.g. '201610'); the first column is skipped
# because it holds the parcel id, not a prediction.
for col in df_submission.columns[1:]:
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print('Predict for {}'.format(col))

    # Work on a copy so the date features never leak back into df_properties.
    df_pred = df_properties.copy()
    # NOTE(review): both values are assigned as strings ('10', '2016') sliced from the
    # column label; confirm the processor saw the same dtype for these columns in the
    # training frame.
    df_pred['transaction_month'] = col[-2:]
    df_pred['transaction_year'] = col[:-2]

    print('Start pre-processing...')
    # fit=False: reuse the processor state instead of fitting on the prediction data.
    X_pred = data_processor_dct['tree'].pre_process(df_pred, fit=False)
    print('Pre-processing finished, start predicting...')
    y_pred = model_flow.predict(X_pred)

    print('Line up predictions')
    # Key the predictions by parcel id, then left-join onto the template so the
    # result follows the template's row order.
    y_pred = pd.DataFrame(y_pred, index=df_properties['id_parcel'], columns=['pred'])
    df_merge = pd.merge(df_submission, y_pred, how='left', left_on='ParcelId', right_index=True)

    # Plain column assignment aligns on the row index and replaces the placeholder
    # column outright, so the predictions do not have to fit its original dtype.
    df_submission[col] = df_merge['pred']
    print('{} finished'.format(col))

Predict for 201610
Start pre-processing...
Pre-processing finished, start predicting...



Predicting with 2 layers

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 21.4min remaining: 32.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 46.8min finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
Prediction complete | 00:47:05



Line up predictions
201610 finished
Predict for 201611
Start pre-processing...
Pre-processing finished, start predicting...



Predicting with 2 layers

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 19.2min remaining: 28.8min


# Create submission file

In [None]:
# Preview the filled-in submission before writing it to disk.
df_submission.head()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
df_submission.to_csv('/Users/shuyangdu/Desktop/ZillowChallenge/submission/ens_20171014.csv', index=False)

# Save parameters

In [46]:
# Display the first-layer estimators, grouped by preprocessing case, with their parameters.
ens.layer_1.estimators

{'numeric': [('elasticnet',
   ElasticNet(alpha=90, copy_X=True, fit_intercept=True, l1_ratio=0.85,
         max_iter=1000, normalize=False, positive=False, precompute=False,
         random_state=None, selection='cyclic', tol=0.0001, warm_start=False))],
 'tree': [('lgbmregressor-1',
   LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
          categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
          colsample_bytree=1, feature_fraction=0.94, lambda_l2=86.9,
          learning_rate=0.0116, max_bin=80, max_depth=-1,
          min_child_samples=10, min_child_weight=5, min_split_gain=0,
          n_estimators=250, nthread=1, num_leaves=110,
          objective='regression_l1', reg_alpha=0, reg_lambda=0, seed=0,
          silent=True, subsample=1, subsample_for_bin=50000, subsample_freq=1)),
  ('lgbmregressor-2',
   LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
          categorical_f

In [42]:
# Destination for the text dump of the ensemble's estimator parameters.
# NOTE(review): the date stamp here (20170910) differs from the submission file
# written by this notebook (ens_20171014.csv) - confirm this is the intended name.
path_params = '/Users/shuyangdu/Desktop/ZillowChallenge/submission/params_20170910.txt'

In [45]:
# Persist the repr of the first-layer estimators (model classes plus their
# parameters) as plain text. The handle is not named `file`, so the Python 2
# builtin of that name is not shadowed in the notebook namespace.
with open(path_params, 'w') as params_file:
    params_file.write(str(ens.layer_1.estimators))