# Submission
Make final predictions for submission using the best model.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK
from data_process.data_transform_processor import DataTransformProcessor
from models.model_flow import ModelFlow
from schema import columns_added_filled
from schema import columns_added
from models.nn_models.dnn import DNN
from lightgbm import LGBMRegressor
import lightgbm
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge
from models.tree_models.lgbm import LGBM
from models.backtest import BackTest
from mlens.ensemble import SuperLearner
from sklearn.metrics import mean_absolute_error
from models.util import get_hyper_params
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


# Load data

In [2]:
# Training table: later cells take both the model inputs and the label column from it.
path_df_all = '/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged_20171008.csv'
df_all = pd.read_csv(path_df_all)

In [3]:
# Property-level table; the submission loop copies it once per prediction period.
# NOTE(review): the recorded output of this cell looks like the tail of a warning
# raised during the read (possibly pandas' mixed-dtype warning) - confirm, and if so
# consider passing explicit dtypes.
path_df_properties = '/Users/shuyangdu/Desktop/ZillowChallenge/data/properties_2017_engineered_20171008.csv'
df_properties = pd.read_csv(path_df_properties)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# add fake y for data processing
# The constant 0 is only a placeholder so the frame carries a 'logerror' column when
# copies of it are passed to the data processor in the prediction loop.
# This mutates df_properties in place.
# NOTE(review): presumably 'logerror' is columns_added.LABEL_COL - confirm.
df_properties['logerror'] = 0

In [5]:
# Submission template: a ParcelId column followed by one column per prediction period.
path_sample_submission = '/Users/shuyangdu/Desktop/ZillowChallenge/data/raw_data/sample_submission.csv'
df_submission = pd.read_csv(path_sample_submission)

In [6]:
# Sanity check: (rows, columns) of the submission template.
df_submission.shape

(2985217, 7)

# Pre-process training data

## Candidate Data Processors

In [7]:
# Column configuration shared by every candidate processor.
_shared_col_kwargs = dict(
    numerical_cols=columns_added.NUMERICAL_COLS,
    categorical_cols=columns_added.CATEGORICAL_COLS,
    log_cols=columns_added.LOG_COLS,
    label_col=columns_added.LABEL_COL,
)

# 'dummy' switches on use_dummy and use_scale; 'tree' relies on the processor defaults.
data_processor_dct = {
    'dummy': DataTransformProcessor(use_dummy=True, use_scale=True, **_shared_col_kwargs),
    'tree': DataTransformProcessor(**_shared_col_kwargs),
}

In [8]:
# Features come from the 'tree' processor; labels are read directly from the raw frame.
# NOTE(review): no `fit` argument is passed here, whereas the prediction loop passes
# fit=False - presumably this call fits the processor; confirm in DataTransformProcessor.
tree_processor = data_processor_dct['tree']
X_all = tree_processor.pre_process(df_all)
y_all = df_all[columns_added.LABEL_COL].values

In [9]:
# Sanity check: (rows, feature columns) of the processed training matrix.
X_all.shape

(167888, 81)

# Train model

## Candidate Models

## ElasticNet

In [10]:
# Read ElasticNet parameter sets from the hyper-parameter-opt CSV and build one
# estimator per set.
# NOTE(review): the date suffix in the file name has nine digits - confirm it matches
# the file on disk.
eln_search_csv = '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt/eln_added_features_201701010.csv'
eln_params_lst = get_hyper_params(eln_search_csv, n=1)  # n: number of top models
model_eln_lst = []
for params in eln_params_lst:
    model_eln_lst.append(ElasticNet(**params))

## LightGBM

In [11]:
# Read GBDT parameter sets from the hyper-parameter-opt CSV and build one LGBM per set.
gbdt_search_csv = '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt/gbdt_l1_added_features_201701014.csv'
# Values passed as fixed_params; they show up in every returned parameter dict.
# NOTE(review): categorical_col_idx is presumably populated by the earlier
# pre_process call on the 'tree' processor - confirm.
gbdt_fixed_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'categorical_feature': data_processor_dct['tree'].categorical_col_idx,
}
gbdt_params_lst = get_hyper_params(
    gbdt_search_csv,
    n=5,  # number of top models
    fixed_params=gbdt_fixed_params,
)
model_gbdt_lst = []
for params in gbdt_params_lst:
    model_gbdt_lst.append(LGBM(**params))

In [12]:
# Read DART parameter sets from the hyper-parameter-opt CSV and build one LGBM per set.
dart_search_csv = '/Users/shuyangdu/Desktop/ZillowChallenge/hyper-parameter-opt/dart_l1_added_features_201701009.csv'
# Values passed as fixed_params alongside whatever the CSV provides.
# NOTE(review): categorical_col_idx is presumably populated by the earlier
# pre_process call on the 'tree' processor - confirm.
dart_fixed_params = {
    'boosting_type': 'dart',
    'objective': 'regression_l1',
    'categorical_feature': data_processor_dct['tree'].categorical_col_idx,
}
dart_params_lst = get_hyper_params(
    dart_search_csv,
    n=2,  # number of top models
    fixed_params=dart_fixed_params,
)
model_dart_lst = []
for params in dart_params_lst:
    model_dart_lst.append(LGBM(**params))

## Stacking (Super Learner)

In [None]:
# Stacked ensemble: base estimators are grouped by the preprocessing case they need,
# and a Ridge model is added as the meta learner.
ens = SuperLearner(
    folds=2,
    verbose=1,
    backend='threading',
    scorer=mean_absolute_error,
)

# Keys of the two dicts must agree: each estimator group is paired with the
# preprocessing list registered under the same key.
preprocessing_dict = dict(
    tree=[data_processor_dct['tree']],
    numeric=[data_processor_dct['dummy']],  # comment this out if only use tree models
)

estimator_dict = dict(
    tree=model_gbdt_lst + model_dart_lst,
    numeric=model_eln_lst,  # comment this out if only use tree models
)

ens.add(estimators=estimator_dict, preprocessing=preprocessing_dict)

ens.add_meta(Ridge())

In [None]:
%%time
# Fit the stacked ensemble on the full training set.
# NOTE(review): X_all has already been through data_processor_dct['tree'].pre_process,
# while the ensemble also registers data processors as preprocessing steps - confirm
# the input is not transformed twice.
ens.fit(X_all, y_all)

In [18]:
# Select the stacked ensemble as the model used for prediction.
# NOTE(review): when cells run top to bottom, the "Single Model" section that follows
# rebinds model_flow, so this assignment only takes effect if those cells are skipped.
model_flow = ens

## Single Model (Model Flow)

In [13]:
# Alternative to the ensemble: wrap the first GBDT candidate with the 'tree' processor.
# NOTE(review): this rebinds model_flow, replacing whatever was assigned to it before
# (including the ensemble, if that cell was run) - run only one of the two options.
model_flow = ModelFlow(model=model_gbdt_lst[0], data_processor=data_processor_dct['tree'])

In [14]:
# Train the selected model on the full training set.
# NOTE(review): X_all is already pre-processed while ModelFlow also holds a
# data_processor - confirm ModelFlow.fit does not transform the input again.
model_flow.fit(X_all, y_all)

# Make prediction

In [15]:
# Peek at the template before filling it: the prediction columns are named YYYYMM.
df_submission.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


In [16]:
# Fill every prediction column of the submission template.
# Column names are 'YYYYMM' strings: the last two characters are the month and the
# leading characters are the year. The first column (ParcelId) is skipped.
tree_processor = data_processor_dct['tree']
for col in df_submission.columns[1:]:
    # print(...) with a single argument behaves identically on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print('Predict for {}'.format(col))
    # add feature transaction_month based on prediction date
    # A fresh copy per period keeps df_properties itself untouched.
    df_pred = df_properties.copy()
    # NOTE(review): both values are assigned as strings (e.g. '10', '2016'), while
    # the training CSV was presumably parsed as integers - confirm the data
    # processor treats them the same way.
    df_pred['transaction_month'] = col[-2:]
    df_pred['transaction_year'] = col[:-2]

    print('Start pre-processing...')
    # process data
    # fit=False: reuse the processor state established on the training data.
    X_pred = tree_processor.pre_process(df_pred, fit=False)
    print('Pre-processing finished, start predicting...')
    y_pred = model_flow.predict(X_pred)

    print('Line up predictions')
    # line up predictions
    # Index the predictions by parcel id, then look each ParcelId up directly.
    # This gives the same values as a left merge (NaN for parcels without a
    # prediction) without copying the whole submission frame on every iteration.
    pred_by_parcel = pd.DataFrame(
        y_pred, index=df_properties['id_parcel'], columns=['pred']
    )['pred']
    # Replace the column outright so the integer placeholder column does not
    # constrain the dtype of the float predictions.
    df_submission[col] = df_submission['ParcelId'].map(pred_by_parcel)
    print('{} finished'.format(col))

Predict for 201610
Start pre-processing...
Pre-processing finished, start predicting...
Line up predictions
201610 finished
Predict for 201611
Start pre-processing...
Pre-processing finished, start predicting...
Line up predictions
201611 finished
Predict for 201612
Start pre-processing...
Pre-processing finished, start predicting...
Line up predictions
201612 finished
Predict for 201710
Start pre-processing...
Pre-processing finished, start predicting...
Line up predictions
201710 finished
Predict for 201711
Start pre-processing...
Pre-processing finished, start predicting...
Line up predictions
201711 finished
Predict for 201712
Start pre-processing...
Pre-processing finished, start predicting...
Line up predictions
201712 finished


# Create submission file

In [17]:
# Inspect the filled template: every YYYYMM column should now hold predictions.
df_submission.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.0051,0.005293,0.003346,0.0051,0.005293,0.003346
1,10759547,0.016198,0.013331,0.015504,0.016198,0.013331,0.015504
2,10843547,0.038332,0.04065,0.038176,0.038332,0.04065,0.038176
3,10859147,0.062374,0.064474,0.062457,0.062374,0.064474,0.062457
4,10879947,0.009164,0.010191,0.011813,0.009164,0.010191,0.011813


In [18]:
# Write the filled template without the DataFrame index.
# NOTE(review): the file name starts with 'ens_', but in top-to-bottom order
# model_flow is bound to the single-model ModelFlow when predictions are made -
# confirm which model produced this file.
path_submission = '/Users/shuyangdu/Desktop/ZillowChallenge/submission/ens_20171014.csv'
df_submission.to_csv(path_submission, index=False)

# Save parameters

In [19]:
# Display the first GBDT parameter set (the one passed to the single-model ModelFlow).
gbdt_params_lst[0]

{'bagging_fraction': 0.90164475300000002,
 'bagging_freq': 70,
 'boosting_type': 'gbdt',
 'categorical_feature': [61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80],
 'feature_fraction': 0.81028743000000003,
 'lambda_l2': 3344.7357489999999,
 'learning_rate': 0.019029016999999999,
 'max_bin': 380,
 'n_estimators': 1450,
 'num_leaves': 210,
 'objective': 'regression_l1'}

In [46]:
# Display the base estimators registered in the ensemble's first layer.
# NOTE(review): the execution count of this cell is out of sequence with the rest of
# the notebook, so the recorded output may come from a different session - re-run
# before relying on it.
ens.layer_1.estimators

{'numeric': [('elasticnet',
   ElasticNet(alpha=90, copy_X=True, fit_intercept=True, l1_ratio=0.85,
         max_iter=1000, normalize=False, positive=False, precompute=False,
         random_state=None, selection='cyclic', tol=0.0001, warm_start=False))],
 'tree': [('lgbmregressor-1',
   LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
          categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
          colsample_bytree=1, feature_fraction=0.94, lambda_l2=86.9,
          learning_rate=0.0116, max_bin=80, max_depth=-1,
          min_child_samples=10, min_child_weight=5, min_split_gain=0,
          n_estimators=250, nthread=1, num_leaves=110,
          objective='regression_l1', reg_alpha=0, reg_lambda=0, seed=0,
          silent=True, subsample=1, subsample_for_bin=50000, subsample_freq=1)),
  ('lgbmregressor-2',
   LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
          categorical_f

In [42]:
# Destination of the plain-text parameter dump written by the next cell.
# NOTE(review): the date suffix (20170910) differs from the submission file's
# (20171014) - confirm this is the intended target and not a stale path.
path_params = '/Users/shuyangdu/Desktop/ZillowChallenge/submission/params_20170910.txt'

In [45]:
# Persist the repr of the ensemble's first-layer estimators as a plain-text record.
# The handle is deliberately not named `file`, so the Python 2 builtin of that name
# is not shadowed in the notebook namespace.
# NOTE(review): this records the ensemble's estimators; if the submission was produced
# by the single-model ModelFlow, gbdt_params_lst[0] is the relevant record instead.
with open(path_params, 'w') as params_file:
    params_file.write(str(ens.layer_1.estimators))