# Submission
Make final predictions for submission using the best model.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from data_process.data_transform_processor import DataTransformProcessor
from data_process.feature_engineer import FeatureEngineer
from models.nn_models.dnn import DNN
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge
from models.tree_models.lgbm import LGBM
from mlens.ensemble import SuperLearner
from models.model_flow import ModelFlow
# from schema.columns_added_filled import LABEL_COL, NUMERICAL_COLS, CATEGORICAL_COLS, LOG_COLS
from schema.columns_added import LABEL_COL, NUMERICAL_COLS, CATEGORICAL_COLS, LOG_COLS
# from schema.columns_original import LABEL_COL, NUMERICAL_COLS, CATEGORICAL_COLS, LOG_COLS
from sklearn.metrics import mean_absolute_error
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


# Load data

In [2]:
# Load the merged frame used for training.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns cannot end up with mixed types.
# NOTE(review): the truncated cell output looks like the tail of a pandas
# DtypeWarning (mixed types) -- confirm that is what this load emitted.
df_all = pd.read_csv(
    '/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged_20170923.csv',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Load the engineered properties frame that predictions are made for.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns cannot end up with mixed types.
# NOTE(review): the truncated cell output looks like the tail of a pandas
# DtypeWarning (mixed types) -- confirm that is what this load emitted.
df_properties = pd.read_csv(
    '/Users/shuyangdu/Desktop/ZillowChallenge/data/properties_2016_engineered_20170923.csv',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Add a fake (all-zero) label column so the properties frame can go through the
# data processing. LABEL_COL is used instead of a hard-coded 'logerror' so the
# placeholder always matches the label column the data processors are
# configured with (label_col=LABEL_COL).
df_properties[LABEL_COL] = 0

In [5]:
# Sample submission file: provides the ParcelId rows and the prediction columns
# that are filled in below.
df_submission = pd.read_csv(
    filepath_or_buffer='/Users/shuyangdu/Desktop/ZillowChallenge/data/raw_data/sample_submission.csv'
)

In [6]:
# Sanity check: (rows, columns) of the submission template.
df_submission.shape

(2985217, 7)

# Pre-process training data

## Candidate Data Processors

In [7]:
# Processor with dummy encoding and scaling enabled; used below for the
# 'numeric' (ElasticNet) pipeline of the ensemble.
data_processor_dummy = DataTransformProcessor(
    label_col=LABEL_COL,
    log_cols=LOG_COLS,
    categorical_cols=CATEGORICAL_COLS,
    numerical_cols=NUMERICAL_COLS,
    use_dummy=True,
    use_scale=True,
)

# Processor with default options; used for the tree models and for all
# explicit pre_process calls in this notebook.
data_processor = DataTransformProcessor(
    label_col=LABEL_COL,
    log_cols=LOG_COLS,
    categorical_cols=CATEGORICAL_COLS,
    numerical_cols=NUMERICAL_COLS,
)

In [8]:
# Build the training inputs: features come from the default data processor,
# labels are read straight from the LABEL_COL column of the training frame.
X_all = data_processor.pre_process(df_all)
y_all = df_all[LABEL_COL].values

# Train model

## Candidate Models

In [9]:
# Hyper-parameters for the primary LightGBM model. An earlier set
# (max_bin=80, learning_rate=0.0116, feature_fraction=0.94,
# bagging_fraction=0.85, bagging_freq=80, num_leaves=110, lambda_l2=86.9,
# n_estimators=450; same boosting_type and objective) was replaced by the
# values below.
params = dict(
    boosting_type='gbdt',
    objective='regression_l1',
    n_estimators=400,
    learning_rate=0.015197,
    num_leaves=170,
    max_bin=160,
    feature_fraction=0.865260,
    bagging_fraction=0.932207,
    bagging_freq=70,
    lambda_l2=78.124466,
)
# Categorical columns are identified by the index list exposed by the processor.
model_lgbm = LGBMRegressor(categorical_feature=data_processor.categorical_col_idx, **params)

In [11]:
# Hyper-parameters for the second LightGBM model (fewer, single-threaded trees).
# Note: this rebinds `params`, which the previous cell used for model_lgbm.
params = dict(
    boosting_type='gbdt',
    objective='regression_l1',
    n_estimators=150,
    learning_rate=0.0116,
    num_leaves=110,
    max_bin=80,
    feature_fraction=0.94,
    bagging_fraction=0.85,
    bagging_freq=80,
    lambda_l2=86.9,
    nthread=1,
)
model_lgbm2 = LGBMRegressor(categorical_feature=data_processor.categorical_col_idx, **params)

## Stacking (Super Learner)

In [12]:
# 5-fold SuperLearner on the threading backend, with mean_absolute_error as
# its scorer.
ens = SuperLearner(
    scorer=mean_absolute_error,
    folds=5,
    backend='threading',
    verbose=1,
)

In [13]:
# Preprocessing pipeline per case name; the keys must match estimator_dict.
preprocessing_dict = dict(
    tree=[data_processor],
    numeric=[data_processor_dummy],
)

In [14]:
# Base learners per case name: two LightGBM models on the 'tree' pipeline and
# an ElasticNet on the 'numeric' pipeline.
estimator_dict = dict(
    tree=[model_lgbm, model_lgbm2],
    numeric=[ElasticNet(l1_ratio=0.85, alpha=90)],
)

In [15]:
# Register the base layer: each case's estimators with its preprocessing.
ens.add(preprocessing=preprocessing_dict, estimators=estimator_dict)

SuperLearner(array_check=2, backend='threading', folds=5,
       layers=LayerContainer(backend='threading',
        layers=OrderedDict([('layer-1', Layer(cls='stack', cls_kwargs=None, dtype=<type 'numpy.float32'>,
   estimators={'tree': [('lgbmregressor-1', LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
       categorical_feature=[0, ...bsolute_error at 0x11917d2a8>, verbose=1))]),
        n_jobs=-1, raise_on_exception=True, verbose=1),
       n_jobs=-1, raise_on_exception=True, random_state=None,
       scorer=<function mean_absolute_error at 0x11917d2a8>, shuffle=False,
       verbose=1)

In [16]:
# Use a plain LinearRegression as the meta learner on top of the base layer.
ens.add_meta(LinearRegression())

SuperLearner(array_check=2, backend='threading', folds=5,
       layers=LayerContainer(backend='threading',
        layers=OrderedDict([('layer-1', Layer(cls='stack', cls_kwargs=None, dtype=<type 'numpy.float32'>,
   estimators={'tree': [('lgbmregressor-1', LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
       categorical_feature=[0, ...bsolute_error at 0x11917d2a8>, verbose=1))]),
        n_jobs=-1, raise_on_exception=True, verbose=1),
       n_jobs=-1, raise_on_exception=True, random_state=None,
       scorer=<function mean_absolute_error at 0x11917d2a8>, shuffle=False,
       verbose=1)

In [17]:
# Fit the stacked ensemble on the pre-processed training matrix and labels.
# NOTE(review): X_all already went through data_processor.pre_process, and the
# 'tree' preprocessing pipeline also lists data_processor -- confirm the
# processor is not being applied twice.
ens.fit(X_all, y_all)


Fitting 2 layers

[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    1.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   25.7s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished
Fit complete | 00:00:30



SuperLearner(array_check=2, backend='threading', folds=5,
       layers=LayerContainer(backend='threading',
        layers=OrderedDict([('layer-1', Layer(cls='stack', cls_kwargs=None, dtype=<type 'numpy.float32'>,
   estimators={'tree': [('lgbmregressor-1', LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
       categorical_feature=[0, ...bsolute_error at 0x11917d2a8>, verbose=1))]),
        n_jobs=-1, raise_on_exception=True, verbose=1),
       n_jobs=-1, raise_on_exception=True, random_state=None,
       scorer=<function mean_absolute_error at 0x11917d2a8>, shuffle=False,
       verbose=1)

In [20]:
# Use the fitted ensemble as the model for the prediction section below.
# NOTE: the "Single Model" section rebinds model_flow; run only one of the two.
model_flow = ens

## Single Model (Model Flow)

In [10]:
# Wrap the first LightGBM model together with the default data processor.
# NOTE: this rebinds model_flow, replacing the SuperLearner assigned to the same
# name in the stacking section -- predictions use whichever assignment ran last.
model_flow = ModelFlow(data_processor=data_processor, model=model_lgbm)

In [11]:
# Train the single-model flow on the pre-processed training matrix and labels.
model_flow.fit(X_all, y_all)

# Make prediction

In [12]:
# Peek at the submission template before any predictions are filled in.
df_submission.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


In [None]:
# Only predict for 201610, 201611 and 201612, since only the month matters;
# the 2017 columns are filled afterwards by copying these three.
for col in df_submission.columns[1:4]:
    print('Predict for {}'.format(col))
    # Add the transaction_month feature from the last two characters of the
    # column name (e.g. '201610' -> 10). It is stored as an int rather than the
    # raw string slice so the column stays numeric instead of becoming object.
    # NOTE(review): presumably the training frame holds numeric months -- confirm.
    df_pred = df_properties.copy()
    df_pred['transaction_month'] = int(col[-2:])

    print('Start pre-processing...')
    # Same processor as used for the training matrix.
    X_pred = data_processor.pre_process(df_pred)
    print('Pre-processing finished, start predicting...')
    y_pred = model_flow.predict(X_pred)

    print('Line up predictions')
    # Index the predictions by parcel id, then left-join them onto the
    # submission rows so each ParcelId picks up its own prediction.
    y_pred = pd.DataFrame(y_pred, index=df_properties['id_parcel'], columns=['pred'])
    df_merge = pd.merge(df_submission, y_pred, how='left', left_on='ParcelId', right_index=True)

    df_submission.loc[:, col] = df_merge['pred']
    print('{} finished'.format(col))

Predict for 201610
Start pre-processing...
Pre-processing finished, start predicting...


In [None]:
# Fill the 2017 columns (positions 4 onward) from the 2016 column three
# positions to the left: 201610 -> 201710, 201611 -> 201711, 201612 -> 201712.
for dst_pos in range(4, df_submission.shape[1]):
    src_pos = dst_pos - 3
    df_submission.iloc[:, dst_pos] = df_submission.iloc[:, src_pos]

# Create submission file

In [None]:
# Inspect the first rows of the filled-in submission before writing it out.
df_submission.head()

In [None]:
# Write the submission without the DataFrame index column.
df_submission.to_csv(
    path_or_buf='/Users/shuyangdu/Desktop/ZillowChallenge/submission/lightgbm_20170924.csv',
    index=False,
)

# Save parameters

In [46]:
# Show the first-layer estimators of the ensemble (requires the stacking
# section to have been run, since it reads `ens`).
ens.layer_1.estimators

{'numeric': [('elasticnet',
   ElasticNet(alpha=90, copy_X=True, fit_intercept=True, l1_ratio=0.85,
         max_iter=1000, normalize=False, positive=False, precompute=False,
         random_state=None, selection='cyclic', tol=0.0001, warm_start=False))],
 'tree': [('lgbmregressor-1',
   LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
          categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
          colsample_bytree=1, feature_fraction=0.94, lambda_l2=86.9,
          learning_rate=0.0116, max_bin=80, max_depth=-1,
          min_child_samples=10, min_child_weight=5, min_split_gain=0,
          n_estimators=250, nthread=1, num_leaves=110,
          objective='regression_l1', reg_alpha=0, reg_lambda=0, seed=0,
          silent=True, subsample=1, subsample_for_bin=50000, subsample_freq=1)),
  ('lgbmregressor-2',
   LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
          categorical_f

In [42]:
# Destination file for the text dump of the ensemble's estimator parameters.
path_params = (
    '/Users/shuyangdu/Desktop/ZillowChallenge/submission/'
    'params_20170910.txt'
)

In [45]:
# Save the string form of the first-layer estimators to path_params.
# The handle is named params_file because `file` would shadow the Python 2
# built-in of the same name for the rest of the session.
with open(path_params, 'w') as params_file:
    params_file.write(str(ens.layer_1.estimators))