In [1]:
import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK
from data_process.column_schema import (PROPERTIES_RENAME_DICT, TRANSACTION_RENAME_DICT, 
                                        NUMERICAL_COLS, CATEGORICAL_COLS)
from data_process.data_transformer import TransformerNumerical, TransformerCategorical
from data_process.data_transform_processor import DataProcessor
from models.nn_models.dnn import DNN
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge
from models.tree_models.lgbm import LGBM
from mlens.ensemble import SuperLearner
from sklearn.metrics import mean_absolute_error
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


In [2]:
# Location of the merged training CSV. The default is the original author's
# local path; set the ZILLOW_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'ZILLOW_DATA_PATH',
    '/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged.csv'
)
df_all = pd.read_csv(DATA_PATH)

In [3]:
# Run DataProcessor's class-level pre-processing step on the merged frame; the
# result is the feature input later passed to ens.fit / ens.predict.
# NOTE(review): what pre_process does (and whether it mutates df_all) is not
# visible in this notebook -- confirm in data_process.data_transform_processor.
X_all = DataProcessor.pre_process(df_all)

In [4]:
# Regression target: the 'logerror' column of the merged frame as a NumPy array.
y_all = df_all['logerror'].values

In [5]:
# Two processor configurations, one per family of base learners:
# - data_processor_dummy (use_dummy=True, use_scale=True) is the preprocessing
#   step for the 'numeric' branch (ElasticNet) of the ensemble;
# - data_processor (default arguments) is the preprocessing step for the 'tree'
#   branch and supplies categorical_col_idx to the LGBMRegressor constructors.
# NOTE(review): presumably use_dummy one-hot encodes categoricals and use_scale
# standardizes numerics -- confirm against DataProcessor.
data_processor_dummy = DataProcessor(use_dummy=True, use_scale=True)
data_processor = DataProcessor()

In [6]:
# LightGBM settings for the first tree learner (n_estimators=250).
params = dict(
    max_bin=80,
    learning_rate=0.0116,
    boosting_type='gbdt',
    objective='regression_l1',
    feature_fraction=0.94,
    bagging_fraction=0.85,
    bagging_freq=80,
    num_leaves=110,
    lambda_l2=86.9,
    n_estimators=250,
    nthread=1
)
# Categorical column positions are taken from the default DataProcessor.
model_lgbm = LGBMRegressor(categorical_feature=data_processor.categorical_col_idx, **params)

In [7]:
# Second tree learner: the same settings as model_lgbm except for a shorter run
# of 150 estimators. The configuration is derived from `params` (defined in the
# cell that builds model_lgbm) instead of pasting a second copy, so the two
# models cannot drift apart by accident. A separate name is used so `params`
# keeps describing model_lgbm rather than being silently rebound here.
params_lgbm2 = dict(params, n_estimators=150)
model_lgbm2 = LGBMRegressor(
    categorical_feature=data_processor.categorical_col_idx,
    **params_lgbm2
)

In [8]:
# 5-fold SuperLearner ensemble; base learners are scored with mean absolute error.
ens = SuperLearner(
    folds=5,
    verbose=1,
    backend='threading',
    scorer=mean_absolute_error
)

In [9]:
# Preprocessing steps per ensemble branch; the keys ('tree', 'numeric') are the
# same ones used for the estimators.
preprocessing_dict = dict(
    tree=[data_processor],
    numeric=[data_processor_dummy]
)

In [10]:
# Base learners per ensemble branch: two LightGBM regressors on the 'tree'
# branch and one ElasticNet on the 'numeric' branch.
estimator_dict = dict(
    tree=[model_lgbm, model_lgbm2],
    numeric=[ElasticNet(alpha=90, l1_ratio=0.85)]
)

In [11]:
# Register the base layer: estimators and preprocessing are passed as dicts
# sharing the keys 'tree' and 'numeric'.
# The call returns the ensemble itself, which is why its repr is echoed below.
ens.add(estimators=estimator_dict, preprocessing=preprocessing_dict)

SuperLearner(array_check=2, backend='threading', folds=5,
       layers=LayerContainer(backend='threading',
        layers=OrderedDict([('layer-1', Layer(cls='stack', cls_kwargs=None, dtype=<type 'numpy.float32'>,
   estimators={'tree': [('lgbmregressor-1', LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
       categorical_feature=[0, ...bsolute_error at 0x1144dfde8>, verbose=1))]),
        n_jobs=-1, raise_on_exception=True, verbose=1),
       n_jobs=-1, raise_on_exception=True, random_state=None,
       scorer=<function mean_absolute_error at 0x1144dfde8>, shuffle=False,
       verbose=1)

In [12]:
# Add the meta learner: an ordinary LinearRegression as the final (second) layer.
ens.add_meta(LinearRegression())

SuperLearner(array_check=2, backend='threading', folds=5,
       layers=LayerContainer(backend='threading',
        layers=OrderedDict([('layer-1', Layer(cls='stack', cls_kwargs=None, dtype=<type 'numpy.float32'>,
   estimators={'tree': [('lgbmregressor-1', LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
       categorical_feature=[0, ...bsolute_error at 0x1144dfde8>, verbose=1))]),
        n_jobs=-1, raise_on_exception=True, verbose=1),
       n_jobs=-1, raise_on_exception=True, random_state=None,
       scorer=<function mean_absolute_error at 0x1144dfde8>, shuffle=False,
       verbose=1)

In [13]:
# Fit both layers of the ensemble on the full dataset (no hold-out split is made
# in this notebook).
ens.fit(X_all, y_all)


Fitting 2 layers

[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    1.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   24.9s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished
Fit complete | 00:00:29



SuperLearner(array_check=2, backend='threading', folds=5,
       layers=LayerContainer(backend='threading',
        layers=OrderedDict([('layer-1', Layer(cls='stack', cls_kwargs=None, dtype=<type 'numpy.float32'>,
   estimators={'tree': [('lgbmregressor-1', LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
       categorical_feature=[0, ...bsolute_error at 0x1144dfde8>, verbose=1))]),
        n_jobs=-1, raise_on_exception=True, verbose=1),
       n_jobs=-1, raise_on_exception=True, random_state=None,
       scorer=<function mean_absolute_error at 0x1144dfde8>, shuffle=False,
       verbose=1)

In [14]:
# Mean and standard deviation of each base learner's score under the scorer
# given to SuperLearner (mean_absolute_error).
# NOTE(review): presumably aggregated over the 5 folds -- confirm in mlens docs.
ens.scores_

{'score_mean': {('layer-1', 'numeric__elasticnet'): 0.068513830709068357,
  ('layer-1', 'tree__lgbmregressor-1'): 0.06732823645801192,
  ('layer-1', 'tree__lgbmregressor-2'): 0.06746545092378331},
 'score_std': {('layer-1', 'numeric__elasticnet'): 0.0030373080071470198,
  ('layer-1', 'tree__lgbmregressor-1'): 0.0030450210540279368,
  ('layer-1', 'tree__lgbmregressor-2'): 0.0030242851282584388}}

In [15]:
# Ensemble predictions for the same rows the ensemble was fitted on.
y_pred = ens.predict(X_all)


Predicting with 2 layers

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
Prediction complete | 00:00:01



In [16]:
# Sorted distinct predicted values -- a quick look at the prediction range.
np.unique(y_pred)

array([-0.15906987, -0.15872392, -0.15803486, ...,  0.22715664,
        0.22804198,  0.23159331], dtype=float32)

In [17]:
# MAE of the ensemble on the data it was fitted on (ens.fit and ens.predict both
# received X_all), so this is an in-sample figure and likely optimistic compared
# with the cross-validated scores in ens.scores_.
mean_absolute_error(y_all, y_pred)

0.064952949150373432

In [43]:
# Fit the first LightGBM model on its own, as a baseline against the ensemble.
# NOTE(review): the execution count jumps from 17 to 43 here, and the saved repr
# below reports n_estimators=300 while the cell that defines model_lgbm sets 250
# -- the stored output comes from a different kernel state; re-run top to bottom.
# NOTE(review): X_all is passed directly, without the data_processor step that
# the ensemble's 'tree' branch applies -- confirm the inputs are comparable.
model_lgbm.fit(X_all, y_all)

LGBMRegressor(bagging_fraction=0.85, bagging_freq=80, boosting_type='gbdt',
       categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
       colsample_bytree=1, feature_fraction=0.94, lambda_l2=86.9,
       learning_rate=0.0116, max_bin=80, max_depth=-1,
       min_child_samples=10, min_child_weight=5, min_split_gain=0,
       n_estimators=300, nthread=1, num_leaves=110,
       objective='regression_l1', reg_alpha=0, reg_lambda=0, seed=0,
       silent=True, subsample=1, subsample_for_bin=50000, subsample_freq=1)

In [44]:
# Predictions of the standalone LightGBM model on the training rows.
# This overwrites the ensemble predictions previously stored in y_pred.
y_pred = model_lgbm.predict(X_all)

In [45]:
# In-sample MAE of the standalone LightGBM model (y_pred now holds its
# predictions, not the ensemble's).
mean_absolute_error(y_all, y_pred)

0.065426202462288391