In [24]:
import sys
sys.path.append('/Users/shuyangdu/Desktop/ZillowChallenge/zillow-kaggle-challenge')
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from data_process.column_schema import (PROPERTIES_RENAME_DICT, TRANSACTION_RENAME_DICT, 
                                        NUMERICAL_COLS, CATEGORICAL_COLS)
from data_process.data_process_pipeline import DataProcessPipeline
from lightgbm import LGBMRegressor
from models.tree_models.lgbm import LGBM
from sklearn.metrics import mean_absolute_error
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load df_merged.csv into a single DataFrame; every later cell derives from it.
# NOTE(review): this is an absolute, machine-specific path, so the notebook cannot
# be re-run on another machine without editing this line.
df_all = pd.read_csv('/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged.csv')

In [27]:
# Build the project's data-processing pipeline with encode_mode='label'
# (presumably label-encodes the categorical columns — confirm in DataProcessPipeline).
data_pipeline = DataProcessPipeline(encode_mode='label')

In [28]:
# Run the pipeline's pre-processing step on the full data set; the result is
# what gets split into folds in the next cell.
df = data_pipeline.pre_process(df_all)

In [29]:
# Create the fold iterator over the pre-processed frame.
# NOTE(review): the meaning of the literal 52 (random seed vs. number of folds)
# is not visible from this notebook — confirm against DataProcessPipeline.k_fold.
iterator = data_pipeline.k_fold(df, 52)

In [30]:
# Take the first (train, validation) pair from the fold iterator.
# The built-in next() replaces the Python 2-only `.next()` method so this cell
# runs unchanged under both Python 2 and Python 3.
df_train, df_val = next(iterator)

In [31]:
# Post-process each split separately: is_train=True for the training split and
# is_train=False for the validation split (presumably so anything fitted comes
# from the training data only — confirm in post_process).
# NOTE(review): both names are rebound to the processed frames, so re-running this
# cell feeds already-processed data back into post_process.
df_train = data_pipeline.post_process(df_train, is_train=True)
df_val = data_pipeline.post_process(df_val, is_train=False)

In [32]:
# Split each frame into a feature matrix (final_feature_cols) and a label
# vector (label_col), both taken through `.values`.
X_train, y_train = (
    df_train[data_pipeline.final_feature_cols].values,
    df_train[data_pipeline.label_col].values,
)
X_val, y_val = (
    df_val[data_pipeline.final_feature_cols].values,
    df_val[data_pipeline.label_col].values,
)

# Construct Model

In [33]:
# Hyper-parameters shared by both model constructions in this notebook
# (the LGBM wrapper and the plain LGBMRegressor).
# NOTE(review): the LGBMRegressor repr printed in this notebook lists native aliases
# next to the sklearn-style names with different values (e.g. min_data=500 with
# min_child_samples=10, lambda_l2=5 with reg_lambda=0) — confirm which value the
# installed LightGBM version actually applies.
params = dict(
    max_bin=100,                # maximum number of histogram bins per feature
    learning_rate=0.05,         # shrinkage applied to each boosting round
    boosting_type='gbdt',       # gradient-boosted decision trees
    objective='regression_l1',  # L1 (absolute error) loss
    feature_fraction=0.9,       # fraction of features sampled per tree
    bagging_fraction=0.9,       # fraction of rows sampled when bagging
    num_leaves=200,             # maximum leaves per tree
    min_data=500,               # LightGBM alias of min_data_in_leaf
    lambda_l2=5,                # L2 regularization strength
    n_estimators=50,            # number of boosting rounds
)

In [34]:
# Instantiate the project's LGBM wrapper with the shared hyper-parameters plus
# the pipeline's column metadata.
# NOTE(review): feature_name comes from original_feature_cols while the training
# matrix X_train is built from final_feature_cols — confirm the two lists line up.
model = LGBM(
    feature_name=data_pipeline.original_feature_cols,
    categorical_feature=data_pipeline.categorical_cols,
    **params
)

In [35]:
# Fit the current `model` on the training arrays (no validation monitoring here).
model.fit(X_train, y_train)

In [10]:
# Rebind `model` to a plain LGBMRegressor built from the same hyper-parameters.
# This replaces whatever `model` referred to before, so later cells act on this estimator.
model = LGBMRegressor(**params)

In [11]:
# Fit on the training arrays, passing the validation arrays as eval_set with the
# 'l1' metric; the per-iteration "valid_0's l1" lines are this cell's output.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='l1')

[1]	valid_0's l1: 0.0671474
[2]	valid_0's l1: 0.0670616
[3]	valid_0's l1: 0.066982
[4]	valid_0's l1: 0.0669204
[5]	valid_0's l1: 0.0668443
[6]	valid_0's l1: 0.0667752
[7]	valid_0's l1: 0.0667347
[8]	valid_0's l1: 0.0666934
[9]	valid_0's l1: 0.0666488
[10]	valid_0's l1: 0.0666155
[11]	valid_0's l1: 0.066592
[12]	valid_0's l1: 0.0665599
[13]	valid_0's l1: 0.0665264
[14]	valid_0's l1: 0.0665024
[15]	valid_0's l1: 0.0664755
[16]	valid_0's l1: 0.0664528
[17]	valid_0's l1: 0.0664354
[18]	valid_0's l1: 0.0664192
[19]	valid_0's l1: 0.0663977
[20]	valid_0's l1: 0.0663751
[21]	valid_0's l1: 0.0663699
[22]	valid_0's l1: 0.0663579
[23]	valid_0's l1: 0.0663349
[24]	valid_0's l1: 0.0663293
[25]	valid_0's l1: 0.0663091
[26]	valid_0's l1: 0.0663112
[27]	valid_0's l1: 0.0663113
[28]	valid_0's l1: 0.0662896
[29]	valid_0's l1: 0.0662922
[30]	valid_0's l1: 0.066274
[31]	valid_0's l1: 0.0662652
[32]	valid_0's l1: 0.0662617
[33]	valid_0's l1: 0.0662598
[34]	valid_0's l1: 0.0662567
[35]	valid_0's l1: 0.06623

LGBMRegressor(bagging_fraction=0.9, boosting_type='gbdt', colsample_bytree=1,
       feature_fraction=0.9, lambda_l2=5, learning_rate=0.05, max_bin=100,
       max_depth=-1, min_child_samples=10, min_child_weight=5,
       min_data=500, min_split_gain=0, n_estimators=50, nthread=-1,
       num_leaves=200, objective='regression_l1', reg_alpha=0,
       reg_lambda=0, seed=0, silent=True, subsample=1,
       subsample_for_bin=50000, subsample_freq=1)

In [12]:
# Predict on the validation feature matrix with the current `model`.
y_pred = model.predict(X_val)

In [13]:
# Mean absolute error on the validation split; shown as the cell's output.
mean_absolute_error(y_val, y_pred)

0.066213533816818729

In [24]:
# Positions of the categorical columns within the feature list.
# The list is taken from final_feature_cols — the same attribute used to build
# X_train / X_val — so each index refers to the matching column of those matrices.
categorical_col_idx = [
    idx
    for idx, col in enumerate(data_pipeline.final_feature_cols)
    if col in CATEGORICAL_COLS
]

In [25]:
# Display the computed categorical column positions.
categorical_col_idx

[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]

In [14]:
# Refit on the training DataFrame (rather than the NumPy array), passing the
# categorical columns by name via categorical_feature.
# NOTE(review): this cell uses CATEGORICAL_COLS while the LGBM wrapper cell uses
# data_pipeline.categorical_cols — confirm the two lists agree.
model.fit(df_train[data_pipeline.final_feature_cols], y_train,
          categorical_feature=CATEGORICAL_COLS)

LGBMRegressor(bagging_fraction=0.9, boosting_type='gbdt', colsample_bytree=1,
       feature_fraction=0.9, lambda_l2=5, learning_rate=0.05, max_bin=100,
       max_depth=-1, min_child_samples=10, min_child_weight=5,
       min_data=500, min_split_gain=0, n_estimators=50, nthread=-1,
       num_leaves=200, objective='regression_l1', reg_alpha=0,
       reg_lambda=0, seed=0, silent=True, subsample=1,
       subsample_for_bin=50000, subsample_freq=1)

In [36]:
# Predict on the validation feature matrix with the current `model`.
# NOTE(review): the preceding fit received a DataFrame while this call passes the
# NumPy array X_val; both come from final_feature_cols. The execution counts are
# out of order (this cell is In [36], the fit above is In [14]), so the stored
# output may belong to a different `model` — re-run top to bottom to confirm.
y_pred = model.predict(X_val)

In [37]:
# Mean absolute error on the validation split for the predictions above;
# shown as the cell's output.
mean_absolute_error(y_val, y_pred)

0.066071034977829346