In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from data_process.column_schema import (PROPERTIES_RENAME_DICT, TRANSACTION_RENAME_DICT, 
                                        NUMERICAL_COLS, CATEGORICAL_COLS)
from data_process.data_process_pipeline import DataProcessPipeline
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Load data

In [2]:
# Load the merged training table; it is the input to the pipeline's
# pre_process step in the training section.
# NOTE(review): absolute, machine-specific path — only resolves on the
# author's machine.
df_all = pd.read_csv(
    filepath_or_buffer='/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged.csv',
)

In [3]:
# Load the 2016 properties table; it is the input for the prediction section.
# NOTE(review): the stray output line printed after this cell looks like the
# tail of a warning raised during the read (possibly pandas' mixed-dtype
# warning) — confirm, and consider pinning column dtypes if so.
# NOTE(review): absolute, machine-specific path.
df_properties = pd.read_csv(
    filepath_or_buffer='/Users/shuyangdu/Desktop/ZillowChallenge/data/properties_2016.csv',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Add a placeholder label column (all zeros) so the properties frame can be
# run through the data-processing pipeline. The values are dummies and carry
# no information — presumably the pipeline requires a 'logerror' column to be
# present; confirm against DataProcessPipeline.
# Note: this mutates df_properties in place.
df_properties['logerror'] = 0

In [20]:
# Load the sample submission file; it serves as the template whose columns
# are overwritten with predictions in the submission section.
# NOTE(review): absolute, machine-specific path.
df_submission = pd.read_csv(
    filepath_or_buffer='/Users/shuyangdu/Desktop/ZillowChallenge/data/sample_submission.csv',
)

In [22]:
# Sanity check: (rows, columns) of the submission template.
df_submission.shape

(2985217, 7)

# Train model

In [5]:
# Single pipeline instance shared by the training and prediction sections.
# NOTE(review): encode_mode='label' presumably selects label encoding for
# categorical columns — confirm in DataProcessPipeline.
data_pipeline = DataProcessPipeline(encode_mode='label')

In [6]:
# First pipeline stage on the merged training table.
df_train = data_pipeline.pre_process(df_all)

In [7]:
# Second pipeline stage, run in training mode (is_train=True).
# NOTE(review): df_train is rebound from itself, so re-running this cell on
# its own feeds already post-processed data back into post_process; re-run
# the pre_process cell first.
df_train = data_pipeline.post_process(df_train, is_train=True)

In [8]:
# Extract the model inputs and target via .values, using the feature and
# label column names exposed by the pipeline.
X_train, y_train = (
    df_train[data_pipeline.feature_cols].values,
    df_train[data_pipeline.label_col].values,
)

In [9]:
# Gradient-boosted regressor trained with the L1 objective.
# NOTE(review): the hyper-parameter values are hard-coded with no recorded
# rationale — presumably the result of an earlier tuning run; confirm.
model = LGBMRegressor(
    objective='regression_l1',
    n_estimators=50,
    num_leaves=55,
    learning_rate=0.05,
    feature_fraction=1.0,
    bagging_freq=6,
    bagging_fraction=0.7,
    lambda_l2=27,
)

In [10]:
# Fit on the training arrays built above. The call's return value is the
# cell's last expression, so it is what the notebook displays as output.
model.fit(X_train, y_train)

LGBMRegressor(bagging_fraction=0.7, bagging_freq=6, boosting_type='gbdt',
       colsample_bytree=1, feature_fraction=1.0, lambda_l2=27,
       learning_rate=0.05, max_bin=255, max_depth=-1, min_child_samples=10,
       min_child_weight=5, min_split_gain=0, n_estimators=50, nthread=-1,
       num_leaves=55, objective='regression_l1', reg_alpha=0, reg_lambda=0,
       seed=0, silent=True, subsample=1, subsample_for_bin=50000,
       subsample_freq=1)

# Make prediction

In [12]:
# Rename the raw properties columns using the pipeline's own mapping, so the
# frame uses the column names the pipeline works with (presumably this is
# what produces the 'id_parcel' column read further down — confirm).
# NOTE(review): inplace=True mutates the frame loaded in the data-loading
# section rather than producing a new, separately named one.
df_properties.rename(columns=data_pipeline.properties_rename_dict, inplace=True)

In [13]:
# First pipeline stage on the renamed properties frame (same call as used for
# the training data).
df_pred = data_pipeline.pre_process(df_properties)

In [14]:
# Second pipeline stage in prediction mode (is_train=False) — presumably this
# reuses whatever the pipeline learned during the training pass instead of
# refitting; confirm in DataProcessPipeline.
# NOTE(review): df_pred is rebound from itself, so re-running this cell alone
# applies post_process twice.
df_pred = data_pipeline.post_process(df_pred, is_train=False)

In [15]:
# Prediction inputs, selected with the same feature column list that was used
# to build X_train.
X_pred = df_pred[data_pipeline.feature_cols].values

In [36]:
# Score every row of the prediction inputs with the fitted model.
y_pred = model.predict(X_pred)

In [37]:
# Wrap the raw predictions in a one-column frame ('pred') indexed by parcel
# id so they can be joined onto the submission template.
# NOTE(review): this assumes the pipeline kept df_properties' rows in their
# original order; a changed row count would raise here, but a reordering
# would silently attach predictions to the wrong parcels — confirm.
# NOTE(review): y_pred is rebound from an array to a DataFrame.
y_pred = pd.DataFrame(
    data=y_pred,
    index=df_properties['id_parcel'],
    columns=['pred'],
)

# Create submission file

In [39]:
# Left-join the predictions onto the submission template: template rows are
# matched by their 'ParcelId' column against the index of y_pred, and rows
# without a match get a missing 'pred'.
df_merge = df_submission.merge(
    y_pred,
    how='left',
    left_on='ParcelId',
    right_index=True,
)

In [41]:
# Fill every column of the submission template after the first one
# (presumably the first column is the parcel key — confirm) with the same
# per-parcel prediction.
pred_per_parcel = df_merge['pred']

# Rows with no matching prediction come out of the left merge as missing
# values; stop here instead of writing an incomplete submission file.
n_unmatched = int(pred_per_parcel.isnull().sum())
assert n_unmatched == 0, (
    '%d submission rows have no matching prediction' % n_unmatched
)

# Assign by column label so each placeholder column is replaced outright.
# Writing floats through .iloc sets values into the existing column in place,
# which newer pandas versions warn about (and plan to reject) when the
# placeholder column holds integers. The Series is still aligned on the row
# index, exactly as before.
for period_col in df_submission.columns[1:]:
    df_submission[period_col] = pred_per_parcel

In [43]:
# Write the filled-in template to disk without the row index.
# NOTE(review): absolute, machine-specific path with a hard-coded date stamp.
df_submission.to_csv(
    path_or_buf='/Users/shuyangdu/Desktop/ZillowChallenge/submission/lightgbm_20170730.csv',
    index=False,
)