In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from data_process.column_schema import (PROPERTIES_RENAME_DICT, TRANSACTION_RENAME_DICT, 
                                        NUMERICAL_COLS, CATEGORICAL_COLS)
from data_process.data_process_pipeline import DataProcessPipeline
from lightgbm import LGBMRegressor
from models.tree_models.lgbm import LGBM
from sklearn.metrics import mean_absolute_error
# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell executes, so edits to the project packages are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Draw matplotlib figures inline in the notebook, rendered at high ("retina")
# resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Load data

In [3]:
# Training table; it is fed to the processing pipeline further down to build
# X_train / y_train.
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine with this exact directory layout.
df_all = pd.read_csv(
    '/Users/shuyangdu/Desktop/ZillowChallenge/data/df_merged.csv'
)

In [4]:
# Full 2016 properties table; predictions are later made for every row in it.
# NOTE(review): the stored output of this cell is the tail of a warning. If it
# is pandas' mixed-dtype warning, pass explicit dtype= (or low_memory=False)
# to read_csv -- TODO confirm.
df_properties = pd.read_csv(
    '/Users/shuyangdu/Desktop/ZillowChallenge/data/properties_2016.csv'
)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# add fake y for data processing
# Adds a constant placeholder column named 'logerror' to df_properties
# (mutating it in place); the zeros are never used as real targets here.
# NOTE(review): presumably 'logerror' is data_pipeline.label_col and the
# pipeline requires that column to exist -- confirm.
df_properties['logerror'] = 0

In [6]:
# Submission template: a ParcelId column followed by one column per
# prediction month; the prediction columns are overwritten below.
df_submission = pd.read_csv(
    '/Users/shuyangdu/Desktop/ZillowChallenge/data/sample_submission.csv'
)

In [7]:
# Sanity check: display (n_rows, n_columns) of the submission template.
df_submission.shape

(2985217, 7)

# Process training data

In [8]:
# One pipeline instance is shared by the training data and, later, by the
# prediction data, so both go through the same processing object.
# NOTE(review): encode_mode='label' presumably selects label encoding for the
# categorical columns -- confirm in DataProcessPipeline.
data_pipeline = DataProcessPipeline(
    encode_mode='label'
)

In [9]:
# First pipeline stage on the training table; the result is bound to df_train
# and handed to post_process in the next cell.
df_train = data_pipeline.pre_process(df_all)

In [10]:
# Second pipeline stage, run in training mode (is_train=True).
# NOTE(review): this overwrites df_train with its own post-processed version,
# so re-running only this cell feeds already post-processed data back in;
# re-run the pre_process cell first.
df_train = data_pipeline.post_process(df_train, is_train=True)

In [11]:
# Pull the model inputs out of the processed frame as plain NumPy arrays:
# the pipeline's final feature columns and its label column.
X_train, y_train = (
    df_train[data_pipeline.final_feature_cols].values,
    df_train[data_pipeline.label_col].values,
)

# Train model

## LightGBM

In [1]:
# Hyper-parameters for the LightGBM model. The whole mapping is unpacked into
# the LGBM(...) constructor below and is also written to CSV at the end of the
# notebook, next to the submission file.
# 'regression_l1' is LightGBM's L1 (absolute-error) regression objective.
# NOTE(review): no random seed appears here, so the bagging / feature
# sub-sampling may differ between runs unless the wrapper sets one -- confirm.
params = dict(
    max_bin=80,
    learning_rate=0.0116,
    boosting_type='gbdt',
    objective='regression_l1',
    feature_fraction=0.94,
    bagging_fraction=0.85,
    bagging_freq=80,
    num_leaves=110,
    lambda_l2=86.9,
    n_estimators=450,
)

In [12]:
# Build the project's LGBM wrapper. It receives the pipeline's final feature
# names, the pipeline's categorical column list, and every entry of `params`
# as an additional keyword argument.
model = LGBM(
    feature_name=data_pipeline.final_feature_cols,
    categorical_feature=data_pipeline.categorical_cols,
    **params
)

In [13]:
# Train on the full processed training set; no hold-out split is made in this
# notebook.
model.fit(X_train, y_train)

# Make prediction

In [14]:
# rename df_properties
# Renames the raw properties columns in place using the pipeline's mapping.
# The prediction loop below reads the 'id_parcel' column, presumably one of
# the renamed columns -- confirm against properties_rename_dict.
# NOTE(review): inplace=True mutates the frame loaded earlier, so any earlier
# display of df_properties shows the old column names.
df_properties.rename(columns=data_pipeline.properties_rename_dict, inplace=True)

In [22]:
# only make prediction for 201610, 201611, 201612 since only month matters
# columns[0] is ParcelId, so columns[1:4] are the three 2016 month columns;
# the 2017 columns are filled by copying these in the next cell.
for col in df_submission.columns[1:4]:
    # print(...) with one argument emits the same text on Python 2 and is valid
    # syntax on Python 3, unlike the bare print statement.
    print('Predict for {}'.format(col))
    # add feature transaction_month based on prediction date
    # Work on a copy so the renamed properties table is reused unchanged for
    # the next month.
    df_pred = df_properties.copy()
    # Last two characters of the column name, e.g. '201610' -> '10'.
    # NOTE(review): this value is a string, not an int -- confirm it matches
    # the transaction_month representation the pipeline saw during training.
    df_pred['transaction_month'] = col[-2:]

    print('Start pre-processing...')
    # process data
    df_pred = data_pipeline.pre_process(df_pred)
    print('Pre-processing finished, start post-processing...')
    # is_train=False: prediction mode of the second pipeline stage.
    df_pred = data_pipeline.post_process(df_pred, is_train=False)

    X_pred = df_pred[data_pipeline.final_feature_cols].values
    y_pred = model.predict(X_pred)

    print('Line up predictions')
    # line up predictions
    # Predictions are labelled with the parcel ids of df_properties.
    # NOTE(review): this assumes the pipeline neither drops nor reorders rows,
    # so that row k of X_pred still belongs to row k of df_properties; a
    # changed row count raises here, a reordering would not -- confirm.
    y_pred = pd.DataFrame(y_pred, index=df_properties['id_parcel'], columns=['pred'])
    # Only ParcelId is needed as the join key; merging the single-column frame
    # avoids copying every submission column on each iteration. The left merge
    # keeps df_submission's rows and index, so the assignment below aligns.
    df_merge = pd.merge(df_submission[['ParcelId']], y_pred, how='left',
                        left_on='ParcelId', right_index=True)

    df_submission.loc[:, col] = df_merge['pred']
    print('{} finished'.format(col))

Predict for 201610
Start processing data...
Line up predictions
201610 finished
Predict for 201611
Start processing data...
Line up predictions
201611 finished
Predict for 201612
Start processing data...
Line up predictions
201612 finished
Predict for 201710
Start processing data...
Line up predictions
201710 finished
Predict for 201711
Start processing data...
Line up predictions
201711 finished
Predict for 201712
Start processing data...
Line up predictions
201712 finished


In [41]:
# copy 201610, 201611 and 201612 to 2017
# Relies on the positional layout of the submission frame: column 0 is
# ParcelId, columns 1-3 are the 2016 months, and columns from 4 onward are the
# 2017 months. Each of those takes the values of the column three places to
# its left.
for target_pos in range(4, df_submission.shape[1]):
    source_pos = target_pos - 3
    df_submission.iloc[:, target_pos] = df_submission.iloc[:, source_pos]

# Create submission file

In [23]:
# Write the filled-in submission without the DataFrame index column.
# NOTE(review): the stored execution counts show this cell was last run before
# the 2016 -> 2017 copy cell above it; re-run the notebook top to bottom
# before relying on the written file.
df_submission.to_csv(
    '/Users/shuyangdu/Desktop/ZillowChallenge/submission/lightgbm_20170806.csv',
    index=False
)

In [25]:
# Preview the first rows of the submission; each 2017 column should equal the
# 2016 column for the same month.
df_submission.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.011453,0.014164,0.016392,0.011453,0.014164,0.016392
1,10759547,0.017489,0.014622,0.015951,0.017489,0.014622,0.015951
2,10843547,0.063491,0.05936,0.057898,0.063491,0.05936,0.057898
3,10859147,0.037147,0.027718,0.028046,0.037147,0.027718,0.028046
4,10879947,0.064347,0.060387,0.062283,0.064347,0.060387,0.062283


In [29]:
# Store the hyper-parameters used for this submission as a one-row CSV
# (index=[0] turns the scalar-valued dict into a single row), written without
# the index column.
(
    pd.DataFrame(params, index=[0])
    .to_csv(
        '/Users/shuyangdu/Desktop/ZillowChallenge/submission/params_20170806.csv',
        index=False
    )
)