# Model Training

Let's take a look at our expected dataset and see if we can improve model performance

In [1]:
import json
import pickle
from pathlib import Path

import giskard
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pendulum as pdt
import xgboost as xgb
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

plt.style.use("ggplot")

In [2]:
DATA_DIR = Path("../data/")
TEST_DATA = DATA_DIR / "future_unseen_examples.csv"
TRAIN_DATA = DATA_DIR / "kc_house_data.csv"
DEMOGRAPHICS_DATA = DATA_DIR / "zipcode_demographics.csv"
INITIAL_MODEL_COLUMNS = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'sqft_above', 'sqft_basement', 'zipcode'
]
MODEL_PATH = Path("../model/model.pkl")
MODEL_FEATURES_PATH = Path("../model/model_features.json")
DAYS_PER_MONTH = 31  # an approximation to be sure
MONTH_PER_MONTH = 12  # a more accurate approximation

## Explore the Data

In [3]:
test_df = pd.read_csv(TEST_DATA)
train_df = pd.read_csv(TRAIN_DATA)
demographics_df = pd.read_csv(DEMOGRAPHICS_DATA)

train_df = train_df.merge(
    demographics_df, 
    how="left",
    on="zipcode"
)

In [4]:
# convert date to a datetime object
train_df['date'] = pd.to_datetime(train_df['date'], format='%Y%m%dT%H%M%S')

# break date out into year, month, and day columns
train_df['year'] = train_df['date'].dt.year
train_df['month_sin'] = np.sin(2*np.pi*((train_df['date'].dt.month - 1) / 12))
train_df['month_cos'] = np.cos(2*np.pi*((train_df['date'].dt.month - 1) / 12))
train_df['day_sin'] = np.sin(2*np.pi*((train_df['date'].dt.day - 1) / 31))
train_df['day_cos'] = np.cos(2*np.pi*((train_df['date'].dt.day - 1) / 31))

In [5]:
dropcols = ["id", "date", "zipcode"]
train_price = train_df.pop("price")
train_df = train_df.drop(dropcols, axis=1)

## split both datasets (given and new features)

In [68]:
idx = int(len(train_df) * 0.80)
indices = list(range(len(train_df)))
np.random.shuffle(indices)

X_train = train_df.iloc[indices[:idx], :]
X_test = train_df.iloc[indices[idx:], :]

y_train = train_price[indices[:idx]]
y_test = train_price[indices[idx:]]

X_train_merged_data = merged_data.iloc[indices[:idx], :]
X_test_merged_data = merged_data.iloc[indices[idx:], :]

y_train_merged_data = y[indices[:idx]]
y_test_merged_data = y[indices[idx:]]


## Check Initial Model

In [7]:
with open(MODEL_PATH, "rb") as fil:
    model = pickle.load(fil)

In [8]:
merged_data = pd.read_csv(TRAIN_DATA)[INITIAL_MODEL_COLUMNS].merge(
    demographics_df, 
    how="left",
    on="zipcode"
).drop(columns="zipcode")

y = merged_data.pop("price")
x = merged_data

In [46]:
y_preds_knn = model.predict(merged_data)

In [48]:
print(f"KNN{{2}} MAPE: {mean_absolute_percentage_error(y, y_preds_knn)*100:.4f}%")

KNN{2} MAPE: 15.0346%


In [73]:
y_preds_knn = model.predict(X_test_merged_data)

## LightGBM Model

In [69]:
early_stopping = lgb.callback.early_stopping(stopping_rounds=30)
lgbm_model = lgb.LGBMRegressor(n_jobs=8, n_estimators=1_000, objective="regression")

lgbm_model.fit(
    X_train,
    np.log(y_train),  # skewed dataset but normally distributed after log transform
    eval_set=[(X_test, np.log(y_test))],
    callbacks=[early_stopping]
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3354
[LightGBM] [Info] Number of data points in the train set: 17290, number of used features: 48
[LightGBM] [Info] Start training from score 13.050242
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[287]	valid_0's l2: 0.0267301


In [70]:
y_preds = np.exp(lgbm_model.predict(X_test))  # undo the log-transform above

# MAPE is nice for judging different scales of errors
mean_absolute_percentage_error(y_test, y_preds)

0.11780179213722279

## XGBoost

In [71]:
xgb_early_stopping = xgb.callback.EarlyStopping(rounds=30)
xgb_model = xgb.XGBRegressor(n_estimators=1000, n_jobs=8, callbacks=[xgb_early_stopping], objective="reg:squaredlogerror")
xgb_model.fit(
    X_train,
    np.log(y_train),
    eval_set=[(X_test, np.log(y_test))],
)

[0]	validation_0-rmsle:2.04756
[1]	validation_0-rmsle:1.86397
[2]	validation_0-rmsle:1.68566
[3]	validation_0-rmsle:1.51319
[4]	validation_0-rmsle:1.34722
[5]	validation_0-rmsle:1.18845
[6]	validation_0-rmsle:1.03765
[7]	validation_0-rmsle:0.89567
[8]	validation_0-rmsle:0.76337
[9]	validation_0-rmsle:0.64161
[10]	validation_0-rmsle:0.53119
[11]	validation_0-rmsle:0.43278
[12]	validation_0-rmsle:0.34682
[13]	validation_0-rmsle:0.27343
[14]	validation_0-rmsle:0.21237
[15]	validation_0-rmsle:0.16276
[16]	validation_0-rmsle:0.12355
[17]	validation_0-rmsle:0.09334
[18]	validation_0-rmsle:0.07043
[19]	validation_0-rmsle:0.05350
[20]	validation_0-rmsle:0.04125
[21]	validation_0-rmsle:0.03265
[22]	validation_0-rmsle:0.02666
[23]	validation_0-rmsle:0.02251


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


[24]	validation_0-rmsle:0.01978
[25]	validation_0-rmsle:0.01792
[26]	validation_0-rmsle:0.01662
[27]	validation_0-rmsle:0.01580
[28]	validation_0-rmsle:0.01519
[29]	validation_0-rmsle:0.01471
[30]	validation_0-rmsle:0.01437
[31]	validation_0-rmsle:0.01411
[32]	validation_0-rmsle:0.01388
[33]	validation_0-rmsle:0.01372
[34]	validation_0-rmsle:0.01357
[35]	validation_0-rmsle:0.01344
[36]	validation_0-rmsle:0.01336
[37]	validation_0-rmsle:0.01327
[38]	validation_0-rmsle:0.01320
[39]	validation_0-rmsle:0.01314
[40]	validation_0-rmsle:0.01308
[41]	validation_0-rmsle:0.01302
[42]	validation_0-rmsle:0.01296
[43]	validation_0-rmsle:0.01290
[44]	validation_0-rmsle:0.01287
[45]	validation_0-rmsle:0.01284
[46]	validation_0-rmsle:0.01279
[47]	validation_0-rmsle:0.01277
[48]	validation_0-rmsle:0.01273
[49]	validation_0-rmsle:0.01271
[50]	validation_0-rmsle:0.01265
[51]	validation_0-rmsle:0.01261
[52]	validation_0-rmsle:0.01259
[53]	validation_0-rmsle:0.01256
[54]	validation_0-rmsle:0.01252
[55]	val

In [72]:
y_preds_xgb = np.exp(xgb_model.predict(X_test))

mean_absolute_percentage_error(y_test, y_preds_xgb)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


0.1192010996566033

## Analyze Results

In [86]:
out_df = pd.DataFrame()
out_df["y_test"] = y_test
out_df["y_xgb"] = y_preds_xgb
out_df["y_lgb"] = y_preds
out_df["y_knn"] = y_preds_knn

# average the results from the two models
out_df["comb"] = out_df[["y_xgb", "y_lgb"]].mean(axis=1)

In [87]:
# ensure predictions are correlated with the labels
out_df.corr()

Unnamed: 0,y_test,y_xgb,y_lgb,y_knn,comb
y_test,1.0,0.940927,0.946828,0.907382,0.949234
y_xgb,0.940927,1.0,0.977606,0.937305,0.99428
y_lgb,0.946828,0.977606,1.0,0.93031,0.994491
y_knn,0.907382,0.937305,0.93031,1.0,0.939046
comb,0.949234,0.99428,0.994491,0.939046,1.0


In [88]:
print("MAPE %")
print(f"LightGBM: {mean_absolute_percentage_error(y_test, out_df['y_lgb'])*100:.4f}%")
print(f"XGBoost:  {mean_absolute_percentage_error(y_test, out_df['y_xgb'])*100:.4f}%")
print(f"Averaged: {mean_absolute_percentage_error(y_test, out_df['comb'])*100:.4f}%")

MAPE %
LightGBM: 11.7802%
XGBoost:  11.9201%
Averaged: 11.5506%
