# KC House Sales Predictive Models

In this notebook, we present 3 models for the King County house price prediction task.
- A CatBoost Model
- A Linear Regression Model
- A Neural Network Model (labelled "MLP" below; it stacks 1D-convolution layers in front of dense layers)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
print('Tensorflow version:', tf.__version__)

from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Conv1D, Input, Flatten, Dropout
from tensorflow.keras.losses import MeanAbsoluteError, MeanAbsolutePercentageError, MeanSquaredLogarithmicError, Huber
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from pickle import dump

# Seed NumPy's and TensorFlow's global random generators with a fixed value.
np.random.seed(7)
tf.random.set_seed(7)

%matplotlib inline

# Loading and preprocessing the data

In [None]:
# Load the house sales CSV into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')

In [None]:
# Names of the 16 dataset columns used as model inputs, in the order the models see them.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition',
    'grade', 'sqft_above', 'sqft_basement', 'yr_built',
    'yr_renovated', 'zipcode', 'lat', 'long',
]

In [None]:
# Inputs: an independent copy of the selected feature columns.
X = data.loc[:, feature_list].copy()
# Target: the sale price column.
Y = data['price']

In [None]:
# Fit the scaler on the training rows only, so the validation rows do not
# influence the mean/variance used for standardisation (avoids data leakage).
# train_test_split picks rows from the sample count and random_state alone, so
# using the same test_size/random_state as the split of X_scaled selects the
# same training rows here.
train_rows, _val_rows = train_test_split(np.arange(len(X)), test_size=0.2, random_state=7)
scaler = StandardScaler().fit(X.iloc[train_rows])

# Standardise every row with the training-set statistics.
X_scaled = scaler.transform(X)

# Persist the fitted scaler; the context manager guarantees the file is closed.
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the shuffle repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(X_scaled, Y, test_size=0.2, random_state=7)

## CatBoost Model

The following 3 cells run a grid search over CatBoost hyper-parameters. They are commented out because the search takes several minutes (~15 min); uncomment them to re-run it.
Otherwise, skip these cells and train the model directly with the best set of parameters.

In [None]:
#parameters={
#    'n_estimators' : [800,900,1000],
#    'learning_rate' : [0.05,0.1,0.15],
#    'depth' : [4,5,6],
#    'l2_leaf_reg' : [1,3,5]
#}
#scorer = make_scorer(r2_score)

In [None]:
#CBR_GS = CatBoostRegressor()
#grid = GridSearchCV(estimator=CBR_GS, param_grid = parameters, cv = 4, verbose=False, scoring=scorer)
#grid.fit(X_train, Y_train)

In [None]:
#print(" Results from Grid Search " )
#print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
#print("\n The best score across ALL searched params:\n", grid.best_score_)
#print("\n The best parameters across ALL searched params:\n", grid.best_params_)

**Training the CatBoost model with the tuned parameters**

In [None]:
# Build the regressor with the tuned hyper-parameters.
CBR_Best = CatBoostRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    depth=5,
    l2_leaf_reg=3,
)

# Fit on the training split, evaluating on the validation split as training proceeds.
CBR_Best_fit = CBR_Best.fit(
    X_train,
    Y_train,
    eval_set=(X_val, Y_val),
    plot=True,
    verbose=False,
)

In [None]:
# RMSE recorded during fitting for the validation and training sets.
# Labels and a legend are needed to tell the two curves apart.
plt.plot(CBR_Best_fit.evals_result_['validation']['RMSE'], label='Validation')
plt.plot(CBR_Best_fit.evals_result_['learn']['RMSE'], label='Train')
plt.title('CatBoost RMSE')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.legend();

In [None]:
# Horizontal bar chart: one bar per input feature, bar length = the fitted model's importance value.
importances = CBR_Best_fit.feature_importances_
plt.barh(feature_list, importances)

In [None]:
# Display the best scores stored on the fitted CatBoost model.
CBR_Best_fit.best_score_

## Linear Regression Model

In [None]:
# Ordinary least-squares baseline with scikit-learn's default settings.
regr = LinearRegression()

In [None]:
# Fit on the training split, then predict prices for the validation split.
pred = regr.fit(X_train, Y_train).predict(X_val)

In [None]:
# Validation R^2, computed two ways as a cross-check.
# r2_score takes (y_true, y_pred); the metric is not symmetric, so the ground
# truth must come first for the two values to agree.
regr.score(X_val, Y_val), r2_score(Y_val, pred)

In [None]:
# Validation RMSE: square root of the mean squared error between truth and prediction.
val_mse = mean_squared_error(Y_val, pred)
np.sqrt(val_mse)

In [None]:
# Predict on the training split; this rebinds `pred`, replacing the validation predictions.
pred = regr.predict(X_train)

In [None]:
# Training R^2, computed two ways as a cross-check.
# r2_score takes (y_true, y_pred); the metric is not symmetric, so the ground
# truth must come first for the two values to agree.
regr.score(X_train, Y_train), r2_score(Y_train, pred)

In [None]:
# Run this cell if you want to save the model.

#dump(regr, open('regr.pkl', 'wb'))

## MLP Model

In [None]:
# Reshape both splits to (samples, 16, 1): 16 feature positions with a single
# channel each, matching the Input(shape=(16, 1)) of the network.
X_train = np.reshape(X_train, (-1, 16, 1))
X_val = np.reshape(X_val, (-1, 16, 1))

In [None]:
# Display the shapes of both splits after reshaping.
X_train.shape, X_val.shape

In [None]:
# Clear Keras' global state and re-seed NumPy and TensorFlow before building the network.
tf.keras.backend.clear_session()
np.random.seed(7)
tf.random.set_seed(7)

In [None]:
# Network input: 16 feature positions with one channel each.
Model_input = Input(shape=(16,1))

# Convolutional stack with shrinking kernel sizes and filter counts;
# dropout follows the first two convolutions.
x = Conv1D(filters=128, kernel_size=7, activation='relu')(Model_input)
x = Dropout(0.3)(x)
x = Conv1D(filters=64, kernel_size=5, activation='relu')(x)
x = Dropout(0.3)(x)
x = Conv1D(filters=32, kernel_size=3, activation='relu')(x)

# Flatten the convolutional features and pass them through the dense head.
x = Flatten()(x)
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)
x = Dense(16, activation='relu')(x)

# Regression output: a single linear unit. A ReLU here has zero gradient
# whenever its pre-activation is negative, which can leave the output stuck
# at 0 and stall training.
x = Dense(1, activation='linear')(x)
MLP = Model(inputs=Model_input, outputs=x)

MLP.summary()

In [None]:
# Multiply the learning rate by 0.8 when val_loss has not improved for 10 epochs,
# with a 7-epoch cooldown between reductions and a floor of 1e-7.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=10, min_lr=1e-7, cooldown=7, verbose=1)

# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = Adam(learning_rate=0.001)

# Optimise mean absolute error and track RMSE as an additional metric.
MLP.compile(loss=MeanAbsoluteError(), optimizer=optimizer, metrics=[RootMeanSquaredError()])
history = MLP.fit(
    x=X_train,
    y=Y_train,
    batch_size=128,
    validation_data=(X_val, Y_val),
    callbacks=[reduce_lr],
    epochs=300,
    verbose=1,
)

In [None]:
# Epoch numbers starting at 1, used as the x-axis of both panels.
epoch_numbers = [epoch + 1 for epoch in history.epoch]

# Two side-by-side panels: training loss (MAE) on the left, RMSE on the right.
fig, (ax_loss, ax_rmse) = plt.subplots(1, 2, figsize=(20, 6))
panels = (
    (ax_loss, 'Loss (MAE)', 'loss', 'val_loss'),
    (ax_rmse, 'RMSE', 'root_mean_squared_error', 'val_root_mean_squared_error'),
)
for ax, panel_title, train_key, val_key in panels:
    ax.set_title(panel_title)
    ax.plot(epoch_numbers, history.history[train_key])
    ax.plot(epoch_numbers, history.history[val_key])
    # Show a tick on every 15th epoch to keep the axis readable.
    ax.set_xticks(epoch_numbers[::15])
    ax.legend(['Train', 'Validation'])

In [None]:
# Run this cell if you want to save the model.

#MLP.save('MLP_kc')

In [None]:
# Run this cell if you want to load the saved model.

#MLP = keras.models.load_model('MLP_kc')

In [None]:
# Evaluate the network on the training split (loss and the compiled metrics).
MLP.evaluate(X_train,Y_train)

In [None]:
# Evaluate the network on the validation split (loss and the compiled metrics).
MLP.evaluate(X_val,Y_val)

In [None]:
# R^2 of the network on the training and validation splits.
# r2_score takes (y_true, y_pred); the metric is not symmetric, so the ground
# truth must be passed first.
r2_score(Y_train, MLP.predict(X_train)), r2_score(Y_val, MLP.predict(X_val))

In [None]:
# Overlay actual training prices (default colour) and the network's predictions (red),
# both plotted against the sample's position in the split.
train_positions = range(len(Y_train))
train_predictions = MLP.predict(X_train)

plt.figure(figsize=(20, 10))
plt.scatter(train_positions, Y_train, alpha=0.5)
plt.scatter(train_positions, train_predictions, color='r', alpha=0.5)

In [None]:
# Overlay actual validation prices (default colour) and the network's predictions (red),
# both plotted against the sample's position in the split.
val_positions = range(len(Y_val))
val_predictions = MLP.predict(X_val)

plt.figure(figsize=(20, 10))
plt.scatter(val_positions, Y_val, alpha=0.5)
plt.scatter(val_positions, val_predictions, color='r', alpha=0.5)