# Modeling and Hyperparameter Tuning

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
import joblib
import time
from sqlalchemy import create_engine
import psycopg2
import io
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping

## Read the data
We have json data from RentalBeast, and the csv cleaned data from the Bridge API

In [13]:
import json

with open("listing_full.json") as infile:
    d = json.load(infile)
l = d['data']
l2 = [{
    "bedrooms":float(r['bedrooms']),
    "bathrooms":float(r['bathrooms']),
    "sqft":float(r['square_footage']),
    "latitude":float(r['latitude']),
    "longitude":float(r['longitude']),
    "heating":1 if r['heat']=="Yes" else 0,
    "cooling":1 if r['air_conditioning']=="Yes" else 0,
    "price":int(r['rent'])
} for r in l]

df01 = pd.read_csv("cleaned_BridgeAPI_data.csv").dropna()
df02 = pd.DataFrame(l2)



## Data Prep for modeling
Apply standard scaling, and then split the data

In [3]:
X = df02[['bedrooms','bathrooms','sqft','latitude','longitude']]#,'heating','cooling']]
y = df02.price

ss1 = StandardScaler()
X_scaled = ss1.fit_transform(X)
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y)

## Modeling

**Neural Network**

In [17]:
# create model
model = Sequential()

# get input shape
n_cols = X.shape[1]

# add layers
model.add(Dense(32, activation='relu', input_shape=(n_cols,)))
model.add(Dense(32))
model.add(Dense(1))

# compile
model.compile(optimizer='adam', loss='mean_absolute_error')

# add early stopping
early_stopping_monitor = EarlyStopping(patience=3)

# fit
model.fit(X_scaled, y, validation_split=0.2, epochs=30, callbacks=[early_stopping_monitor])
print(mean_absolute_error(model.predict(X_scaled),y))

Train on 5101 samples, validate on 1276 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
262.61114910640777


**K-Nearest Neighbor**

In [5]:
knn_params = {
    'n_neighbors':[2,3,4,5,6,7,8,9],
    'weights':['uniform','distance'],
    'metric':['euclidean','manhattan']
}
knn_search = GridSearchCV(KNeighborsRegressor(), knn_params, cv=5)
knn_search.fit(X_train,y_train)
knn_final = knn_search.best_estimator_
print(mean_absolute_error(knn_final.predict(X_test),y_test))

94.37824561534045


Save the model

In [6]:
joblib.dump(knn_final,"knn_final.joblib")

['knn_final.joblib']

**Random Forest Regressor**

In [7]:
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 250, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

rfr_params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rfr_search = RandomizedSearchCV(RandomForestRegressor(),rfr_params,cv=5)
rfr_search.fit(X_train,y_train)
rfr_final = rfr_search.best_estimator_
print(mean_absolute_error(rfr_final.predict(X_test),y_test))

75.54531998941498


Save the model

In [8]:
joblib.dump(rfr_final,"rfr_final.joblib")

['rfr_final.joblib']

**Boosted Trees**

We will re-use some tuning parameters from the random forest

In [9]:
gbr_params = {
    'n_estimators':n_estimators,
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_depth':max_depth,
    'min_samples_leaf':min_samples_leaf,
    'max_features':[0.1,0.3,1.0]
}

gbr_search = RandomizedSearchCV(GradientBoostingRegressor(),gbr_params,cv=5)
gbr_search.fit(X_train,y_train)
gbr_final = gbr_search.best_estimator_
print(mean_absolute_error(gbr_final.predict(X_test),y_test))

84.8363069899642


Save the model

In [10]:
joblib.dump(gbr_final,"gbr_final.joblib")

['gbr_final.joblib']

## Run inference using the best model on the for-sale dataset

In [14]:
df001 = df01[['BedroomsTotal','BathroomsTotal','LivingArea','latitude','Longitude']]
rfr_final.predict(ss1.transform(df001))
df01['RentValue'] = rfr_final.predict(ss1.transform(df001))
df01.head()

Unnamed: 0,ADDRESS,City,PostalCode,ListingId,MlsStatus,ListingContractDate,ListPrice,LivingArea,LotSizeArea,BedroomsTotal,...,FireplacesTotal,AnnualTaxAmount,PropertySubType,Levels,Waterfront,YearBuilt,latitude,Longitude,UUID,RentValue
0,"3703 Laurel Ledge LN, Austin TX 78731",Austin,78731.0,4744208,Sold,2014-04-05,800000,5301.0,0.367,5.0,...,1.0,18810.0,Single Family Residence,Two,False,1973.0,30.346987,-97.762045,19a68e23-159b-42ae-be93-2e874068f61a,9023.103333
1,"6113 Highlandale DR, Austin TX 78731",Austin,78731.0,2988324,Sold,2016-04-09,500000,2256.0,0.413,3.0,...,1.0,11434.0,Single Family Residence,Two,False,1977.0,30.347782,-97.762706,c1866e36-5d13-42d5-bf7a-cb1ef8b38f35,3646.008333
2,"3711 Laurel Ledge LN, Austin TX 78731",Austin,78731.0,3789662,Sold,2018-05-25,700000,3076.0,0.281,5.0,...,1.0,15043.0,Single Family Residence,Two,False,1973.0,30.347944,-97.762363,4ec78ff5-0847-44b9-ad9a-223d06a9e556,5007.226667
3,"3702 Laurel Ledge LN, Austin TX 78731",Austin,78731.0,4345115,Sold,2006-02-07,300000,2102.0,0.0,4.0,...,1.0,7239.0,Single Family Residence,Two,False,1977.0,30.34733,-97.761688,3285779f-a40a-4314-b17b-279c527b9646,4338.935
5,"6105 Mountainclimb DR, Austin TX 78731",Austin,78731.0,5444450,Sold,2008-12-28,400000,2103.0,0.271,3.0,...,1.0,8500.96,Single Family Residence,One,False,1965.0,30.347051,-97.763738,47f7c7bf-f912-4503-b747-4bc06687cd49,3655.226667
