# Houses and Empty Lots for Sale in New Brunswick (as of June 12, 2020)

In this project, I scraped data from this [website](https://www.point2homes.com/CA/Real-Estate-Listings/NB.html) which lists houses and 
empty lots for sale. The selling price is listed along with the lot size. For houses, the number of bedrooms, bathrooms, the house size the house type are also given.

## 1.2 Modelling

In [1]:
# load libraries
import numpy as np
import pandas as pd
import re
from tqdm.notebook import tqdm
from plotnine import *

train = pd.read_csv('train_cleaned.csv')
train.head(5)

Unnamed: 0,Beds,Baths,House Size (sqft),Lot Size (ac),Type,Price,Postal,Missing Beds,Missing Baths,Rooms,Missing House,Missing Land
0,3.0,2.0,1575.0,0.68,Residential,43000,E4,1,1,5.0,1,1
1,3.0,2.0,10000.0,0.68,Residential,899900,E2,1,1,5.0,0,1
2,3.0,2.0,1575.0,1.02,Residential,75000,E4,1,1,5.0,1,0
3,0.0,0.0,0.0,1.8,Empty Land,99000,E4,0,0,0.0,0,1
4,3.0,2.0,1245.0,0.68,Residential,94900,E2,0,0,5.0,0,1


In [2]:
# convert Price to thousands
train['Price'] = round(train['Price']/1000)
train = train.rename(columns={'Price':'Price (thousands)'})
train.head(5)

Unnamed: 0,Beds,Baths,House Size (sqft),Lot Size (ac),Type,Price (thousands),Postal,Missing Beds,Missing Baths,Rooms,Missing House,Missing Land
0,3.0,2.0,1575.0,0.68,Residential,43.0,E4,1,1,5.0,1,1
1,3.0,2.0,10000.0,0.68,Residential,900.0,E2,1,1,5.0,0,1
2,3.0,2.0,1575.0,1.02,Residential,75.0,E4,1,1,5.0,1,0
3,0.0,0.0,0.0,1.8,Empty Land,99.0,E4,0,0,0.0,0,1
4,3.0,2.0,1245.0,0.68,Residential,95.0,E2,0,0,5.0,0,1


In [3]:
train.loc[:,train.columns != 'Price (thousands)']

Unnamed: 0,Beds,Baths,House Size (sqft),Lot Size (ac),Type,Postal,Missing Beds,Missing Baths,Rooms,Missing House,Missing Land
0,3.0,2.0,1575.0,0.68,Residential,E4,1,1,5.0,1,1
1,3.0,2.0,10000.0,0.68,Residential,E2,1,1,5.0,0,1
2,3.0,2.0,1575.0,1.02,Residential,E4,1,1,5.0,1,0
3,0.0,0.0,0.0,1.80,Empty Land,E4,0,0,0.0,0,1
4,3.0,2.0,1245.0,0.68,Residential,E2,0,0,5.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
532,3.0,2.0,1826.0,0.68,Residential,E4,0,0,5.0,0,1
533,4.0,1.0,1575.0,0.68,Residential,E4,0,0,5.0,1,1
534,5.0,4.0,3641.0,0.68,Residential,E2,0,0,9.0,0,1
535,0.0,0.0,0.0,1.80,Empty Land,E1,0,0,0.0,0,1


In [4]:
# perform one-hot encoding on type and postal
from sklearn.preprocessing import OneHotEncoder

cat_features = ['Type', 'Postal']

X_train = train.loc[:,train.columns != 'Price (thousands)']
y_train = train['Price (thousands)']

OneHot = OneHotEncoder(handle_unknown='ignore',sparse=False)
cat_train = pd.DataFrame(OneHot.fit_transform(X_train[cat_features]))
cat_train.columns = OneHot.get_feature_names(['Type','Postal'])

cat_train.index = X_train.index

noncat_X_train = X_train.drop(cat_features, axis = 1)

OneHot_X_train = pd.concat([noncat_X_train, cat_train], axis = 1)
feat_labels = OneHot_X_train.columns
OneHot_X_train

Unnamed: 0,Beds,Baths,House Size (sqft),Lot Size (ac),Missing Beds,Missing Baths,Rooms,Missing House,Missing Land,Type_Empty Land,Type_Residential,Postal_E1,Postal_E2,Postal_E3,Postal_E4,Postal_E5,Postal_E6,Postal_E7,Postal_E8,Postal_E9
0,3.0,2.0,1575.0,0.68,1,1,5.0,1,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,3.0,2.0,10000.0,0.68,1,1,5.0,0,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,2.0,1575.0,1.02,1,1,5.0,1,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.80,0,0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,3.0,2.0,1245.0,0.68,0,0,5.0,0,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,3.0,2.0,1826.0,0.68,0,0,5.0,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
533,4.0,1.0,1575.0,0.68,0,0,5.0,1,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
534,5.0,4.0,3641.0,0.68,0,0,9.0,0,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535,0.0,0.0,0.0,1.80,0,0,0.0,0,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 100,n_jobs=-1)
model.fit(OneHot_X_train,y_train)
model.score(OneHot_X_train,y_train)

0.90640529496734

In [6]:
model.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)>

In [8]:
print(list(zip(feat_labels,model.feature_importances_)))

[('Beds', 0.01884042439186936), ('Baths', 0.3558961970338276), ('House Size (sqft)', 0.31775624967605787), ('Lot Size (ac)', 0.13536963503257718), ('Missing Beds', 0.021293904172612333), ('Missing Baths', 0.009905582688098576), ('Rooms', 0.04808962076122056), ('Missing House', 0.008132510559934168), ('Missing Land', 0.011777870822389519), ('Type_Empty Land', 0.000758211555446535), ('Type_Residential', 0.0004989975434257143), ('Postal_E1', 0.01137254756892778), ('Postal_E2', 0.010626134613413236), ('Postal_E3', 0.014091192209847786), ('Postal_E4', 0.009918050999412295), ('Postal_E5', 0.016558484819134366), ('Postal_E6', 0.0033086125774683182), ('Postal_E7', 0.0018243099639265183), ('Postal_E8', 0.0035124555138234003), ('Postal_E9', 0.0004690074965871145)]


In [19]:
new_df = OneHot_X_train[['Baths','House Size (sqft)','Lot Size (ac)']]
model = RandomForestRegressor(n_estimators = 100,n_jobs=-1)
model.fit(new_df,y_train)
model.score(new_df,y_train)

0.8652348381051568

In [20]:
val = pd.read_csv('val_cleaned.csv')
val.head(5)

Unnamed: 0,Beds,Baths,House Size (sqft),Lot Size (ac),Type,Price,Postal,Missing Beds,Missing Baths,Rooms,Missing House,Missing Land
0,3.0,1.0,1512.0,1.16,Residential,99000,E3,0,0,4.0,0,1
1,3.0,2.0,1512.0,1.16,Residential,189000,E7,0,0,5.0,1,1
2,0.0,0.0,0.0,1.0,Empty Land,16700,E6,0,0,0.0,0,0
3,3.0,1.0,1488.0,1.16,Residential,255000,E4,0,0,4.0,0,1
4,2.0,1.0,1512.0,1.16,Residential,115000,E3,0,0,3.0,1,1


In [21]:
# convert Price to thousands
val['Price'] = round(val['Price']/1000)
val = val.rename(columns={'Price':'Price (thousands)'})
val.head(5)

Unnamed: 0,Beds,Baths,House Size (sqft),Lot Size (ac),Type,Price (thousands),Postal,Missing Beds,Missing Baths,Rooms,Missing House,Missing Land
0,3.0,1.0,1512.0,1.16,Residential,99.0,E3,0,0,4.0,0,1
1,3.0,2.0,1512.0,1.16,Residential,189.0,E7,0,0,5.0,1,1
2,0.0,0.0,0.0,1.0,Empty Land,17.0,E6,0,0,0.0,0,0
3,3.0,1.0,1488.0,1.16,Residential,255.0,E4,0,0,4.0,0,1
4,2.0,1.0,1512.0,1.16,Residential,115.0,E3,0,0,3.0,1,1


In [22]:
cat_features = ['Type', 'Postal']

X_val = val.loc[:,val.columns != 'Price (thousands)']
y_val = val['Price (thousands)']

OneHot = OneHotEncoder(handle_unknown='ignore',sparse=False)
cat_val = pd.DataFrame(OneHot.fit_transform(X_val[cat_features]))
cat_val.columns = OneHot.get_feature_names(['Type','Postal'])

cat_val.index = X_val.index

noncat_X_val = X_val.drop(cat_features, axis = 1)

OneHot_X_val = pd.concat([noncat_X_val, cat_val], axis = 1)
OneHot_X_val

Unnamed: 0,Beds,Baths,House Size (sqft),Lot Size (ac),Missing Beds,Missing Baths,Rooms,Missing House,Missing Land,Type_Empty Land,Type_Residential,Postal_E1,Postal_E2,Postal_E3,Postal_E4,Postal_E5,Postal_E6,Postal_E7,Postal_E8,Postal_E9
0,3.0,1.0,1512.0,1.16,0,0,4.0,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,2.0,1512.0,1.16,0,0,5.0,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.00,0,0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3.0,1.0,1488.0,1.16,0,0,4.0,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2.0,1.0,1512.0,1.16,0,0,3.0,1,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,0.0,0.0,0.0,1.30,0,0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
91,0.0,0.0,0.0,4.72,0,0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
92,5.0,2.0,1872.0,1.16,0,0,7.0,0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,0.0,0.0,0.0,1.50,0,0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [23]:
y_pred = model.predict(OneHot_X_val[['Baths','House Size (sqft)', 'Lot Size (ac)']])
y_pred

array([ 93.98965228, 278.52      ,  56.58753276,  93.39745607,
        93.98965228, 278.52      ,  29.75      ,  29.75      ,
       122.17      ,  93.98965228, 175.71      , 169.84      ,
       372.41033384, 203.19      ,  25.25      , 169.84      ,
       227.51      , 289.9       ,  24.73833333, 236.82      ,
       173.72      , 265.63      , 125.35      , 278.52      ,
        29.75      , 204.8       , 154.66      ,  93.98965228,
       360.15      ,  93.98965228, 113.87333333,  29.75      ,
       278.52      ,  29.75      , 240.32      , 266.7       ,
       783.9215    , 278.52      , 234.36      , 142.82      ,
        29.75      , 278.52      , 278.52      , 298.79      ,
       257.24833333, 142.82      , 137.148     ,  29.75      ,
       171.676     , 276.44      ,  93.98965228,  22.85      ,
        29.75      , 125.06      ,  93.98965228,  29.75      ,
        29.75      , 240.32      , 234.12      ,  93.98965228,
       295.05      , 153.72833333,  93.98965228,  87.81

In [24]:
y_val

0      99.0
1     189.0
2      17.0
3     255.0
4     115.0
      ...  
90     25.0
91     25.0
92    190.0
93     10.0
94    160.0
Name: Price (thousands), Length: 95, dtype: float64

In [25]:
# accuracy on test set
model.score(OneHot_X_val[['Baths','House Size (sqft)', 'Lot Size (ac)']],y_val)

0.24619975164668662