# CKME136 - Capstone Project - Toronto Real Estate Listings
## Step 4: Feature Engineering - with Scikit Learn

<div class="alert alert-block alert-info">
4A. Imports for predictions

In [70]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

<div class="alert alert-block alert-info">
4B. Read in cleaned dataset by using provided index and check output

In [27]:
df_all_features_dataset = pd.read_csv('cleaned_combined_listing_features_v1.csv', index_col=0)

In [28]:
list(df_all_features_dataset)

['listing_url',
 'listing_id',
 'listing_price_int',
 'comparable_sold_price_1_int',
 'comparable_list_price_1_int',
 'comparable_sold_price_2_int',
 'comparable_list_price_2_int',
 'comparable_sold_price_3_int',
 'comparable_list_price_3_int',
 'comparable_sold_price_4_int',
 'comparable_list_price_4_int',
 'comparable_sold_price_5_int',
 'comparable_list_price_5_int',
 'main_rooms',
 'additional_rooms',
 'total_rooms',
 'parking_spaces',
 'main_kitchen',
 'additional_kitchen',
 'total_kitchens',
 'comparable_sold_date_1_datetime',
 'comparable_sold_date_2_datetime',
 'comparable_sold_date_3_datetime',
 'comparable_sold_date_4_datetime',
 'comparable_sold_date_5_datetime',
 'extras_fridge',
 'extras_stove',
 'extras_stainless_steel',
 'extras_pool',
 'extras_gym',
 'extras_storage',
 'extras_locker',
 'extras_gas',
 'extras_dishwasher',
 'features_view',
 'features_park',
 'features_public_transit',
 'features_basementbrick',
 'features_concrete_exterior',
 'features_hospital',
 'feat

In [29]:
df_all_features_dataset.head()

Unnamed: 0,listing_url,listing_id,listing_price_int,comparable_sold_price_1_int,comparable_list_price_1_int,comparable_sold_price_2_int,comparable_list_price_2_int,comparable_sold_price_3_int,comparable_list_price_3_int,comparable_sold_price_4_int,...,SUNROOM,GREAT ROOM,LAUNDRY,SITTING,SOLARIUM,LOCKER,BEDROOM,BREAKFAST,MEDIA/ENT,UTILITY
0,https://toronto.listing.ca/168-bonis-ave-1211.E4349723.htm#15-1dr,4349723,899900,605000,625000,585000,588000,578000,593000,525000,...,False,False,True,False,False,True,False,True,False,False
1,https://toronto.listing.ca/3300-don-mills-rd-310.C4359682.htm#15-oy,4359682,425000,456000,460000,387000,399900,396000,408000,420000,...,False,False,True,False,False,True,False,False,False,False
2,https://toronto.listing.ca/65-east-liberty-st-627.C4353207.htm#15-16k,4353207,479900,588000,599900,555000,500000,496500,439900,525000,...,False,False,False,False,False,False,True,False,False,False
3,https://toronto.listing.ca/45-skylark-rd.W4341830.htm#15-1r3,4341830,949000,659000,699000,975000,929000,844000,849000,830000,...,True,False,True,False,False,False,False,False,False,False
4,https://toronto.listing.ca/88-north-bonnington-ave.E4252454.htm#15-2n1,4252454,680000,630000,649000,735000,719900,630000,649900,689000,...,False,False,False,False,False,False,False,False,False,False


<div class="alert alert-block alert-info">
4C. Generate training and test data sets

In [94]:
# values to predict
labels = np.array(df_all_features_dataset['listing_price_int'])
# Remove the labels from the features
features = df_all_features_dataset.drop('listing_price_int', axis = 1).drop('listing_url', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25)

<div class="alert alert-block alert-info">
4C. Create linear regression model

In [36]:
# create set of independent and dependent variables
x = df_all_features_dataset.iloc[:,3:]
y = df_all_features_dataset['listing_price_int']

In [80]:
# Train model
lm = linear_model.LinearRegression()
model = lm.fit(train_features, train_labels)

In [104]:
# Make some predictions
lm_predictions = model.predict(test_features)

number_of_predictions = 10

predictions_show = np.array((lm_predictions)[0:number_of_predictions]).astype(int)

actuals_show = np.array((test_labels)[0:number_of_predictions]).astype(int)

linear_model_compare = pd.DataFrame({'predictions': predictions_show, 'actuals': actuals_show})

linear_model_compare['absolute_error'] = abs(linear_model_compare.predictions - linear_model_compare.actuals)

linear_model_compare.head(number_of_predictions)

Unnamed: 0,actuals,predictions,absolute_error
0,689000,500241,188759
1,589000,469213,119787
2,557500,525938,31562
3,1069000,884292,184708
4,629000,600301,28699
5,679000,615075,63925
6,1549000,865728,683272
7,734850,892870,158020
8,495000,431140,63860
9,886000,901825,15825


In [105]:
print("Mean squared error: %.2f"
      % mean_squared_error(test_labels, lm_predictions))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(test_labels, lm_predictions))

print ('Linear model r^2 score: %.2f' % lm.score(train_features, train_labels))

Mean squared error: 283524852960.23
Variance score: 0.77
Linear model r^2 score: 0.65


<div class="alert alert-block alert-info">
4D. Create random forest regression model

In [106]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [None]:
# Train the model on training data
rf.fit(train_features, train_labels)

In [102]:
# Use the forest's predict method on the test data
rf_predictions = rf.predict(test_features)
# Calculate the absolute errors
rf_errors = abs(rf_predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(rf_errors), 2), 'dollars.')
# Mean Absolute Error: 3.83 degrees.

('Mean Absolute Error:', 234896.37, 'dollars.')


In [13]:
predictions[:5]

array([325214.268, 682819.044, 487892.605, 428239.996, 488103.05 ])

In [14]:
test_labels[:5]

array([269900, 725000, 519900, 399950, 488000])

In [103]:
print("Mean squared error: %.2f"
      % mean_squared_error(test_labels, rf_predictions))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(test_labels, rf_predictions))

Mean squared error: 296770771737.22
Variance score: 0.76
