# Predicting Home Sale Price

This will attempt to be similar to the Zestimate used on Zillow, the Redfin estimate used on Redfin, or any of the valuations provided by Collateral Analytics, CoreLogic, or Quantarium on Realtor.com.

I will create a regression model and use it to predict valuation and compare to the other valuations on currently listed homes.

In [1]:
%run src/imports.py

In [2]:
sns.set(rc={'figure.figsize':(16,4)})

In [3]:
neighborhoods_cleaned = pd.read_pickle('data/neighborhoods_cleaned.pkl')
neighborhoods_cleaned.reset_index(inplace=True)
neighborhoods_cleaned.drop(['index','DAYS ON MARKET','PROPERTY TYPE','ADDRESS','CITY','STATE OR PROVINCE'],axis=1,inplace=True)
#neighborhoods_cleaned = neighborhoods_cleaned.merge(pd.get_dummies(neighborhoods_cleaned['LOCATION'],drop_first=True), how='outer',left_index=True,right_index=True)
neighborhoods_cleaned.drop(['LOCATION'],axis=1,inplace=True)
neighborhoods_cleaned['SOLD DATE'] = pd.to_datetime(neighborhoods_cleaned['SOLD DATE'])
neighborhoods_cleaned['MONTH'] = pd.DatetimeIndex(neighborhoods_cleaned['SOLD DATE']).month
neighborhoods_cleaned['YEAR'] = pd.DatetimeIndex(neighborhoods_cleaned['SOLD DATE']).year
neighborhoods_cleaned.drop(['SOLD DATE'],axis=1,inplace=True)

In [4]:
neighborhoods_cleaned

Unnamed: 0,ZIP OR POSTAL CODE,PRICE,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,LATITUDE,LONGITUDE,ROOMS PER SQFT * 1000,MONTH,YEAR
0,78257,345000,1989.0,5662.0,2005.0,230.0,29.647801,-98.614186,2.011061,9,2020
1,78256,337500,2166.0,9583.0,2004.0,54.0,29.650304,-98.629082,2.770083,6,2020
2,78256,375500,2969.0,6605.0,2004.0,54.0,29.651842,-98.630650,2.189289,3,2021
3,78255,462000,2716.0,9147.0,2014.0,95.0,29.617650,-98.643430,2.577320,10,2018
4,78256,624000,3012.0,75794.0,2000.0,25.0,29.620646,-98.622295,1.826029,2,2021
...,...,...,...,...,...,...,...,...,...,...,...
20007,78222,153000,1577.0,6969.0,1983.0,,29.383410,-98.376520,3.170577,9,2019
20008,78222,189900,1703.0,4791.0,2018.0,25.0,29.386700,-98.379900,3.523194,7,2018
20009,78222,128000,1102.0,5662.0,1985.0,,29.383092,-98.376562,4.537205,3,2019
20010,78222,196499,2324.0,5662.0,2014.0,6.0,29.379346,-98.370269,3.012048,3,2019


In [5]:
neighborhoods_cleaned.dropna(inplace=True)

In [7]:
neighborhoods_cleaned

Unnamed: 0,ZIP OR POSTAL CODE,PRICE,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,LATITUDE,LONGITUDE,ROOMS PER SQFT * 1000,MONTH,YEAR
0,78257,345000,1989.0,5662.0,2005.0,230.0,29.647801,-98.614186,2.011061,9,2020
1,78256,337500,2166.0,9583.0,2004.0,54.0,29.650304,-98.629082,2.770083,6,2020
2,78256,375500,2969.0,6605.0,2004.0,54.0,29.651842,-98.630650,2.189289,3,2021
3,78255,462000,2716.0,9147.0,2014.0,95.0,29.617650,-98.643430,2.577320,10,2018
4,78256,624000,3012.0,75794.0,2000.0,25.0,29.620646,-98.622295,1.826029,2,2021
...,...,...,...,...,...,...,...,...,...,...,...
20002,78222,137000,2773.0,5227.0,2005.0,6.0,29.379318,-98.373976,2.163722,6,2018
20003,78222,165000,2085.0,8276.0,2014.0,7.0,29.388351,-98.370269,3.117506,1,2020
20006,78222,165900,1246.0,5662.0,2018.0,7.0,29.374355,-98.385532,4.815409,2,2019
20008,78222,189900,1703.0,4791.0,2018.0,25.0,29.386700,-98.379900,3.523194,7,2018


-----
Data Density  
= 11,654 / 11
= 1000
-----

## Building a Model

Smaller set:

In [8]:
df_small = neighborhoods_cleaned.sample(1000)

In [9]:
y = df_small['PRICE'].values
X = df_small.drop('PRICE',axis=1).values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [11]:
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))

score: 0.8745121606288391
MSE: 1.661042e+09


In [12]:
y_predict

array([522765.        , 188045.47      , 279743.51      , 303831.48      ,
       198873.95      , 385375.08      , 220758.69      , 458038.56      ,
       231361.02      , 482946.54      , 367788.25      , 232461.33      ,
       177143.5       , 387449.66      , 267629.17      , 356192.21      ,
       199617.38      , 314556.97      , 349288.35      , 408082.05      ,
       219777.94      , 237689.7       , 307787.89      , 216482.71      ,
       234794.36      , 551449.        , 232339.03      , 194256.47      ,
       218093.55      , 240331.        , 203289.2       , 226609.75      ,
       184862.67      , 330245.12      , 269110.17      , 242707.74      ,
       189377.96      , 207673.96      , 168901.1       , 195329.98      ,
       332753.14      , 233517.6       , 186974.71      , 378142.53      ,
       209502.98      , 703172.        , 342498.03      , 198296.12      ,
       191476.38      , 335292.99      , 210103.12      , 260810.3       ,
       205575.        , 2

In [13]:
y_test

array([ 390000,  146000,  246500,  295000,  205000,  333000,  214900,
        455500,  210000,  395000,  363900,  229000,  174900,  358990,
        335000,  420000,  214000,  222981,  335000,  460000,  196000,
        242900,  355000,  210888,  241900,  660000,  235000,  196000,
        205400,  235000,  185500,  250000,  162000,  378000,  266000,
        255000,  185000,  202000,  153000,  204000,  310000,  205000,
        201499,  365000,  219900,  850000,  343499,  208000,  170000,
        229000,  175000,  267000,  195000,  242000,  217500,  365000,
        523000,  190000,  296000,  192900,  205000,  205000,  192900,
        295000,  216900,  299800,  145000,  182000,  227500,  317900,
        190000,  175000,  520000,  177000,  268000,  234999,  152000,
        255000,  422500,  224990,  211000,  177500,  520000,  260900,
        179900,  188000,  221000,  204000,  254000,  190900,  174500,
        394000,  383600,  436000,  190000,  687000,  162000,  250000,
        218000,  579

In [14]:
search_space = [{'regressor__alpha': 10 ** (np.linspace(-5, 5, 10)), 
                 'regressor__l1_ratio': np.linspace(0,1,6)}]

# l_lasso / (l_lasso + l_ridge)
# if l_lasso = 0: 0 / 1 = 0 (this is ridge regression)
# if l_ridge = 0: l_lasso / l_lasso = 1 (this is LASSSO regression)

pipe = Pipeline([('scaler', StandardScaler()), ('regressor', ElasticNet())])

enet_cv = GridSearchCV(pipe, 
                      search_space, 
                      cv=KFold(10, shuffle=True), 
                      scoring='neg_mean_squared_error',
                      return_train_score = True)

In [15]:
enet_cv.fit(X_train,y_train)

enet_cv.best_params_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'regressor__alpha': 7742.636826811277, 'regressor__l1_ratio': 1.0}

In [16]:
y_predict = enet_cv.predict(X_test)
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))

MSE: 3.145272e+09


In [17]:
ridge = Ridge(alpha=1)
ridge.fit(X_train,y_train)
y_predict = ridge.predict(X_test)
print("score:", ridge.score(X_test, y_test))
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))

score: 0.7911878789305266
MSE: 2.763979e+09


In [18]:
X_train.shape

(750, 10)

In [19]:
y_train.shape

(750,)

In [20]:
lr = LinearRegression()
lr.fit(X_train,y_train)
y_predict = lr.predict(X_test)
print("score:", lr.score(X_test, y_test))
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))

score: 0.7919286504566754
MSE: 2.754173e+09


In [21]:
np.arange(-5,5,1)

array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4])

In [22]:
type(y_train)

numpy.ndarray

In [23]:
kf = KFold()
skf = StratifiedKFold()
ridge_pipe = Pipeline([('scaler', StandardScaler()), ('regressor', Ridge())])
ridge_grid_params = {'regressor__alpha': np.arange(0,50,1)}
ridge_grid = GridSearchCV(ridge_pipe, ridge_grid_params, cv=kf, scoring = 'neg_mean_squared_error')
ridge_grid.fit(X_train, y_train)
print("Ridge Accuracy: {:.2e}".format(ridge_grid.score(X_test, y_test)))

Ridge Accuracy: -2.76e+09


In [24]:
kf = KFold()
skf = StratifiedKFold()
lasso_pipe = Pipeline([('scaler', StandardScaler()), ('regressor', Lasso())])
lasso_grid_params = {'regressor__alpha': np.arange(0.1,0.9,0.1)}
lasso_grid = GridSearchCV(lasso_pipe, lasso_grid_params, cv=kf, scoring = 'neg_mean_squared_error')
lasso_grid.fit(X_train, y_train)
print("Lasso Accuracy: {:.2e}".format(lasso_grid.score(X_test, y_test)))

Lasso Accuracy: -2.75e+09


In [32]:
kf = KFold()
skf = StratifiedKFold()
gbr_pipe = Pipeline([('scaler', StandardScaler()), ('regressor', GradientBoostingRegressor(loss='huber'))])
gbr_grid_params = {'regressor__alpha': np.arange(0.1,0.9,0.2),
                  'regressor__max_depth': np.arange(1,9,2),
                  'regressor__learning_rate': np.arange(0.01,0.2,0.03)}
gbr_grid = GridSearchCV(gbr_pipe, gbr_grid_params, cv=kf, scoring = 'neg_mean_squared_error')
gbr_grid.fit(X_train, y_train)
print("gbr Accuracy: {:.2e}".format(gbr_grid.score(X_test, y_test)))

gbr Accuracy: -1.66e+09


In [27]:
gbr_grid.best_params_

{'regressor__alpha': 0.4,
 'regressor__learning_rate': 0.16000000000000003,
 'regressor__max_depth': 4}

In [26]:
kf = KFold()
skf = StratifiedKFold()
gbr_pipe2 = Pipeline([('scaler', StandardScaler()), ('regressor', GradientBoostingRegressor(loss='huber'))])
gbr_grid_params2 = {'regressor__alpha': np.arange(0.1,0.9,0.3)}
gbr_grid2 = GridSearchCV(gbr_pipe2, gbr_grid_params2, cv=kf, scoring = 'neg_mean_squared_error')
gbr_grid2.fit(X_train, y_train)
print("gbr Accuracy: {:.2e}".format(gbr_grid.score(X_test, y_test)))

gbr Accuracy: -1.61e+09


In [30]:
gbr_grid2.best_params_

{'regressor__alpha': 0.7000000000000001}

In [183]:
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [184]:
np.arange(0,10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [185]:
X_df = pd.DataFrame(X)

In [186]:
X_df.corr() > 0.2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,True,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,True,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,True,False,False
4,False,False,False,False,True,False,False,False,False,False
5,False,True,False,False,False,True,False,False,False,False
6,False,False,False,False,False,False,True,False,False,False
7,False,False,False,True,False,False,False,True,False,False
8,False,False,False,False,False,False,False,False,True,False
9,False,False,False,False,False,False,False,False,False,True


In [199]:
X_train = sm.add_constant(X_train)

In [200]:
X_opt = X_train[:,np.arange(0,10)]
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.702
Model:,OLS,Adj. R-squared:,0.702
Method:,Least Squares,F-statistic:,2286.0
Date:,"Wed, 24 Mar 2021",Prob (F-statistic):,0.0
Time:,22:03:31,Log-Likelihood:,-108740.0
No. Observations:,8740,AIC:,217500.0
Df Residuals:,8730,BIC:,217600.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.401e+06,9.26e+05,1.513,0.130,-4.14e+05,3.22e+06
x1,-90.6304,11.428,-7.930,0.000,-113.033,-68.228
x2,105.7373,1.427,74.093,0.000,102.940,108.535
x3,0.0002,0.000,0.920,0.357,-0.000,0.001
x4,871.5939,77.830,11.199,0.000,719.028,1024.160
x5,1055.0176,21.998,47.960,0.000,1011.896,1098.139
x6,8.278e+04,6999.336,11.827,0.000,6.91e+04,9.65e+04
x7,-1.451e+04,4057.741,-3.576,0.000,-2.25e+04,-6556.809
x8,1.668e+04,1847.358,9.030,0.000,1.31e+04,2.03e+04

0,1,2,3
Omnibus:,6365.773,Durbin-Watson:,1.956
Prob(Omnibus):,0.0,Jarque-Bera (JB):,719308.317
Skew:,2.697,Prob(JB):,0.0
Kurtosis:,47.115,Cond. No.,5070000000.0


In [202]:
X_opt = X_train[:,[0,1,2,4,5,6,7,8,9,10]]

In [203]:
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.713
Model:,OLS,Adj. R-squared:,0.713
Method:,Least Squares,F-statistic:,2409.0
Date:,"Wed, 24 Mar 2021",Prob (F-statistic):,0.0
Time:,22:03:59,Log-Likelihood:,-108570.0
No. Observations:,8740,AIC:,217200.0
Df Residuals:,8730,BIC:,217200.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.625e+07,1.77e+06,-14.793,0.000,-2.97e+07,-2.28e+07
x1,-94.1917,11.221,-8.394,0.000,-116.188,-72.196
x2,108.0740,1.407,76.822,0.000,105.316,110.832
x3,831.0016,76.436,10.872,0.000,681.169,980.834
x4,1033.5446,21.627,47.789,0.000,991.150,1075.939
x5,8.316e+04,6871.368,12.103,0.000,6.97e+04,9.66e+04
x6,-1.834e+04,3989.132,-4.597,0.000,-2.62e+04,-1.05e+04
x7,1.941e+04,1819.451,10.669,0.000,1.58e+04,2.3e+04
x8,1133.9101,205.944,5.506,0.000,730.212,1537.608

0,1,2,3
Omnibus:,6564.327,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,786991.236
Skew:,2.821,Prob(JB):,0.0
Kurtosis:,49.144,Cond. No.,216000000.0


In [205]:
y_predict = regressor_OLS.predict(X_test)

In [208]:
print('MSE: {:.2e}'.format(mean_squared_error(y_test, y_predict)))

MSE: 4.21e+24
