# Predicting Home Sale Price

This will attempt to be similar to the Zestimate used on Zillow, the Redfin estimate used on Redfin, or any of the valuations provided by Collateral Analytics, CoreLogic, or Quantarium on Realtor.com.

I will create a regression model and use it to predict valuation and compare to the other valuations on currently listed homes.

In [1]:
%run src/imports.py
from src.helpers import calculate_five_percent, calculate_ten_percent

In [2]:
sns.set(rc={'figure.figsize':(16,4)})

In [3]:
modeldata = pd.read_pickle('data/modeldata.pkl')

-----
Data Density  
= 11,654 / 11  
= 1059
-----
With smaller dataset  
= 1,000 / 11  
= 91
----

## Building a Model

Smaller set:

In [8]:
df_small = modeldata.sample(1000)
y = df_small['PRICE'].values
X = df_small.drop('PRICE',axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [9]:
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))

score: 0.8120338496848534
MSE: 3.712353e+09


In [10]:
calculate_five_percent(y_predict,y_test)

34.4

In [11]:
calculate_ten_percent(y_predict,y_test)

55.60000000000001

In [12]:
search_space = [{'regressor__alpha': 10 ** (np.linspace(-5, 5, 10)), 
                 'regressor__l1_ratio': np.linspace(0,1,6)}]

# l_lasso / (l_lasso + l_ridge)
# if l_lasso = 0: 0 / 1 = 0 (this is ridge regression)
# if l_ridge = 0: l_lasso / l_lasso = 1 (this is LASSSO regression)

pipe = Pipeline([('scaler', StandardScaler()), ('regressor', ElasticNet())])

enet_cv = GridSearchCV(pipe, 
                      search_space, 
                      cv=KFold(10, shuffle=True), 
                      scoring='neg_mean_squared_error',
                      return_train_score = True)

In [13]:
enet_cv.fit(X_train,y_train)

enet_cv.best_params_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'regressor__alpha': 1e-05, 'regressor__l1_ratio': 1.0}

In [14]:
y_predict = enet_cv.predict(X_test)
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))

MSE: 8.452559e+09


In [15]:
calculate_five_percent(y_predict,y_test)

10.8

In [16]:
calculate_ten_percent(y_predict,y_test)

27.200000000000003

In [17]:
ridge = Ridge(alpha=1)
ridge.fit(X_train,y_train)
y_predict = ridge.predict(X_test)
print("score:", ridge.score(X_test, y_test))
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))

score: 0.5721873723338067
MSE: 8.449348e+09


In [18]:
print(calculate_five_percent(y_predict,y_test))
print(calculate_ten_percent(y_predict,y_test))

10.4
26.8


In [20]:
lr = LinearRegression()
lr.fit(X_train,y_train)
y_predict = lr.predict(X_test)
print("score:", lr.score(X_test, y_test))
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))

score: 0.5720247858690821
MSE: 8.452559e+09


In [21]:
print(calculate_five_percent(y_predict,y_test))
print(calculate_ten_percent(y_predict,y_test))

10.8
27.200000000000003


In [22]:
kf = KFold()
skf = StratifiedKFold()
ridge_pipe = Pipeline([('scaler', StandardScaler()), ('regressor', Ridge())])
ridge_grid_params = {'regressor__alpha': np.arange(0,50,1)}
ridge_grid = GridSearchCV(ridge_pipe, ridge_grid_params, cv=kf, scoring = 'neg_mean_squared_error')
ridge_grid.fit(X_train, y_train)
y_predict = ridge_grid.predict(X_test)
print("Ridge Accuracy: {:.2e}".format(ridge_grid.score(X_test, y_test)))
print(calculate_five_percent(y_predict,y_test))
print(calculate_ten_percent(y_predict,y_test))

Ridge Accuracy: -8.45e+09
10.8
27.200000000000003


In [23]:
kf = KFold()
skf = StratifiedKFold()
lasso_pipe = Pipeline([('scaler', StandardScaler()), ('regressor', Lasso())])
lasso_grid_params = {'regressor__alpha': np.arange(0.1,0.9,0.1)}
lasso_grid = GridSearchCV(lasso_pipe, lasso_grid_params, cv=kf, scoring = 'neg_mean_squared_error')
lasso_grid.fit(X_train, y_train)
y_predict = lasso_grid.predict(X_test)
print("Lasso Accuracy: {:.2e}".format(lasso_grid.score(X_test, y_test)))
print(calculate_five_percent(y_predict,y_test))
print(calculate_ten_percent(y_predict,y_test))

Lasso Accuracy: -8.45e+09
10.8
27.200000000000003


In [24]:
kf = KFold()
skf = StratifiedKFold()
gbr_pipe = Pipeline([('scaler', StandardScaler()), ('regressor', GradientBoostingRegressor(loss='huber'))])
gbr_grid_params = {'regressor__alpha': np.arange(0.1,0.9,0.2),
                  'regressor__max_depth': np.arange(1,9,2),
                  'regressor__learning_rate': np.arange(0.01,0.2,0.03)}
gbr_grid = GridSearchCV(gbr_pipe, gbr_grid_params, cv=kf, scoring = 'neg_mean_squared_error')
gbr_grid.fit(X_train, y_train)
y_predict = gbr_grid.predict(X_test)
print("gbr Accuracy: {:.2e}".format(gbr_grid.score(X_test, y_test)))
print(calculate_five_percent(y_predict,y_test))
print(calculate_ten_percent(y_predict,y_test))

gbr Accuracy: -4.40e+09
32.4
53.2


In [25]:
gbr_grid.best_params_

{'regressor__alpha': 0.7000000000000001,
 'regressor__learning_rate': 0.06999999999999999,
 'regressor__max_depth': 7}

In [26]:
kf = KFold()
skf = StratifiedKFold()
gbr_pipe2 = Pipeline([('scaler', StandardScaler()), ('regressor', GradientBoostingRegressor(loss='huber'))])
gbr_grid_params2 = {'regressor__alpha': np.arange(0.1,0.9,0.3)}
gbr_grid2 = GridSearchCV(gbr_pipe2, gbr_grid_params2, cv=kf, scoring = 'neg_mean_squared_error')
gbr_grid2.fit(X_train, y_train)
y_predict = gbr_grid2.predict(X_test)
print("gbr Accuracy: {:.2e}".format(gbr_grid.score(X_test, y_test)))
print(calculate_five_percent(y_predict,y_test))
print(calculate_ten_percent(y_predict,y_test))

gbr Accuracy: -1.61e+09


In [30]:
gbr_grid2.best_params_

{'regressor__alpha': 0.7000000000000001}

In [95]:
X_df = pd.DataFrame(X)

In [96]:
X_df.corr() > 0.2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,True,False,False,False,False,False,False,False,False,False
1,False,True,True,False,True,True,False,True,False,False
2,False,True,True,False,True,False,False,True,False,False
3,False,False,False,True,False,False,False,False,False,False
4,False,True,True,False,True,True,False,True,False,False
5,False,True,False,False,True,True,False,True,False,False
6,False,False,False,False,False,False,True,False,False,False
7,False,True,True,False,True,True,False,True,False,False
8,False,False,False,False,False,False,False,False,True,False
9,False,False,False,False,False,False,False,False,False,True


In [97]:
X_train = sm.add_constant(X_train)

In [98]:
X_opt = X_train[:,np.arange(0,10)]
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,248.1
Date:,"Sat, 27 Mar 2021",Prob (F-statistic):,1.0799999999999999e-216
Time:,15:00:55,Log-Likelihood:,-9383.2
No. Observations:,750,AIC:,18790.0
Df Residuals:,740,BIC:,18830.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.111e+07,3.74e+06,-2.971,0.003,-1.84e+07,-3.77e+06
x1,79.9889,46.982,1.703,0.089,-12.245,172.223
x2,94.9924,6.432,14.768,0.000,82.365,107.620
x3,6.4329,0.690,9.330,0.000,5.079,7.787
x4,1601.0958,287.692,5.565,0.000,1036.307,2165.885
x5,1319.0049,87.941,14.999,0.000,1146.361,1491.649
x6,7.166e+04,2.5e+04,2.871,0.004,2.27e+04,1.21e+05
x7,4888.7993,1.51e+04,0.323,0.747,-2.48e+04,3.46e+04
x8,-7844.5272,3576.188,-2.194,0.029,-1.49e+04,-823.844

0,1,2,3
Omnibus:,767.532,Durbin-Watson:,1.956
Prob(Omnibus):,0.0,Jarque-Bera (JB):,77395.951
Skew:,4.441,Prob(JB):,0.0
Kurtosis:,51.967,Cond. No.,122000000.0


In [202]:
X_opt = X_train[:,[0,1,2,4,5,6,7,8,9,10]]

In [203]:
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.713
Model:,OLS,Adj. R-squared:,0.713
Method:,Least Squares,F-statistic:,2409.0
Date:,"Wed, 24 Mar 2021",Prob (F-statistic):,0.0
Time:,22:03:59,Log-Likelihood:,-108570.0
No. Observations:,8740,AIC:,217200.0
Df Residuals:,8730,BIC:,217200.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.625e+07,1.77e+06,-14.793,0.000,-2.97e+07,-2.28e+07
x1,-94.1917,11.221,-8.394,0.000,-116.188,-72.196
x2,108.0740,1.407,76.822,0.000,105.316,110.832
x3,831.0016,76.436,10.872,0.000,681.169,980.834
x4,1033.5446,21.627,47.789,0.000,991.150,1075.939
x5,8.316e+04,6871.368,12.103,0.000,6.97e+04,9.66e+04
x6,-1.834e+04,3989.132,-4.597,0.000,-2.62e+04,-1.05e+04
x7,1.941e+04,1819.451,10.669,0.000,1.58e+04,2.3e+04
x8,1133.9101,205.944,5.506,0.000,730.212,1537.608

0,1,2,3
Omnibus:,6564.327,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,786991.236
Skew:,2.821,Prob(JB):,0.0
Kurtosis:,49.144,Cond. No.,216000000.0


In [99]:
y_predict = regressor_OLS.predict(X_test)

In [100]:
print('MSE: {:.2e}'.format(mean_squared_error(y_test, y_predict)))

MSE: 7.55e+23


In [101]:
calculate_five_percent(y_predict,y_test)
calculate_ten_percent(y_predict,y_test)

0.0