# Evaluating Performance: Houseprices

## Imports and connection

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sqlalchemy import create_engine

# Display preferences.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show every float that pandas displays with three decimal places.
pd.options.display.float_format = '{:.3f}'.format

# Suppress all Python warnings raised from this point on. The filter is
# installed after the library imports, so warnings they emit at import time
# are still shown once.
# NOTE(review): a blanket "ignore" also hides any warnings raised later while
# fitting models -- consider narrowing it to specific warning categories.
import warnings
warnings.filterwarnings(action="ignore")

  import pandas.util.testing as tm
  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [2]:
# Connection settings for the PostgreSQL instance that hosts the houseprices
# database. Each value can be overridden through an environment variable so
# credentials do not have to be edited into (or committed with) the notebook;
# the fallbacks are the values that were previously hard-coded here.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Only a single query is needed, so release the engine's pooled connections
# as soon as it finishes -- including when the query itself raises.
try:
    house_prices_df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

## Understanding the Data

In [3]:
# Summarise the frame: entry count, column names, non-null counts and dtypes.
house_prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   int64  
 1   mssubclass     1460 non-null   int64  
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   int64  
 5   street         1460 non-null   object 
 6   alley          91 non-null     object 
 7   lotshape       1460 non-null   object 
 8   landcontour    1460 non-null   object 
 9   utilities      1460 non-null   object 
 10  lotconfig      1460 non-null   object 
 11  landslope      1460 non-null   object 
 12  neighborhood   1460 non-null   object 
 13  condition1     1460 non-null   object 
 14  condition2     1460 non-null   object 
 15  bldgtype       1460 non-null   object 
 16  housestyle     1460 non-null   object 
 17  overallqual    1460 non-null   int64  
 18  overallc

In [4]:
# Preview the first rows of the loaded table.
house_prices_df.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Evaluating Performance

#### Assessing goodness of fit using F-test, R-squared, adjusted R-squared, AIC and BIC

In order to improve the goodness of fit of the model, I will try different model specifications by adding or removing some variables.

In [5]:
# Encode the two categorical predictors as indicator columns. The first level
# of each variable is dropped (drop_first=True) so it serves as the baseline.
# The dtype is pinned to uint8 so the indicators stay numeric regardless of the
# installed pandas version (newer releases default to bool, which turns the
# mixed design matrix into an object array that statsmodels rejects).
mszoning_dummies = pd.get_dummies(
    house_prices_df.mszoning, prefix="mszoning", drop_first=True, dtype=np.uint8)
street_dummies = pd.get_dummies(
    house_prices_df.street, prefix="street", drop_first=True, dtype=np.uint8)

# Names of every indicator column, in the order they are appended below.
dummy_column_names = list(mszoning_dummies.columns) + list(street_dummies.columns)

# Drop indicator columns left over from a previous run of this cell before
# appending, so re-running the cell does not create duplicate columns.
house_prices_df = pd.concat(
    [
        house_prices_df.drop(columns=dummy_column_names, errors="ignore"),
        mszoning_dummies,
        street_dummies,
    ],
    axis=1,
)

In [6]:
# Candidate predictors for this specification, and the subset that is left
# out of the fitted model.
candidate_features = ['overallqual', 'grlivarea', 'garagecars', 'garagearea', 'totalbsmtsf'] + dummy_column_names
excluded_features = ['garagearea', 'mszoning_FV', 'mszoning_RH', 'mszoning_RM', 'street_Pave']

# X is the feature set (with an intercept column); Y is the target variable.
X = sm.add_constant(house_prices_df[candidate_features].drop(columns=excluded_features))
Y = house_prices_df['saleprice']

# Fit ordinary least squares and display the regression summary.
results = sm.OLS(Y, X).fit()
results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.767
Model:,OLS,Adj. R-squared:,0.766
Method:,Least Squares,F-statistic:,956.8
Date:,"Sat, 16 May 2020",Prob (F-statistic):,0.0
Time:,22:27:30,Log-Likelihood:,-17481.0
No. Observations:,1460,AIC:,34970.0
Df Residuals:,1454,BIC:,35010.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.083e+05,4804.236,-22.540,0.000,-1.18e+05,-9.89e+04
overallqual,2.396e+04,1060.549,22.588,0.000,2.19e+04,2.6e+04
grlivarea,45.4093,2.452,18.517,0.000,40.599,50.220
garagecars,1.763e+04,1731.766,10.183,0.000,1.42e+04,2.1e+04
totalbsmtsf,28.8729,2.862,10.088,0.000,23.259,34.487
mszoning_RL,1.596e+04,2558.589,6.238,0.000,1.09e+04,2.1e+04

0,1,2,3
Omnibus:,402.656,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,35429.68
Skew:,-0.08,Prob(JB):,0.0
Kurtosis:,27.133,Cond. No.,9530.0


* R-squared: 0.767
* Adjusted R-squared: 0.766
* F-stat: p-value < 0.05
* AIC: 3.497e+04
* BIC: 3.501e+04

In [7]:
# Candidate predictors for this specification, and the subset that is left
# out of the fitted model.
candidate_features = ['overallqual', 'grlivarea', 'garagecars', 'garagearea'] + dummy_column_names
excluded_features = ['garagearea', 'mszoning_FV', 'mszoning_RH', 'mszoning_RM', 'street_Pave']

# X is the feature set (with an intercept column); Y is the target variable.
X = sm.add_constant(house_prices_df[candidate_features].drop(columns=excluded_features))
Y = house_prices_df['saleprice']

# Fit ordinary least squares and display the regression summary.
results = sm.OLS(Y, X).fit()
results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,1095.0
Date:,"Sat, 16 May 2020",Prob (F-statistic):,0.0
Time:,22:27:31,Log-Likelihood:,-17530.0
No. Observations:,1460,AIC:,35070.0
Df Residuals:,1455,BIC:,35100.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.109e+05,4960.483,-22.360,0.000,-1.21e+05,-1.01e+05
overallqual,2.709e+04,1048.606,25.833,0.000,2.5e+04,2.91e+04
grlivarea,49.6123,2.499,19.853,0.000,44.710,54.514
garagecars,1.963e+04,1778.979,11.036,0.000,1.61e+04,2.31e+04
mszoning_RL,2.123e+04,2589.895,8.198,0.000,1.62e+04,2.63e+04

0,1,2,3
Omnibus:,463.844,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11440.264
Skew:,0.908,Prob(JB):,0.0
Kurtosis:,16.593,Cond. No.,7850.0


* R-squared: 0.751
* Adjusted R-squared: 0.750
* F-stat: p-value < 0.05
* AIC: 3.507e+04
* BIC: 3.510e+04

In [8]:
# Candidate predictors for this specification, and the subset that is left
# out of the fitted model.
candidate_features = ['overallqual', 'grlivarea', 'garagecars', 'garagearea', 'totalbsmtsf', 'yearbuilt'] + dummy_column_names
excluded_features = ['garagearea', 'mszoning_FV', 'mszoning_RH', 'mszoning_RM', 'street_Pave']

# X is the feature set (with an intercept column); Y is the target variable.
X = sm.add_constant(house_prices_df[candidate_features].drop(columns=excluded_features))
Y = house_prices_df['saleprice']

# Fit ordinary least squares and display the regression summary.
results = sm.OLS(Y, X).fit()
results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.772
Model:,OLS,Adj. R-squared:,0.771
Method:,Least Squares,F-statistic:,819.0
Date:,"Sat, 16 May 2020",Prob (F-statistic):,0.0
Time:,22:27:31,Log-Likelihood:,-17465.0
No. Observations:,1460,AIC:,34940.0
Df Residuals:,1453,BIC:,34980.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.866e+05,8.59e+04,-6.825,0.000,-7.55e+05,-4.18e+05
overallqual,2.121e+04,1159.325,18.299,0.000,1.89e+04,2.35e+04
grlivarea,49.8040,2.552,19.514,0.000,44.797,54.810
garagecars,1.441e+04,1809.200,7.964,0.000,1.09e+04,1.8e+04
totalbsmtsf,27.4632,2.844,9.656,0.000,21.884,33.042
yearbuilt,252.6358,45.329,5.573,0.000,163.718,341.553
mszoning_RL,1.28e+04,2595.069,4.934,0.000,7712.952,1.79e+04

0,1,2,3
Omnibus:,418.584,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42358.719
Skew:,-0.128,Prob(JB):,0.0
Kurtosis:,29.386,Cond. No.,236000.0


* R-squared: 0.772
* Adjusted R-squared: 0.771
* F-stat: p-value < 0.05
* AIC: 3.494e+04
* BIC: 3.498e+04

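Based on the goodness-of-fit metrics of each model, I can confidently say that the third model is the one with the most predictive power, without raising too much concern about overfitting. Its R-squared and adjusted R-squared values (0.772 and 0.771) are the highest of the three, its F-statistic is statistically significant, and it has the lowest AIC and BIC values (3.494e+04 and 3.498e+04). Because adjusted R-squared, AIC and BIC all penalize additional parameters, this improvement is not merely an artifact of adding more variables.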