In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import bartlett
from scipy.stats import levene
from sklearn import linear_model
from sqlalchemy import create_engine
from statsmodels.tsa.stattools import acf

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
#load data
# Connection settings are read from environment variables when they are set,
# so credentials can be supplied without editing the notebook. The fallbacks
# are the values this cell previously hardcoded, which keeps a plain
# Restart & Run All working unchanged.
# NOTE(review): the fallback password still lives in the notebook; remove it
# if this database is not meant to be publicly accessible.
postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PASSWORD', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
try:
    house_df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # no need for an open connection, as we're only doing a single query;
    # disposing in `finally` also releases the pool when the query raises
    engine.dispose()

In [3]:
#select variables for model
categories2 = ['mszoning', 'street','centralair', 'kitchenqual']

#create dummy variables
# Each categorical is one-hot encoded with its first level dropped, so the
# dropped level acts as the reference category in the regressions below.
# dtype is pinned to uint8 so the dummies are numeric regardless of the pandas
# version: newer pandas releases default to bool dummies, which statsmodels
# may reject when they are mixed with numeric columns in one design matrix.
zoning_dummies = pd.get_dummies(
    house_df['mszoning'], prefix='mszoning', drop_first=True, dtype=np.uint8)
kitchen_dummies = pd.get_dummies(
    house_df['kitchenqual'], prefix='kitchenqual', drop_first=True, dtype=np.uint8)

# Column names are taken from the frames built above instead of encoding twice.
zoning_column_names = list(zoning_dummies.columns)
kitchen_column_names = list(kitchen_dummies.columns)

# Remove dummy columns left over from an earlier run of this cell so that
# re-running it does not append duplicate columns to house_df.
house_df = house_df.drop(
    columns=zoning_column_names + kitchen_column_names, errors='ignore')

house_df = pd.concat([house_df, zoning_dummies], axis=1)
# street and centralair are expected to have two levels each, so dropping the
# first leaves a single indicator column (assigning a multi-column frame to
# one column would raise) -- TODO confirm against the data.
house_df['street_access'] = pd.get_dummies(
    house_df['street'], drop_first=True, dtype=np.uint8)
house_df['has_AC'] = pd.get_dummies(
    house_df['centralair'], drop_first=True, dtype=np.uint8)
house_df = pd.concat([house_df, kitchen_dummies], axis=1)

In [4]:
# Response variable: the sale price column.
Y = house_df['saleprice']

# Predictors: the numeric/binary columns listed here plus every zoning and
# kitchen-quality dummy column created earlier.
base_feature_names = [
    'overallqual', 'totalbsmtsf', 'firstflrsf', 'grlivarea',
    'garagecars', 'garagearea', 'street_access', 'has_AC',
]
all_feature_names = base_feature_names + zoning_column_names + kitchen_column_names

# Add the intercept column to the design matrix, then fit by ordinary least squares.
X = sm.add_constant(house_df[all_feature_names])
results = sm.OLS(Y, X).fit()

results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.794
Model:,OLS,Adj. R-squared:,0.792
Method:,Least Squares,F-statistic:,371.8
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,0.0
Time:,13:05:39,Log-Likelihood:,-17390.0
No. Observations:,1460,AIC:,34810.0
Df Residuals:,1444,BIC:,34900.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.691e+04,1.89e+04,-1.426,0.154,-6.39e+04,1.01e+04
overallqual,1.777e+04,1163.014,15.280,0.000,1.55e+04,2.01e+04
totalbsmtsf,20.6404,4.047,5.100,0.000,12.702,28.579
firstflrsf,4.3663,4.793,0.911,0.362,-5.035,13.767
grlivarea,44.4075,2.521,17.617,0.000,39.463,49.352
garagecars,1.419e+04,2843.841,4.990,0.000,8613.025,1.98e+04
garagearea,7.2384,9.889,0.732,0.464,-12.161,26.638
street_access,-5206.4437,1.54e+04,-0.337,0.736,-3.55e+04,2.51e+04
has_AC,1.016e+04,4306.673,2.358,0.018,1708.935,1.86e+04

0,1,2,3
Omnibus:,500.709,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56929.274
Skew:,-0.544,Prob(JB):,0.0
Kurtosis:,33.572,Cond. No.,65400.0


Which features are statistically significant, and which are not?

The coefficients for overall quality (overallqual), total basement square footage (totalbsmtsf), above-ground living area (grlivarea), garage capacity in cars (garagecars), central air conditioning (has_AC), mszoning_FV, mszoning_RL, kitchenqual_Fa, kitchenqual_Gd, and kitchenqual_TA are statistically significant at the 5% level. The coefficients for first floor square footage (firstflrsf), garage area (garagearea), street access, mszoning_RH, and mszoning_RM are not statistically significant.

Now, exclude the insignificant features from your model.

In [5]:
# Reduced feature set: the full model's predictors minus the ones judged
# insignificant above.
significant_feature_names = [
    'overallqual', 'totalbsmtsf', 'grlivarea', 'garagecars', 'has_AC',
    'mszoning_FV', 'mszoning_RL',
    'kitchenqual_Fa', 'kitchenqual_Gd', 'kitchenqual_TA',
]

# Add the intercept column, then refit by ordinary least squares against the
# same target Y as the first model.
X2 = sm.add_constant(house_df[significant_feature_names])
results2 = sm.OLS(Y, X2).fit()

results2.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.794
Model:,OLS,Adj. R-squared:,0.792
Method:,Least Squares,F-statistic:,558.2
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,0.0
Time:,13:05:39,Log-Likelihood:,-17391.0
No. Observations:,1460,AIC:,34800.0
Df Residuals:,1449,BIC:,34860.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.053e+04,9102.173,-2.256,0.024,-3.84e+04,-2678.589
overallqual,1.768e+04,1147.122,15.409,0.000,1.54e+04,1.99e+04
totalbsmtsf,23.7283,2.739,8.664,0.000,18.356,29.100
grlivarea,45.4407,2.335,19.463,0.000,40.861,50.020
garagecars,1.595e+04,1667.736,9.563,0.000,1.27e+04,1.92e+04
has_AC,1.023e+04,4255.049,2.405,0.016,1886.488,1.86e+04
mszoning_FV,1.619e+04,5342.893,3.030,0.002,5710.727,2.67e+04
mszoning_RL,1.887e+04,2729.682,6.913,0.000,1.35e+04,2.42e+04
kitchenqual_Fa,-5.671e+04,7942.782,-7.140,0.000,-7.23e+04,-4.11e+04

0,1,2,3
Omnibus:,494.794,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55060.253
Skew:,-0.527,Prob(JB):,0.0
Kurtosis:,33.066,Cond. No.,24700.0


Did anything change?

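The coefficients changed a little when I removed the insignificant variables, but the model overall is similar to the original: R-squared (0.794) and adjusted R-squared (0.792) are unchanged, while the F-statistic rose from 371.8 to 558.2 and the AIC and BIC fell slightly (34810 to 34800 and 34900 to 34860). The smaller model therefore fits essentially as well with five fewer parameters.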

Interpret the statistically significant coefficients by quantifying their relations with the house prices. Which features have a more prominent effect on house prices?

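The kitchen quality variables (fair, good, and average) have the largest coefficients in absolute value, and all of them are negative: for example, a fair kitchen is associated with a sale price about 56,700 lower than an excellent kitchen (the omitted baseline), holding the other features constant. Among the remaining variables, RL and FV zoning are associated with sale prices about 18,900 and 16,200 higher than the omitted zoning categories, each one-point increase in overall quality with about 17,700 more, each additional car of garage capacity with about 16,000 more, and central air conditioning with about 10,200 more. Of the statistically significant variables, total basement square footage and above-ground living area have the smallest coefficients (about 24 and 45 per square foot). These are per-square-foot effects, however, so across the hundreds of square feet by which houses differ they can still add up to a large difference in price.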

Do the results sound reasonable to you? If not, try to explain the potential reasons.

This sounds reasonable because the kitchen is one of the most important rooms in the house and can be very expensive to renovate. Since the dummy variable for an excellent kitchen is the one that was dropped, excellent is the reference category in the model. This is why every other kitchen quality level is associated with a lower sale price than the baseline.