Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from statsmodels.formula.api import ols

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
%matplotlib inline

from custom_functions import remove_extremes

Load the dataset that was cleaned and prepared in the Data Preparation notebook.

In [2]:
# Pull the whole prepared table out of the project's SQLite database.
# NOTE(review): `conn` is left open here; close it once no later cell needs it.
conn = sqlite3.connect('../../data/processed/main.db')
query = '''SELECT * FROM df_main'''

# The stored table carries an 'index' column that is discarded right after loading.
df_main = pd.read_sql(query, conn).drop(columns='index')
df_main.head(3)

Unnamed: 0,SalePrice,Parcel_ID,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning,porch_both,porch_none,porch_closed,porch_open,heat_FloorWall,heat_Gravity,heat_Radiant,heat_ElecBB,heat_ForcedAir,heat_HotWater,heat_HeatPump,heat_Other,NbrLivingUnits,ZipCode,Stories,BldgGrade,SqFt1stFloor,SqFtHalfFloor,SqFt2ndFloor,SqFtUpperFloor,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,DaylightBasement,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,BrickStone,ViewUtilization,Bedrooms,BathHalfCount,Bath3qtrCount,BathFullCount,FpSingleStory,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition
0,445000.0,010050-0380,11,6,3,N,N,N,N,1,8,,0,0,0,1,0,0,0,0,1,0,0,0,1,98003,1,7,1480,0,0,0,0,0,1480,0,0,0,0,440,,80,0,120,5,2,0,,3,0,0,2,1,0,0,0,1994,0,0,0,0,5
1,188500.0,017900-0315,11,6,3,N,N,N,N,1,8,,0,1,0,0,1,0,0,0,0,0,0,0,1,98178,1,5,550,0,0,0,0,0,550,0,0,0,0,0,,0,0,0,1,1,0,,2,0,1,0,0,0,0,0,1942,0,0,0,0,3
2,730000.0,018800-0095,11,6,3,N,N,N,N,1,8,,0,1,0,0,1,0,0,0,0,0,0,0,1,98102,2,9,660,0,595,0,0,0,1540,380,285,8,0,0,Y,0,0,155,1,3,0,Y,3,1,1,1,0,1,0,0,1984,0,0,0,0,3


## Model *Porch* Predictions

In [3]:
# List every column label of the loaded frame.
df_main.columns

Index(['SalePrice', 'Parcel_ID', 'PropertyType', 'PrincipalUse',
       'SaleInstrument', 'AFForestLand', 'AFCurrentUseLand', 'AFNonProfitUse',
       'porch_both', 'porch_none', 'porch_closed', 'porch_open',
       'heat_FloorWall', 'heat_Gravity', 'heat_Radiant', 'heat_ElecBB',
       'heat_ForcedAir', 'heat_HotWater', 'heat_HeatPump', 'heat_Other',
       'NbrLivingUnits', 'ZipCode', 'Stories', 'BldgGrade', 'SqFt1stFloor',
       'SqFtHalfFloor', 'SqFt2ndFloor', 'SqFtUpperFloor', 'SqFtUnfinFull',
       'SqFtUnfinHalf', 'SqFtTotLiving', 'SqFtTotBasement', 'SqFtFinBasement',
       'FinBasementGrade', 'SqFtGarageBasement', 'SqFtGarageAttached',
       'DaylightBasement', 'SqFtOpenPorch', 'SqFtEnclosedPorch', 'SqFtDeck',
       'HeatSystem', 'HeatSource', 'BrickStone', 'ViewUtilization', 'Bedrooms',
       'BathHalfCount', 'Bath3qtrCount', 'BathFullCount', 'FpSingleStory',
       'FpMultiStory', 'FpFreestanding', 'FpAdditional', 'YrBuilt',
       'YrRenovated', 'PcntComplete', 'Obsole

In [4]:
def remove_df_extremes(df, devct, specific_columns=False, drop_zeros=False):
    """Replace extreme values with NaN, column by column.

    Each selected column is cast to float and every value that is not strictly
    inside ``(lower, upper)`` is replaced by ``NaN``, where

    * ``upper = median + devct * std``
    * ``lower = median - devct * std``, or ``0.0001`` when ``drop_zeros`` is
      true (so zeros and negatives are masked as well).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    devct : float
        Number of standard deviations around the median to keep.
    specific_columns : list-like of str, str, False or None, default False
        Columns to process. ``False``/``None`` means every column of ``df``.
        A single column name may be given as a plain string.
    drop_zeros : bool, default False
        If true, use ``0.0001`` as the lower bound instead of
        ``median - devct * std``.

    Returns
    -------
    pandas.DataFrame
        Float frame holding only the selected columns, one row per row of
        ``df`` on a fresh ``RangeIndex``, with out-of-range entries set to
        ``NaN``.
    """
    # Identity checks instead of `== False`: comparing a list-like (Index,
    # ndarray) with `==` is element-wise and cannot be used in an `if`.
    if specific_columns is False or specific_columns is None:
        selected = list(df.columns)
    elif isinstance(specific_columns, str):
        selected = [specific_columns]
    else:
        selected = list(specific_columns)

    masked = {}
    # Iterate over the columns chosen above -- never over a notebook-level
    # global -- so the function works on a fresh kernel and for any caller.
    for col in selected:
        values = df[col].astype(float).reset_index(drop=True)
        med = values.median()
        std = values.std()

        max_ = med + devct * std
        min_ = 0.0001 if drop_zeros else med - devct * std

        # Strict inequalities: values equal to a bound are masked too.
        masked[col] = values.where((values > min_) & (values < max_))

    return pd.DataFrame(masked, columns=selected)

In [5]:
# Display the eight heating-system columns (0/1 values in the rows shown below).
df_main[['heat_FloorWall', 'heat_Gravity', 'heat_Radiant', 'heat_ElecBB',
       'heat_ForcedAir', 'heat_HotWater', 'heat_HeatPump', 'heat_Other']]

Unnamed: 0,heat_FloorWall,heat_Gravity,heat_Radiant,heat_ElecBB,heat_ForcedAir,heat_HotWater,heat_HeatPump,heat_Other
0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
15948,0,0,0,0,1,0,0,0
15949,0,0,0,0,1,0,0,0
15950,0,0,0,0,1,0,0,0
15951,0,0,0,0,1,0,0,0


In [6]:
def produce_model(df, xs=False, y='SalePrice'):
    """Fit an OLS regression of ``y`` on ``xs`` and return its summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the outcome and predictor columns.
    xs : list-like of str, str, False or None, default False
        Predictor column names. ``False``/``None`` means every column of
        ``df`` except ``y``. A single column name may be given as a string.
    y : str, default 'SalePrice'
        Name of the outcome column.

    Returns
    -------
    The object returned by ``ols(...).fit().summary()``.

    Notes
    -----
    The formula string is printed before fitting. Column names are inserted
    into the formula verbatim, so they must be usable as formula terms.
    """
    # Identity checks instead of `xs == False`: that comparison is element-wise
    # for an Index/ndarray and raises "truth value is ambiguous" inside `if`.
    if xs is False or xs is None:
        predictors = list(df.columns.drop(y))
    elif isinstance(xs, str):
        predictors = [xs]
    else:
        # A list is required below: df[<tuple>] would be read as one key.
        predictors = list(xs)

    formula = y + ' ~ ' + '+'.join(predictors)
    print(formula)

    model_data = pd.concat([df[y], df[predictors]], axis=1)
    model = ols(formula, model_data).fit()
    return model.summary()


In [7]:
# Regress SalePrice on the heating indicator columns. 'heat_ForcedAir' is not in
# the list -- presumably left out as the reference category so the indicators
# are not collinear with the intercept (TODO confirm intent).
xs = ['heat_FloorWall', 'heat_Gravity', 'heat_Radiant', 'heat_ElecBB', 'heat_HotWater', 'heat_HeatPump', 'heat_Other']
produce_model(df_main, xs)

SalePrice ~ heat_FloorWall+heat_Gravity+heat_Radiant+heat_ElecBB+heat_HotWater+heat_HeatPump+heat_Other


0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.043
Model:,OLS,Adj. R-squared:,0.043
Method:,Least Squares,F-statistic:,103.1
Date:,"Thu, 04 Mar 2021",Prob (F-statistic):,3.7e-148
Time:,09:09:08,Log-Likelihood:,-234160.0
No. Observations:,15953,AIC:,468300.0
Df Residuals:,15945,BIC:,468400.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.757e+05,5105.469,151.939,0.000,7.66e+05,7.86e+05
heat_FloorWall,-2.293e+05,2.65e+04,-8.668,0.000,-2.81e+05,-1.77e+05
heat_Gravity,5861.3406,9.32e+04,0.063,0.950,-1.77e+05,1.88e+05
heat_Radiant,1.715e+05,4.26e+04,4.028,0.000,8.81e+04,2.55e+05
heat_ElecBB,-2.734e+05,1.78e+04,-15.323,0.000,-3.08e+05,-2.38e+05
heat_HotWater,4.513e+05,3.04e+04,14.826,0.000,3.92e+05,5.11e+05
heat_HeatPump,2.053e+05,1.78e+04,11.534,0.000,1.7e+05,2.4e+05
heat_Other,-9.294e+04,1.81e+05,-0.512,0.608,-4.48e+05,2.63e+05

0,1,2,3
Omnibus:,20277.732,Durbin-Watson:,1.8
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7565148.381
Skew:,6.721,Prob(JB):,0.0
Kurtosis:,108.832,Cond. No.,40.2


In [8]:
# Mask SalePrice and the four porch_* columns with a 3-standard-deviation
# window around each column's median (out-of-range entries become NaN).
cols = ['SalePrice', 'porch_both', 'porch_none', 'porch_closed', 'porch_open']
cleaned = remove_df_extremes(df_main, 3, cols, drop_zeros=False)


#x = cleaned['SqFtOpenPorch']
y = cleaned.SalePrice
# plt.scatter(x, y, color='r', alpha=0.5)
# plt.scatter(cleaned['SqFtEnclosedPorch'], y, color='b', alpha=0.5)

# From here on `cols` holds only the porch_* column names.
cols.remove('SalePrice')
#fig, axes = plt.subplots(2, 2, figsize=(12, 8))
#for col, ax in zip(cleaned.drop('SalePrice', axis=1).columns, axes.flatten()):
    #ax.scatter(cleaned[col], y)

    
# NOTE(review): the saved output of this cell is an empty Series, i.e. no
# porch_both value above 0 survived the filter. Presumably the 1s in this
# 0/1 column are rare enough to fall outside median +/- 3*std and were
# replaced by NaN; consider filtering only SalePrice -- TODO confirm.
cleaned.loc[cleaned.porch_both > 0, 'porch_both']

Series([], Name: porch_both, dtype: float64)

In [9]:

# Regress sale price on the two porch square-footage columns.
outcome = ['SalePrice']
# Alternative predictor set (heating indicators), kept for reference:
#columns = ['heat_FloorWall', 'heat_Gravity', 'heat_Radiant', 'heat_ElecBB', 'heat_HotWater', 'heat_HeatPump', 'heat_Other']
columns = ['SqFtOpenPorch', 'SqFtEnclosedPorch']

# Frame restricted to the outcome followed by the predictors.
predictors = df_main.loc[:, outcome + columns]

# Build "<outcome> ~ <x1>+<x2>" from the same column list.
pred_sum = '+'.join(columns)
formula = ' ~ '.join([outcome[0], pred_sum])

model = ols(formula, predictors).fit()
model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.062
Model:,OLS,Adj. R-squared:,0.062
Method:,Least Squares,F-statistic:,527.7
Date:,"Thu, 04 Mar 2021",Prob (F-statistic):,1.25e-222
Time:,09:09:09,Log-Likelihood:,-234000.0
No. Observations:,15953,AIC:,468000.0
Df Residuals:,15950,BIC:,468000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.117e+05,4950.516,143.767,0.000,7.02e+05,7.21e+05
SqFtOpenPorch,990.0358,30.494,32.466,0.000,930.264,1049.808
SqFtEnclosedPorch,137.1681,116.774,1.175,0.240,-91.722,366.059

0,1,2,3
Omnibus:,19328.382,Durbin-Watson:,1.795
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7833425.858
Skew:,6.079,Prob(JB):,0.0
Kurtosis:,110.875,Cond. No.,177.0
