## Feature Engineering/Train Test Split

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style('darkgrid')

import warnings
warnings.filterwarnings('ignore')

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
0,342000570,2014-09-09,429000.0,-1.482459,-1.451039,-1.089549,-0.277679,-0.915531,-0.082498,-0.304942,-0.629972,-0.560787,-0.856042,-0.649622,1922,1922.0,0.355179,-0.302785
1,7779200075,2014-09-09,689000.0,-1.482459,-0.475727,0.271949,-0.119683,-0.915531,-0.082498,2.310557,0.907199,-0.560787,-0.68691,1.874139,1953,1953.0,0.836769,-0.110297
2,7954300740,2014-09-09,527000.0,0.676671,0.499586,0.816548,-0.215789,0.937409,-0.082498,-0.304942,-0.629972,1.143952,1.258098,-0.649622,2000,2000.0,1.084861,-0.240382
3,1370803445,2014-09-09,1140000.0,0.676671,-0.475727,1.088848,-0.207652,-0.915531,-0.082498,-0.304942,0.907199,1.143952,-0.107032,2.488027,1941,1941.0,1.420514,-0.258384
4,4006000183,2014-09-09,450000.0,3.915367,2.450211,1.165092,-0.17626,0.937409,-0.082498,-0.304942,-0.629972,0.291583,1.644684,-0.649622,2013,2013.0,-0.155598,-0.247788


In [5]:
outcome = 'price'
x_cols = list(df.columns)
x_cols.remove(outcome)

In [6]:
train, test = train_test_split(df)

In [7]:
print(len(train), len(test))
train.head()

16197 5400


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
2014,4222200380,2014-08-05,237000.0,-0.402894,-0.150622,-0.403354,-0.173363,-0.915531,-0.082498,-0.304942,-0.629972,0.291583,-0.638587,0.373524,1968,1968.0,-0.330721,-0.17006
933,796000085,2014-09-23,175000.0,0.676671,-1.451039,-0.947953,-0.213689,-0.915531,-0.082498,-0.304942,-0.629972,-0.560787,-0.698991,-0.649622,1962,1962.0,-1.133371,-0.16379
2537,7522600030,2014-08-26,251000.0,-0.402894,-0.150622,-0.849926,-0.161772,-0.915531,-0.082498,-0.304942,0.907199,-0.560787,-0.590264,-0.649622,1967,1967.0,-1.002028,-0.192792
15535,1442300005,2015-02-18,435000.0,-0.402894,-0.150622,-0.043919,-0.198476,0.937409,-0.082498,-0.304942,0.907199,-0.560787,0.303715,-0.649622,1954,1954.0,-0.403689,-0.189235
16468,1118001835,2014-12-23,1720000.0,0.676671,0.499586,1.077956,-0.19058,0.937409,-0.082498,-0.304942,0.907199,1.996322,1.064806,0.259841,1927,1927.0,1.756168,-0.191948


In [8]:
test.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
7827,3438502083,2014-06-12,310000.0,1.756237,1.149794,-0.21819,-0.243873,-0.915531,-0.082498,-0.304942,-0.629972,-0.560787,-0.83188,1.123832,1997,1997.0,-0.505845,-0.284453
12423,7140600020,2015-04-14,245000.0,-0.402894,-1.451039,-1.187577,-0.13282,-0.915531,-0.082498,-0.304942,0.907199,-1.413157,-0.964769,-0.649622,1959,1959.0,-1.124615,-0.087785
1882,8044050040,2014-08-07,419950.0,0.676671,0.499586,0.195705,-0.239912,0.937409,-0.082498,-0.304942,-0.629972,0.291583,0.569493,-0.649622,1996,1996.0,0.39896,-0.252701
21194,2485000076,2015-01-22,1050000.0,0.676671,1.474898,1.742367,-0.157426,-0.915531,-0.082498,-0.304942,2.444371,1.996322,0.062099,3.533909,1959,1959.0,1.04108,-0.134129
7361,1087700030,2014-06-19,450000.0,-0.402894,-0.475727,-0.512274,-0.09416,-0.915531,-0.082498,-0.304942,-0.629972,-0.560787,-0.215759,-0.649622,1955,1955.0,-0.549626,-0.057133


In [9]:
# Fitting the actual model
predictors = '+'.join(x_cols)
formula = outcome + '~' + predictors
model = ols(formula=formula, data=train).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.669
Model:,OLS,Adj. R-squared:,0.66
Method:,Least Squares,F-statistic:,82.82
Date:,"Sun, 01 Nov 2020",Prob (F-statistic):,0.0
Time:,21:52:43,Log-Likelihood:,-221270.0
No. Observations:,16197,AIC:,443300.0
Df Residuals:,15811,BIC:,446300.0
Df Model:,385,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.344e+06,1.63e+05,45.069,0.000,7.02e+06,7.66e+06
date[T.2014-05-03],1.515e+05,1.51e+05,1.000,0.317,-1.45e+05,4.48e+05
date[T.2014-05-04],-7.163e+04,1.09e+05,-0.657,0.511,-2.85e+05,1.42e+05
date[T.2014-05-05],1.216e+04,4e+04,0.304,0.761,-6.63e+04,9.06e+04
date[T.2014-05-06],-1.357e+04,4.03e+04,-0.337,0.736,-9.26e+04,6.55e+04
date[T.2014-05-07],2.273e+04,3.83e+04,0.593,0.553,-5.24e+04,9.79e+04
date[T.2014-05-08],4.026e+04,3.99e+04,1.010,0.313,-3.79e+04,1.18e+05
date[T.2014-05-09],-8547.5486,3.9e+04,-0.219,0.827,-8.5e+04,6.79e+04
date[T.2014-05-10],-1.317e+04,1.25e+05,-0.105,0.916,-2.58e+05,2.32e+05

0,1,2,3
Omnibus:,10130.412,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,394167.124
Skew:,2.43,Prob(JB):,0.0
Kurtosis:,26.674,Cond. No.,1880000000000.0


### Removing Uninfluential Features

In [12]:
# Extracting the p-value table from the summary and use it to subset our features
summary = model.summary()
p_table = summary.tables[1]
p_table = pd.DataFrame(p_table.data)
p_table.columns = p_table.iloc[0]
p_table = p_table.drop(0)
p_table = p_table.set_index(p_table.columns[0])
p_table['P>|t|'] = p_table['P>|t|'].astype(float)
x_cols = list(p_table[p_table['P>|t|'] < 0.05].index)
x_cols.remove('Intercept')
print(len(p_table), len(x_cols))
print(x_cols[:5])
p_table.head(20)

386 38
['date[T.2014-05-13]', 'date[T.2014-07-02]', 'date[T.2014-07-05]', 'date[T.2014-08-15]', 'date[T.2014-10-11]']


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
,,,,,,
Intercept,7344000.0,163000.0,45.069,0.0,7020000.0,7660000.0
date[T.2014-05-03],151500.0,151000.0,1.0,0.317,-145000.0,448000.0
date[T.2014-05-04],-71630.0,109000.0,-0.657,0.511,-285000.0,142000.0
date[T.2014-05-05],12160.0,40000.0,0.304,0.761,-66300.0,90600.0
date[T.2014-05-06],-13570.0,40300.0,-0.337,0.736,-92600.0,65500.0
date[T.2014-05-07],22730.0,38300.0,0.593,0.553,-52400.0,97900.0
date[T.2014-05-08],40260.0,39900.0,1.01,0.313,-37900.0,118000.0
date[T.2014-05-09],-8547.5486,39000.0,-0.219,0.827,-85000.0,67900.0
date[T.2014-05-10],-13170.0,125000.0,-0.105,0.916,-258000.0,232000.0


In [11]:
predictors = '+'.join(x_cols)
formula = outcome + '~' + predictors
model = ols(formula=formula, data=train).fit()
model.summary()

SyntaxError: invalid syntax (<unknown>, line 1)