In [1]:
# running linear regression on the housing dataset

In [27]:
# Use this notebook for feature selection.
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from ipywidgets import interact, interact_manual
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mp
import seaborn as sns
from IPython.display import HTML

##### read in data #####
train = pd.read_csv('clean_data/train_fe_dum.csv', index_col=0)
test = pd.read_csv('clean_data/test_fe_dum.csv', index_col=0)



##### initial modifications #####

# set target
target = 'SalePrice'

# train without log of target
train_original = train.copy()

# take log of target variable
train[target] = np.log(train[target])

# drop target from train set
train.drop(target, axis=1, inplace=True)

In [33]:
train.dtypes

MSSubClass              int64
MSZoning               object
Street                 object
Alley                  object
LotShape               object
LandContour            object
Utilities              object
LotConfig              object
LandSlope              object
Neighborhood           object
BldgType               object
HouseStyle             object
OverallQual             int64
OverallCond            object
YearBuilt               int64
YearRemodAdd            int64
RoofStyle              object
RoofMatl               object
Exterior1st            object
Exterior2nd            object
MasVnrType             object
MasVnrArea            float64
ExterQual              object
ExterCond              object
Foundation             object
BsmtQual               object
BsmtCond               object
BsmtExposure           object
BsmtFinType1           object
BsmtFinSF1              int64
                       ...   
GarageType_Low          int64
GarageFinish_Unf        int64
GarageFini

In [26]:

x = sm.add_constant(train)
# Notice that the dependent variable (y) comes before the independent variables (x)
# Use df.columns.values to get all the column names
model = sm.OLS(np.array(train[[target]]), np.array(x))
results = model.fit()
print(results.summary())

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [88]:
pValues = results.pvalues
print('The significant coefficients')
sig_coefs = pValues[pValues < .05]
sig_coefs

The significant coefficients


const                   3.329847e-17
garagecars              7.914084e-09
wooddecksf              9.512737e-03
screenporch             4.846038e-03
poolarea                2.891367e-03
saletype_fnew           4.057703e-06
garagecond_ffa          3.274755e-04
garagecond_fgd          2.693848e-03
garagecond_fpo          8.192262e-04
garagecond_fta          2.142903e-04
garagequal_ffa          2.979280e-05
garagequal_fgd          2.262561e-04
garagequal_fpo          2.441930e-04
garagequal_fta          4.853302e-05
poolqc_ffa              7.350631e-06
poolqc_fgd              5.323378e-05
x1stflrsf               1.180410e-08
x2ndflrsf               6.389462e-11
lowqualfinsf            3.378453e-03
grlivarea               2.200574e-06
bedroomabvgr            7.158208e-06
kitchenabvgr            7.862147e-05
totrmsabvgrd            5.013399e-04
fireplaces              1.162822e-04
kitchenqual_ex          5.601943e-10
functional_maj1         2.064112e-02
functional_min1         6.077096e-03
f

In [40]:
significant_coefs = sig_coefs.index.tolist()

In [43]:
# reduce model significant_coefs
reduced_trainX = trainX[significant_coefs]
lm_reduced = LinearRegression()
lm_reduced.fit(reduced_trainX, trainY)
lm_reduced.score(reduced_trainX, trainY)

0.9111916982283614

In [67]:
reduced_x = sm.add_constant(reduced_trainX)
# Notice that the dependent variable (y) comes before the independent variables (x)
# Use df.columns.values to get all the column names
reduced_model = sm.OLS(trainY, reduced_x)
reduced_results = reduced_model.fit()
#print(reduced_results.summary())
reduced_results.rsquared

0.9111916982283614

In [51]:
pValues = reduced_results.pvalues
print('The significant coefficients')
red_sig_coefs = pValues[pValues < .05]

The significant coefficients


In [99]:
TrainX.columns.tolist()

['garagecars',
 'garagearea',
 'wooddecksf',
 'openporchsf',
 'enclosedporch',
 'x3ssnporch',
 'screenporch',
 'poolarea',
 'miscval',
 'paveddrive_fp',
 'paveddrive_fy',
 'saletype_fcon',
 'saletype_fconld',
 'saletype_fconli',
 'saletype_fconlw',
 'saletype_fcwd',
 'saletype_fnew',
 'saletype_foth',
 'saletype_fwd',
 'garagefinish_fnone',
 'garagefinish_frfn',
 'garagefinish_funf',
 'garagecond_ffa',
 'garagecond_fgd',
 'garagecond_fnone',
 'garagecond_fpo',
 'garagecond_fta',
 'garagequal_ffa',
 'garagequal_fgd',
 'garagequal_fnone',
 'garagequal_fpo',
 'garagequal_fta',
 'poolqc_ffa',
 'poolqc_fgd',
 'poolqc_fnone',
 'fence_fgdwo',
 'fence_fmnprv',
 'fence_fmnww',
 'fence_fnone',
 'miscfeature_fnone',
 'miscfeature_fothr',
 'miscfeature_fshed',
 'miscfeature_ftenc',
 'x1stflrsf',
 'x2ndflrsf',
 'lowqualfinsf',
 'grlivarea',
 'bsmtfullbath',
 'bsmthalfbath',
 'fullbath',
 'halfbath',
 'bedroomabvgr',
 'kitchenabvgr',
 'totrmsabvgrd',
 'fireplaces',
 'garageyrblt',
 'heatingqc_ex',
 

In [1]:
# write for loop to run regression, get best features, run regression, get best features, run regression....
significant_coefs = train.columns.tolist()
n = 50
trainY = df[target]
trainX = df[~cat_drop]
R2 = []
coefs = []

for i in range(0,n):
    trainX = TrainX[significant_coefs] # reset training set
    
    x = sm.add_constant(trainX)
    model = sm.OLS(trainY, x)
    results = model.fit()
    
    R2.append(results.rsquared)
    pValues = results.pvalues
    new_coefs = pValues[pValues < 0.05].index.tolist()[1:len(pValues[pValues < 0.05].index.tolist())]
    coefs.append(new_coefs)
    significant_coefs = new_coefs

for lst in coefs:
    print(len(lst))

NameError: name 'TrainX' is not defined