In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_log_error

In [2]:
train = pd.read_csv('Housing Dreams/house_train_raw.csv')
test = pd.read_csv('Housing Dreams/houses_test_raw.csv')
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


EDA

In [3]:
# count null values
train_null = train.isnull().sum()
print(train_null[train_null > 0])

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [4]:
# drop columns with most null values
train = train.drop(['Id','Alley','FireplaceQu','PoolQC','Fence','MiscFeature'], axis=1)

In [5]:
# save category features
category_features = []
for col in train.columns:
  if train[col].dtypes=='object':
    category_features.append(col)
print(category_features)

['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']


In [6]:
# convert categoric to numeric 
train_dummies = pd.get_dummies(train, columns = category_features, drop_first = False)
train_dummies.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [7]:
# drop categoric 
train = train.drop(category_features, axis=1)

In [8]:
# concatenate original dataset with dummies
train_new = pd.concat([train, train_dummies])
train_new.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,,,,,,,,,,
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,,,,,,,,,,
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,,,,,,,,,,
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,,,,,,,,,,
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,,,,,,,,,,


Model

In [9]:
# set x and y values for the model
x = train_new.drop('SalePrice',axis=1)
y = train_new['SalePrice']

In [10]:
# impute null values
imputer = SimpleImputer()
x_imp = imputer.fit_transform(x)
print(x_imp)

[[6.00000000e+01 6.50000000e+01 8.45000000e+03 ... 1.36986301e-02
  8.20547945e-01 8.56164384e-02]
 [2.00000000e+01 8.00000000e+01 9.60000000e+03 ... 1.36986301e-02
  8.20547945e-01 8.56164384e-02]
 [6.00000000e+01 6.80000000e+01 1.12500000e+04 ... 1.36986301e-02
  8.20547945e-01 8.56164384e-02]
 ...
 [7.00000000e+01 6.60000000e+01 9.04200000e+03 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [2.00000000e+01 6.80000000e+01 9.71700000e+03 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [2.00000000e+01 7.50000000e+01 9.93700000e+03 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]]


In [11]:
# standardization 
scaler = StandardScaler()
x_imp_stdz = scaler.fit_transform(x_imp)
print(x_imp_stdz)

[[ 0.07337496 -0.22937175 -0.20714171 ...  0.          0.
   0.        ]
 [-0.87256276  0.4519361  -0.09188637 ...  0.          0.
   0.        ]
 [ 0.07337496 -0.09311018  0.07347998 ...  0.          0.
   0.        ]
 ...
 [ 0.30985939 -0.18395123 -0.14781027 ... -0.16666667  0.66135895
  -0.43274232]
 [-0.87256276 -0.09311018 -0.08016039 ... -0.16666667  0.66135895
  -0.43274232]
 [-0.87256276  0.22483348 -0.05811155 ... -0.16666667  0.66135895
  -0.43274232]]


In [12]:
# split test and train 
x_train, x_test, y_train, y_test = train_test_split(x_imp_stdz, y, test_size=0.3, random_state=0)

In [13]:
# create and train the model
model = ElasticNet()
model.fit(x_train, y_train) 

ElasticNet()

In [14]:
# predict train dataset 
predict_train = model.predict(x_train)
print(predict_train)

[123153.22110815 135162.25203879 193523.15877203 ... 149240.17299553
 166642.71346778 131255.41077152]


In [15]:
# predict test dataset
predict_test = model.predict(x_test)
print(predict_test)

[171996.5220812   96208.06779695 101874.98719282 290282.86315194
 246574.92886177 125948.9141133  122039.73756436 157850.40500984
 213096.97382937 151318.62031051 113286.74321327 113364.31257909
 153751.52670938 201281.03520959 180253.74069717 166797.92764895
 127997.34237792 146354.1796567  150698.30153028 138576.27185269
 177145.90645159  98739.53067039  90557.09958192 193711.33230722
 210834.59040864 123778.94517573 211214.87344682 151553.35616863
 358390.37757677 280236.13092192 146229.71932299 123230.53804899
 166109.83536937  97084.08963788 121257.88793375 138128.03331211
 154587.40282965 227235.80165812 192793.83131339 216882.58748491
 187851.547617   119565.88828829 214988.83478521 234209.96359098
 209330.77314672 135691.05145877 245359.24848971 184207.02761817
 159442.22936816 168591.04020453 255074.62773102 132603.42793609
 137129.13800246 272722.63043029 212849.24310256 156856.14301382
 185666.19898848 182533.75303428 179190.85053826 123486.97204442
 134454.51785765 150372.7

Model Evaluation

In [16]:
# RMSLE 
rmsle = mean_squared_log_error(y_test, predict_test, squared=False)
print('RMSLE: ' + str(round(rmsle,2)))

RMSLE: 0.16
