In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse

%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
house_price=pd.read_csv('./data/houseprice.csv')
house_price.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df_cat = house_price[house_price.select_dtypes(include = ["object"]).columns]
df_cat = pd.get_dummies(df_cat)
df_num = house_price[house_price.select_dtypes(exclude = ["object"]).columns]
df_all = pd.concat([df_cat,df_num],axis=1)

In [4]:
Y = df_all['SalePrice']
X = df_all[['OverallQual', 'GrLivArea','GarageCars','TotalBsmtSF','YearBuilt','YearRemodAdd','KitchenQual_Ex','Fireplaces','ExterQual_Gd','ExterQual_Ex','HeatingQC_Ex','Neighborhood_NridgHt']]

X=sm.add_constant(X)
results=sm.OLS(Y,X).fit()
results.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.805
Model:,OLS,Adj. R-squared:,0.803
Method:,Least Squares,F-statistic:,496.5
Date:,"Wed, 17 Jun 2020",Prob (F-statistic):,0.0
Time:,17:32:49,Log-Likelihood:,-17352.0
No. Observations:,1460,AIC:,34730.0
Df Residuals:,1447,BIC:,34800.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.052e+05,1.3e+05,-6.958,0.000,-1.16e+06,-6.5e+05
OverallQual,1.271e+04,1210.205,10.503,0.000,1.03e+04,1.51e+04
GrLivArea,46.3337,2.457,18.859,0.000,41.514,51.153
GarageCars,1.316e+04,1685.422,7.808,0.000,9854.174,1.65e+04
TotalBsmtSF,21.1368,2.676,7.899,0.000,15.887,26.386
YearBuilt,229.3135,46.247,4.958,0.000,138.595,320.032
YearRemodAdd,214.0643,62.491,3.425,0.001,91.481,336.648
KitchenQual_Ex,3.512e+04,4703.932,7.466,0.000,2.59e+04,4.43e+04
Fireplaces,1.04e+04,1667.367,6.237,0.000,7127.877,1.37e+04

0,1,2,3
Omnibus:,602.25,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,82309.766
Skew:,-0.859,Prob(JB):,0.0
Kurtosis:,39.743,Cond. No.,475000.0


In [6]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=465)

## OLS

In [14]:
ols_model = LinearRegression()
ols_model.fit(X_train,Y_train)

ols_model_train_predict = ols_model.predict(X_train)
ols_model_test_predict = ols_model.predict(X_test)

In [15]:

print("R-squared on train set :",ols_model.score(X_train,Y_train),"\n")

print("--- OLS model's statistics on test set ---\n")
print("R-squared :",ols_model.score(X_test,Y_test))
print("Mean Absolute Error :",mean_absolute_error(Y_test,ols_model_test_predict))
print("Mean Squared Error :",mse(Y_test,ols_model_test_predict))
print("Root of Mean Squared Error :",rmse(Y_test,ols_model_test_predict))

R-squared on train set : 0.7980140528301275 

--- OLS model's statistics on test set ---

R-squared : 0.8176759651887013
Mean Absolute Error : 23275.30987230918
Mean Squared Error : 1321004507.947627
Root of Mean Squared Error : 36345.62570582087


## Ridge

In [16]:
from sklearn.linear_model import Ridge

ridgeregr = Ridge(alpha=10**37) 
ridgeregr.fit(X_train, Y_train)

Y_preds_train = ridgeregr.predict(X_train)
Y_preds_test = ridgeregr.predict(X_test)

print("R-squared of the model in training set is: {}".format(ridgeregr.score(X_train, Y_train)))
print("-----Ridge model's statistics on test set-----")
print("R-squared of the model in test set is: {}".format(ridgeregr.score(X_test, Y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(Y_test, Y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(Y_test, Y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(Y_test, Y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((Y_test - Y_preds_test) / Y_test)) * 100))

R-squared of the model in training set is: 0.0
-----Ridge model's statistics on test set-----
R-squared of the model in test set is: -0.0019389852335491842
Mean absolute error of the prediction is: 58947.449052355034
Mean squared error of the prediction is: 7259415455.301054
Root mean squared error of the prediction is: 85202.20334768963
Mean absolute percentage error of the prediction is: 35.30720792964479


## Lasso

In [17]:
from sklearn.linear_model import Lasso

lassoregr = Lasso(alpha=10**20.5) 
lassoregr.fit(X_train, Y_train)

y_preds_train = lassoregr.predict(X_train)
y_preds_test = lassoregr.predict(X_test)

print("R-squared of the model in training set is: {}".format(lassoregr.score(X_train, Y_train)))
print("-----Lasso model's statistics on test set-----")
print("R-squared of the model in test set is: {}".format(lassoregr.score(X_test, Y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(Y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(Y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(Y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((Y_test - y_preds_test) / Y_test)) * 100))

R-squared of the model in training set is: 0.0
-----Lasso model's statistics on test set-----
R-squared of the model in test set is: -0.0019389852335491842
Mean absolute error of the prediction is: 58947.449052355034
Mean squared error of the prediction is: 7259415455.301054
Root mean squared error of the prediction is: 85202.20334768963
Mean absolute percentage error of the prediction is: 35.30720792964479


## ElasticNet

In [18]:
from sklearn.linear_model import ElasticNet

elasticregr = ElasticNet(alpha=10**21, l1_ratio=0.5) 
elasticregr.fit(X_train, Y_train)

y_preds_train = elasticregr.predict(X_train)
y_preds_test = elasticregr.predict(X_test)

print("R-squared of the model in training set is: {}".format(elasticregr.score(X_train, Y_train)))
print("-----ElasticNet model's statistics on test set-----")
print("R-squared of the model in test set is: {}".format(elasticregr.score(X_test, Y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(Y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(Y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(Y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((Y_test - y_preds_test) / Y_test)) * 100))

R-squared of the model in training set is: 0.0
-----ElasticNet model's statistics on test set-----
R-squared of the model in test set is: -0.0019389852335491842
Mean absolute error of the prediction is: 58947.449052355034
Mean squared error of the prediction is: 7259415455.301054
Root mean squared error of the prediction is: 85202.20334768963
Mean absolute percentage error of the prediction is: 35.30720792964479


## Conclusion

In [23]:
conc = pd.DataFrame()

conc["OLS"] = [ols_model.score(X_train,Y_train),ols_model.score(X_test,Y_test)]
conc["Ridge"] = [ridgeregr.score(X_train,Y_train),ridgeregr.score(X_test,Y_test)]
conc["Lasso"] = [lassoregr.score(X_train,Y_train),lassoregr.score(X_test,Y_test)]
conc["ElasticNet"] = [elasticregr.score(X_train,Y_train),elasticregr.score(X_test,Y_test)]

conc.index = ["Train set R-squared","Test set R-squared"]

conc

Unnamed: 0,OLS,Ridge,Lasso,ElasticNet
Train set R-squared,0.798,0.0,0.0,0.0
Test set R-squared,0.818,-0.002,-0.002,-0.002
