In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse
from sqlalchemy import create_engine

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")



postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

houses = pd.read_sql_query('select * from houseprices',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

houses.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
houses = houses[['saleprice','overallqual','grlivarea','garagecars','garagearea','totalbsmtsf','fullbath','yearbuilt','yearremodadd']]

houses['saleprice_log'] = [float(np.log(i)) for i in houses['saleprice']]

features = list(houses.columns)
features.remove('saleprice')
features.remove('saleprice_log')
print('features (len: {}): {}\ntarget: saleprice_log'.format(len(features),{i for i in features}))

X = houses[features]

Y = houses['saleprice_log']

X = sm.add_constant(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

results = sm.OLS(Y, X).fit()

results.summary()

features (len: 8): {'totalbsmtsf', 'garagecars', 'fullbath', 'yearbuilt', 'garagearea', 'overallqual', 'yearremodadd', 'grlivarea'}
target: saleprice_log


0,1,2,3
Dep. Variable:,saleprice_log,R-squared:,0.82
Model:,OLS,Adj. R-squared:,0.819
Method:,Least Squares,F-statistic:,823.5
Date:,"Sun, 05 Jan 2020",Prob (F-statistic):,0.0
Time:,16:15:40,Log-Likelihood:,518.46
No. Observations:,1460,AIC:,-1019.0
Df Residuals:,1451,BIC:,-971.3
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8506,0.574,4.963,0.000,1.724,3.977
overallqual,0.0933,0.005,17.602,0.000,0.083,0.104
grlivarea,0.0002,1.34e-05,18.247,0.000,0.000,0.000
garagecars,0.0738,0.014,5.422,0.000,0.047,0.101
garagearea,5.657e-05,4.61e-05,1.228,0.220,-3.38e-05,0.000
totalbsmtsf,0.0001,1.29e-05,9.671,0.000,9.96e-05,0.000
fullbath,-0.0160,0.012,-1.344,0.179,-0.039,0.007
yearbuilt,0.0018,0.000,8.128,0.000,0.001,0.002
yearremodadd,0.0022,0.000,7.714,0.000,0.002,0.003

0,1,2,3
Omnibus:,994.212,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43000.326
Skew:,-2.604,Prob(JB):,0.0
Kurtosis:,29.072,Cond. No.,439000.0


# OLS

In [8]:
# We fit an OLS model using sklearn
lrm = LinearRegression()
lrm.fit(X_train, y_train)


# We are making predictions here
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.8112373214943298
-----Test set statistics-----
R-squared of the model in the test set is: 0.8483327565154258
Mean absolute error of the prediction is: 0.11384928621635201
Mean squared error of the prediction is: 0.02529019158639426
Root mean squared error of the prediction is: 0.15902890173296885
Mean absolute percentage error of the prediction is: 0.9526765230037398


# Ridge regression

In [9]:
# Fitting a ridge regression model. Alpha is the regularization
# parameter (usually called lambda). As alpha gets larger, parameter
# shrinkage grows more pronounced.
ridgeregr = Ridge(alpha=10**37) 
ridgeregr.fit(X_train, y_train)

# We are making predictions here
y_preds_train = ridgeregr.predict(X_train)
y_preds_test = ridgeregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(ridgeregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(ridgeregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))


R-squared of the model on the training set is: 0.0
-----Test set statistics-----
R-squared of the model on the test set is: -0.0013312764353405893
Mean absolute error of the prediction is: 0.3178263906622543
Mean squared error of the prediction is: 0.16696986930519447
Root mean squared error of the prediction is: 0.4086194676042668
Mean absolute percentage error of the prediction is: 2.6437833835698767


# Lasso

In [10]:
lassoregr = Lasso(alpha=10**20.5) 
lassoregr.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lassoregr.predict(X_train)
y_preds_test = lassoregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(lassoregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(lassoregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))


R-squared of the model on the training set is: 0.0
-----Test set statistics-----
R-squared of the model on the test set is: -0.0013312764353405893
Mean absolute error of the prediction is: 0.3178263906622543
Mean squared error of the prediction is: 0.16696986930519447
Root mean squared error of the prediction is: 0.4086194676042668
Mean absolute percentage error of the prediction is: 2.6437833835698767


# ElasticNet regression

In [11]:
elasticregr = ElasticNet(alpha=10**21, l1_ratio=0.5) 
elasticregr.fit(X_train, y_train)

# We are making predictions here
y_preds_train = elasticregr.predict(X_train)
y_preds_test = elasticregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(elasticregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(elasticregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))


R-squared of the model on the training set is: 0.0
-----Test set statistics-----
R-squared of the model on the test set is: -0.0013312764353405893
Mean absolute error of the prediction is: 0.3178263906622543
Mean squared error of the prediction is: 0.16696986930519447
Root mean squared error of the prediction is: 0.4086194676042668
Mean absolute percentage error of the prediction is: 2.6437833835698767


The first model using OLS is the best. This dataset also seems not to be fit for regression models due to very low R2.