In [1]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels as sm
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV


# Display preferences.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show floats in pandas output with three decimal places.
pd.options.display.float_format = '{:.3f}'.format

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides estimator warnings (e.g. convergence) - confirm that is intended.
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
#load data
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks
# are the values this notebook originally used, so it still runs unchanged
# when no variables are set.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'houseprices')

# Percent-encode user and password: characters such as '@', '/' or ':' would
# otherwise be misread as URL delimiters when the connection string is parsed.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

# Only a single query is needed, so release the connection pool right after
# it, even when the query itself raises.
try:
    house_df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

In [3]:
#create dummy variables
# Each indicator frame is built once and reused for both the concat and the
# column-name list. Indicator columns left behind by an earlier run of this
# cell are dropped first, so re-running the cell does not append duplicates.

# Zoning class -> one indicator column per level (first level dropped).
zoning_dummies = pd.get_dummies(house_df.mszoning, prefix='mszoning', drop_first=True)
zoning_column_names = list(zoning_dummies.columns)
house_df = pd.concat(
    [house_df.drop(columns=zoning_column_names, errors='ignore'), zoning_dummies], axis=1)

# Single indicator columns: take the first (expected to be the only) column
# left after drop_first, instead of assigning a whole DataFrame to one column.
# NOTE(review): assumes street and centralair each have exactly two levels - confirm.
house_df['street_access'] = pd.get_dummies(house_df.street, drop_first=True).iloc[:, 0]
house_df['has_AC'] = pd.get_dummies(house_df.centralair, drop_first=True).iloc[:, 0]

# Kitchen quality -> one indicator column per level (first level dropped).
kitchen_dummies = pd.get_dummies(house_df.kitchenqual, prefix='kitchenqual', drop_first=True)
kitchen_column_names = list(kitchen_dummies.columns)
house_df = pd.concat(
    [house_df.drop(columns=kitchen_column_names, errors='ignore'), kitchen_dummies], axis=1)

In [4]:
# Response variable: the sale price column.
Y = house_df['saleprice']

# Predictors: hand-picked columns plus the zoning and kitchen-quality
# indicator columns created in the dummy-variable cell.
base_feature_names = ['overallqual', 'totalbsmtsf', 'firstflrsf', 'grlivarea',
                      'garagecars', 'garagearea', 'street_access', 'has_AC']
X = house_df[base_feature_names + zoning_column_names + kitchen_column_names]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=465)

# Candidate alpha values: powers of ten from 1e-10 through 1e39.
alphas = [np.power(10.0, exponent) for exponent in np.arange(-10, 40, 1)]

# Ordinary least squares fit on the training split.
linear_model = LinearRegression().fit(X_train, Y_train)

Y_preds_train = linear_model.predict(X_train)
Y_preds_test = linear_model.predict(X_test)

# Compute every reported figure up front, then print them in one place.
train_r2 = linear_model.score(X_train, Y_train)
test_r2 = linear_model.score(X_test, Y_test)
test_mae = mean_absolute_error(Y_test, Y_preds_test)
test_mse = mse(Y_test, Y_preds_test)
test_rmse = rmse(Y_test, Y_preds_test)
absolute_pct_errors = np.abs((Y_test - Y_preds_test) / Y_test)
test_mape = np.mean(absolute_pct_errors) * 100

print('R-squared of the model on the training set is: {}'.format(train_r2))
print('-----Test set statistics-----')
print('R-squared of the model on the test set is: {}'.format(test_r2))
print('Mean absolute error of the prediction is: {}'.format(test_mae))
print('Mean squared error of the prediction is: {}'.format(test_mse))
print('Root mean squared error of the prediction is: {}'.format(test_rmse))
print('Mean absolute percentage error of the prediction is: {}'.format(test_mape))


R-squared of the model on the training set is: 0.7908179363492478
-----Test set statistics-----
R-squared of the model on the test set is: 0.8026735994841876
Mean absolute error of the prediction is: 23453.890959943325
Mean squared error of the prediction is: 1324791060.0518851
Root mean squared error of the prediction is: 36397.67932233984
Mean absolute percentage error of the prediction is: 14.081455488212924


In [5]:
# Lasso regression; alpha is chosen from `alphas` by 10-fold cross-validation.
lasso_cv = LassoCV(alphas=alphas, cv=10).fit(X_train, Y_train)

Y_preds_train = lasso_cv.predict(X_train)
Y_preds_test = lasso_cv.predict(X_test)

# Compute every reported figure up front, then print them in one place.
train_r2 = lasso_cv.score(X_train, Y_train)
test_r2 = lasso_cv.score(X_test, Y_test)
test_mae = mean_absolute_error(Y_test, Y_preds_test)
test_mse = mse(Y_test, Y_preds_test)
test_rmse = rmse(Y_test, Y_preds_test)
absolute_pct_errors = np.abs((Y_test - Y_preds_test) / Y_test)
test_mape = np.mean(absolute_pct_errors) * 100

print('Best alpha value is: {}'.format(lasso_cv.alpha_))
print('R-squared of the model on the training set is: {}'.format(train_r2))
print('-----Test set statistics-----')
print('R-squared of the model on the test set is: {}'.format(test_r2))
print('Mean absolute error of the prediction is: {}'.format(test_mae))
print('Mean squared error of the prediction is: {}'.format(test_mse))
print('Root mean squared error of the prediction is: {}'.format(test_rmse))
print('Mean absolute percentage error of the prediction is: {}'.format(test_mape))


Best alpha value is: 10.0
R-squared of the model on the training set is: 0.79078519415009
-----Test set statistics-----
R-squared of the model on the test set is: 0.8024034180451981
Mean absolute error of the prediction is: 23469.568157030077
Mean squared error of the prediction is: 1326604978.2809198
Root mean squared error of the prediction is: 36422.58884649634
Mean absolute percentage error of the prediction is: 14.066284547586056


In [6]:
# Ridge regression; alpha is chosen from `alphas` by 10-fold cross-validation.
ridge_cv = RidgeCV(alphas=alphas, cv=10).fit(X_train, Y_train)

Y_preds_train = ridge_cv.predict(X_train)
Y_preds_test = ridge_cv.predict(X_test)

# Compute every reported figure up front, then print them in one place.
train_r2 = ridge_cv.score(X_train, Y_train)
test_r2 = ridge_cv.score(X_test, Y_test)
test_mae = mean_absolute_error(Y_test, Y_preds_test)
test_mse = mse(Y_test, Y_preds_test)
test_rmse = rmse(Y_test, Y_preds_test)
absolute_pct_errors = np.abs((Y_test - Y_preds_test) / Y_test)
test_mape = np.mean(absolute_pct_errors) * 100

print('Best alpha value is: {}'.format(ridge_cv.alpha_))
print('R-squared of the model on the training set is: {}'.format(train_r2))
print('-----Test set statistics-----')
print('R-squared of the model on the test set is: {}'.format(test_r2))
print('Mean absolute error of the prediction is: {}'.format(test_mae))
print('Mean squared error of the prediction is: {}'.format(test_mse))
print('Root mean squared error of the prediction is: {}'.format(test_rmse))
print('Mean absolute percentage error of the prediction is: {}'.format(test_mape))


Best alpha value is: 1.0
R-squared of the model on the training set is: 0.7907100404281503
-----Test set statistics-----
R-squared of the model on the test set is: 0.8017750036720747
Mean absolute error of the prediction is: 23501.159113276633
Mean squared error of the prediction is: 1330823966.4211059
Root mean squared error of the prediction is: 36480.460063177736
Mean absolute percentage error of the prediction is: 14.094977294436678


In [7]:
# Elastic net regression; alpha is chosen from `alphas` by 10-fold cross-validation.
elastic_net_cv = ElasticNetCV(alphas=alphas, cv=10).fit(X_train, Y_train)

Y_preds_train = elastic_net_cv.predict(X_train)
Y_preds_test = elastic_net_cv.predict(X_test)

# Compute every reported figure up front, then print them in one place.
train_r2 = elastic_net_cv.score(X_train, Y_train)
test_r2 = elastic_net_cv.score(X_test, Y_test)
test_mae = mean_absolute_error(Y_test, Y_preds_test)
test_mse = mse(Y_test, Y_preds_test)
test_rmse = rmse(Y_test, Y_preds_test)
absolute_pct_errors = np.abs((Y_test - Y_preds_test) / Y_test)
test_mape = np.mean(absolute_pct_errors) * 100

print('Best alpha value is: {}'.format(elastic_net_cv.alpha_))
print('R-squared of the model on the training set is: {}'.format(train_r2))
print('-----Test set statistics-----')
print('R-squared of the model on the test set is: {}'.format(test_r2))
print('Mean absolute error of the prediction is: {}'.format(test_mae))
print('Mean squared error of the prediction is: {}'.format(test_mse))
print('Root mean squared error of the prediction is: {}'.format(test_rmse))
print('Mean absolute percentage error of the prediction is: {}'.format(test_mape))


Best alpha value is: 0.001
R-squared of the model on the training set is: 0.7907748533914915
-----Test set statistics-----
R-squared of the model on the test set is: 0.8021281378950682
Mean absolute error of the prediction is: 23483.06259145377
Mean squared error of the prediction is: 1328453127.7477336
Root mean squared error of the prediction is: 36447.95094031671
Mean absolute percentage error of the prediction is: 14.087792369910938


On the test set, the OLS model had the highest R-squared value (about 0.803) and the lowest mean absolute error (about 23,454), so it had the best performance, although all four models performed at a very similar level.