In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse
from sqlalchemy import create_engine

import warnings
warnings.filterwarnings('ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'


engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
house_prices_df = pd.read_sql_query('select * from houseprices',con=engine)

engine.dispose()

In [15]:
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.mszoning, prefix="mszoning", drop_first=True)], axis=1)
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.street, prefix="street", drop_first=True)], axis=1)
dummy_column_names = list(pd.get_dummies(house_prices_df.mszoning, prefix="mszoning", drop_first=True).columns)
dummy_column_names = dummy_column_names + list(pd.get_dummies(house_prices_df.street, prefix="street", drop_first=True).columns)

In [16]:
Y = np.log1p(house_prices_df['saleprice'])
house_prices_df['grlivarea_garagearea'] = house_prices_df['grlivarea'] * house_prices_df['garagearea']
house_prices_df['grlivarea_overallqual'] = house_prices_df['grlivarea'] * house_prices_df['overallqual']
X = house_prices_df[['overallqual', 'grlivarea', 'garagecars', 'garagearea', 'totalbsmtsf', 'grlivarea_garagearea', 
                     'grlivarea_overallqual'] + dummy_column_names]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

The number of observations in training set is 1168
The number of observations in test set is 292


## Linear Regression

In [18]:
lrm = LinearRegression()
lrm.fit(X_train, y_train)

y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

In [19]:
print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.834266040650961
-----Test set statistics-----
R-squared of the model in the test set is: 0.8357263431015027
Mean absolute error of the prediction is: 0.12230696621285152
Mean squared error of the prediction is: 0.02739191750306434
Root mean squared error of the prediction is: 0.1655050376969364
Mean absolute percentage error of the prediction is: 1.0221605937147933


## Ridge Regression

In [60]:
from sklearn.linear_model import Ridge

ridgeregr = Ridge(alpha=10**5) 
ridgeregr.fit(X_train, y_train)

y_preds_train_r = ridgeregr.predict(X_train)
y_preds_test_r = ridgeregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(ridgeregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(ridgeregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test_r)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test_r)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test_r)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model on the training set is: 0.7765637742885652
-----Test set statistics-----
R-squared of the model on the test set is: 0.8091113658335765
Mean absolute error of the prediction is: 0.12792160637780742
Mean squared error of the prediction is: 0.03182984915585169
Root mean squared error of the prediction is: 0.17840921824796974
Mean absolute percentage error of the prediction is: 2.6437648221337358


## Lasso Regression

In [49]:
from sklearn.linear_model import Lasso

lassoregr = Lasso(alpha=10**5) 
lassoregr.fit(X_train, y_train)

y_preds_train_l = lassoregr.predict(X_train)
y_preds_test_l = lassoregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(lassoregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(lassoregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test_l)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test_l)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test_l)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test_l) / y_test)) * 100))


R-squared of the model on the training set is: 0.3154709984082549
-----Test set statistics-----
R-squared of the model on the test set is: 0.31307538626151277
Mean absolute error of the prediction is: 0.2609188534792156
Mean squared error of the prediction is: 0.11454169040612078
Root mean squared error of the prediction is: 0.33844008392346314
Mean absolute percentage error of the prediction is: 2.1732777757997312


## ElasticNet Regression

In [59]:
from sklearn.linear_model import ElasticNet

elasticregr = ElasticNet(alpha=10**5, l1_ratio=0.5) 
elasticregr.fit(X_train, y_train)

y_preds_train_e = elasticregr.predict(X_train)
y_preds_test_e = elasticregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(elasticregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(elasticregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test_e)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test_e)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test_e)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test_e) / y_test)) * 100))


R-squared of the model on the training set is: 0.46018858468661783
-----Test set statistics-----
R-squared of the model on the test set is: 0.4843978863124329
Mean absolute error of the prediction is: 0.22285638292100243
Mean squared error of the prediction is: 0.08597440897819714
Root mean squared error of the prediction is: 0.2932139303958752
Mean absolute percentage error of the prediction is: 1.858895420650795


According to the results, the best model is the the first one (Linear Regression)