In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse,rmse
from sklearn.linear_model import LinearRegression,LassoCV, RidgeCV,ElasticNetCV

import warnings
warnings.filterwarnings('ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

In [2]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
house_prices_df = pd.read_sql_query('select * from houseprices',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

In [5]:
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.mszoning,prefix = "mszoning",drop_first = True)],axis =1)
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.street,prefix ="street",drop_first = True)], axis =1)
dummy_column_names = list(pd.get_dummies(house_prices_df.mszoning,prefix = "mszoning", drop_first = True).columns)
dummy_column_names = dummy_column_names + list(pd.get_dummies(house_prices_df.street,prefix="street",drop_first = True).columns)


In [6]:
house_prices_df['totalsf'] = house_prices_df['totalbsmtsf'] + house_prices_df['firstflrsf']+ house_prices_df['secondflrsf']
house_prices_df['int_over_sf']= house_prices_df['totalsf']*house_prices_df['overallqual']
# Y is the target variable
Y = np.log1p(house_prices_df['saleprice'])
# X is the feature
X = house_prices_df[['overallqual','grlivarea','garagecars','garagearea','totalsf','int_over_sf'] + dummy_column_names]
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 465)
alphas = [np.power(10.0,p) for p in np.arange(-10,40,1)]

In [7]:
lrm = LinearRegression()
lrm.fit(X_train,y_train)
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)
print("R-squared of the model in training set is: {}".format(lrm.score(X_train,y_train)))
print("-----Test set statistics----")
print("R-squared of the model in test set is: {}".format(lrm.score(X_test,y_test)))
print("Mean absolute error of the prediction is : {}".format(mean_absolute_error(y_test,y_preds_test)))
print("Mean squared error of the prediction is : {}".format(mse(y_test,y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test,y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test-y_preds_test)/y_test))*100))

R-squared of the model in training set is: 0.8321322553132751
-----Test set statistics----
R-squared of the model in test set is: 0.8249302330916406
Mean absolute error of the prediction is : 0.12570372872861713
Mean squared error of the prediction is : 0.029192121871357397
Root mean squared error of the prediction is: 0.17085702172096234
Mean absolute percentage error of the prediction is: 1.0503577667823747


In [8]:
lasso_cv = LassoCV(alphas= alphas, cv =5)
lasso_cv.fit(X_train,y_train)
# We are making predictions
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)
print("Best alpha value is: {}".format(lasso_cv.alpha_))
print("R-squared of the model is : {}".format(lasso_cv.score(X_train,y_train)))
print("----Test set statistics----")
print("R-squared of the model in test set is : {}".format(lasso_cv.score(X_test,y_test)))
print(" Mean absolute error of the prediction is: {}".format(mse(y_test,y_preds_test)))
print("Root mean  squared error of the prediction is: {}".format(rmse(y_test,y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test-y_preds_test)/y_test))*100))

Best alpha value is: 0.0001
R-squared of the model is : 0.8319394287042421
----Test set statistics----
R-squared of the model in test set is : 0.8226434437869414
 Mean absolute error of the prediction is: 0.02957343403767702
Root mean  squared error of the prediction is: 0.1719692822502816
Mean absolute percentage error of the prediction is: 1.0552354946577736


In [9]:
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train,y_train)
# We are making predictions
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)
print("Best alpha value is : {}".format(ridge_cv.alpha_))
print("R-squared of the model in training set is : {}".format(ridge_cv.score(X_train,y_train)))
print("----Test set statistics----")
print("R-squared of the model in test set is: {}".format(ridge_cv.score(X_test,y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test,y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test,y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test,y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test-y_preds_test)/y_test))*100))

Best alpha value is : 1.0
R-squared of the model in training set is : 0.8319491710033406
----Test set statistics----
R-squared of the model in test set is: 0.8222603911701664
Mean absolute error of the prediction is: 0.12630017373110553
Mean squared error of the prediction is: 0.02963730639479218
Root mean squared error of the prediction is: 0.17215489070831586
Mean absolute percentage error of the prediction is: 1.05577315303204


In [11]:
elasticnet_cv = ElasticNetCV(alphas = alphas, cv =5)
elasticnet_cv.fit(X_train,y_train)
# We are making predictions here
y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)
print("Best alpha value is: {}".format(elasticnet_cv.alpha_))
print("R-sqaured of the model in training set is: {}".format(elasticnet_cv.score(X_train,y_train)))
print("---Test set statistics---")
print("R-squared of the model in test set is: {}".format(elasticnet_cv.score(X_test,y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test,y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test,y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test,y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test-y_preds_test)/y_test))*100))

Best alpha value is: 0.001
R-sqaured of the model in training set is: 0.8299843719493093
---Test set statistics---
R-squared of the model in test set is: 0.8149048869657498
Mean absolute error of the prediction is: 0.12772310120124825
Mean squared error of the prediction is: 0.030863804715733027
Root mean squared error of the prediction is: 0.17568097425655696
Mean absolute percentage error of the prediction is: 1.0686732547082527


In [12]:
# According to the results, the best model is the OLS regression.