In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sqlalchemy import create_engine
from scipy.stats import bartlett
from scipy.stats import levene
from statsmodels.tsa.stattools import acf
import statsmodels.api as sm
import seaborn as sns
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
houseprices = pd.read_sql_query('select * from houseprices',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()


## Original OLS/Linear Regression Model

In [2]:
houseprices['garage_cars_area'] = houseprices['garagecars'] + houseprices['garagearea']
houseprices['grade_living_area'] = houseprices.totrmsabvgrd  * houseprices.grlivarea
houseprices = pd.concat([houseprices, pd.get_dummies(houseprices["kitchenqual"])], axis=1)
houseprices['sold_remodeled'] = houseprices.yearbuilt * houseprices.yearremodadd 

X = houseprices[['overallqual', 'totalbsmtsf', 'grade_living_area', 'garage_cars_area', 'TA', 'sold_remodeled']]
Y = houseprices[['saleprice']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)
print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(Y_test.shape[0]))

X_train_const = sm.add_constant(X_train)

# We fit an OLS model using statsmodels
results = sm.OLS(Y_train, X_train_const).fit()

# We print the summary results
print(results.summary())

The number of observations in training set is 1168
The number of observations in test set is 292
                            OLS Regression Results                            
Dep. Variable:              saleprice   R-squared:                       0.765
Model:                            OLS   Adj. R-squared:                  0.764
Method:                 Least Squares   F-statistic:                     630.2
Date:                Sun, 03 Nov 2019   Prob (F-statistic):               0.00
Time:                        11:21:45   Log-Likelihood:                -13980.
No. Observations:                1168   AIC:                         2.797e+04
Df Residuals:                    1161   BIC:                         2.801e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
---------------------------

In [None]:
print('With Cross Validation:')
cross_val_score(bnb, data, target, cv=10)

## Lasso Regression Model

It seems like a small lambda gives me the best R-squared value and lowest evaluation metrics for this model. My understanding is that this is because my OLS model was pretty decent. 

In [29]:
lassoregr = Lasso(alpha=.5) 
lassoregr.fit(X_train, Y_train)

# We are making predictions here
Y_preds_train = lassoregr.predict(X_train)
Y_preds_test = lassoregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(lassoregr.score(X_train, Y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(lassoregr.score(X_test, Y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(Y_test, Y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mean_squared_error(Y_test, Y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(Y_test['saleprice'], Y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((Y_test['saleprice'] - Y_preds_test) / Y_test['saleprice'])) * 100))



R-squared of the model on the training set is: 0.7650894698391388
-----Test set statistics-----
R-squared of the model on the test set is: 0.7747199235629062
Mean absolute error of the prediction is: 25281.678001853765
Mean squared error of the prediction is: 1512463768.1096895
Root mean squared error of the prediction is: 38890.40714764618
Mean absolute percentage error of the prediction is: 15.02318855855706


## Ridge Regression Model

Setting my lambda equal to zero gave me the highest R-squared values and lowest evaluation metrics, again indicating that my OLS model was a decent fit.

In [43]:
# Fitting a ridge regression model. Alpha is the regularization
# parameter (usually called lambda). As alpha gets larger, parameter
# shrinkage grows more pronounced.
ridgeregr = Ridge(alpha=0) 
ridgeregr.fit(X_train, Y_train)

# We are making predictions here
Y_preds_train = ridgeregr.predict(X_train)
Y_preds_test = ridgeregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(ridgeregr.score(X_train, Y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(ridgeregr.score(X_test, Y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(Y_test, Y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(Y_test, Y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(Y_test, Y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((Y_test - Y_preds_test) / Y_test)) * 100))


R-squared of the model on the training set is: 0.7650894701062251
-----Test set statistics-----
R-squared of the model on the test set is: 0.7747194916253897
Mean absolute error of the prediction is: 25281.842278245338
Mean squared error of the prediction is: [1.51246667e+09]
Root mean squared error of the prediction is: [38890.44443061]
Mean absolute percentage error of the prediction is: saleprice   15.023
dtype: float64


## ElasticNet Regression Model

Like the Ridge model, using 0 as my alpha gave me the best values.

In [52]:
elasticregr = ElasticNet(alpha=0, l1_ratio=0.5) 
elasticregr.fit(X_train, Y_train)

# We are making predictions here
Y_preds_train = elasticregr.predict(X_train)
Y_preds_test = elasticregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(elasticregr.score(X_train, Y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(elasticregr.score(X_test, Y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(Y_test, Y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mean_squared_error(Y_test, Y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(sqrt(mean_squared_error(Y_test, Y_preds_test))))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((Y_test['saleprice'] - Y_preds_test)/Y_test['saleprice']) * 100)))


R-squared of the model on the training set is: 0.765089470106225
-----Test set statistics-----
R-squared of the model on the test set is: 0.7747194916253897
Mean absolute error of the prediction is: 25281.842278245324
Mean squared error of the prediction is: 1512466668.010355
Root mean squared error of the prediction is: 38890.44443060988
Mean absolute percentage error of the prediction is: 15.023300986071774


## Conclusion

My OLS model is a good fit for the data. I understand that because two of the other models gave the best vaues when the lambda/alpha was zero, the OLS model was not overfitting the data. 

Is there another way to choose a lambda without guess and check? It seems like cross-validation is the way to go, but I'm not sure I understand how that would work. 