In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [8]:
one_hot_df = pd.read_csv('./Data/one_hot_df.csv')
label_df = pd.read_csv('./Data/label_df.csv')
y = one_hot_df['SalePrice']
X = one_hot_df.drop(['SalePrice'], axis =1)

## Linear Regression

In [52]:
lr_model = LinearRegression()
metrics = cross_validate(lr_model, X,y,cv = 5, scoring = ('neg_root_mean_squared_error'))
lr_cv = -metrics['test_score'].mean()
print('CV PREDICTION ERROR FOR MULTIVARIATE LR')
print(lr_cv)

CV PREDICTION ERROR FOR MULTIVARIATE LR
0.15076176717943116


This linear regression model uses all the fatures, and does not perform any subset selesction stepwise selection methodology. When we use k fold cross validation, we get a cross validation prediction error of 0.1508

## Ridge Regression

In [53]:
param_list = []
start = 10
for i in range(30):
    param_list.append(start)
    start = start + 0.25

parameters = {'alpha': param_list} 
rr = Ridge()
metrics = cross_validate(rr, X, y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('CV PREDICTION ERROR FOR UNTUNED RR')
print(-metrics['test_score'].mean())

CV PREDICTION ERROR FOR UNTUNED RR
0.14128730494282382


Tuning the ridge regression model

In [102]:
tuned_rr = GridSearchCV(rr, parameters, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_rr.fit(X,y)
print('CV PREDICTION ERROR FOR TUNED RR')
print(-tuned_rr.best_score_)
print(tuned_rr.best_params_)

CV PREDICTION ERROR FOR TUNED RR
0.1363310366409711
{'alpha': 14.75}


We find the value of alpha that gives us the lowest cross validation prediction error. The alpha 

## Lasso Regression

In [57]:
param_list = []
start = 0.0001
for i in range(30):
    param_list.append(start)
    start = start + 0.0001
parameters2 = {'alpha': param_list}
lasso = Lasso()
lasso.fit(X,y)
metrics = cross_validate(lasso, X,y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('SCORE FOR UNTUNED LASSO')
print(-metrics['test_score'].mean())

SCORE FOR UNTUNED LASSO
0.3994558059361331


Tuning the lasso regression model

In [60]:
tuned_lasso = GridSearchCV(lasso, parameters2, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_lasso.fit(X,y)
best_param = tuned_lasso.best_params_['alpha']
lasso_tuned = Lasso(alpha = best_param)
lasso_tuned.fit(X,y)
metrics = cross_validate(lasso_tuned, X,y, cv = 5, scoring = ('neg_root_mean_squared_error'))


Analysis of tuned lasso regression model

In [104]:
print('BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:')
print(best_param)
print(-tuned_lasso.best_score_)
print()
model_coefs = list(lasso_tuned.coef_)

good_features = []
for i in range(len(model_coefs)):
    if model_coefs[i]> 0:
        good_features.append(list(X.columns)[i])

print('\n'+'FEATURES WITH NON ZERO COEFFICIENTS FOR LASSO')
print(good_features)

BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:
0.0005
0.13375088484596281


FEATURES WITH NON ZERO COEFFICIENTS FOR LASSO
['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', 'MSSubClass_20', 'MSSubClass_50', 'MSZoning_RL', 'LotShape_IR2', 'LotConfig_Corner', 'LotConfig_CulDSac', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_Crawfor', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Condition1_Norm', 'Condition1_RRAn', 'BldgType_1Fam', 'Exterior1st_BrkFace', 'Exterior1st_MetalSd', 'Exterior1st_VinylSd', 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng', 'ExterQual_Gd', 'ExterCond_TA', 'Foundation_PConc', 'BsmtQual_Ex', 'BsmtCond_TA', 'BsmtExposure_Gd', 'BsmtFinType1_ALQ', 'BsmtFinType1_GLQ

We perform feature selection based on the coefficients of our lasso regression model, narrowing our features down from 273 to 66

The value of alpha that gives us the lowest CV prediction error is 0.0004. This alpha value is very low: when alpha is low it means that the result (model coefficients) become similar to that of the linear regression model.

## Random Forest

## Other concerns and questions

1. Running stepwise regresssion techniques in sklearn
2. 