In [1]:
# Imports libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno
import statsmodels.api as sm
from statsmodels.formula.api import ols

#sklearn imports
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures 

import patsy
%matplotlib inline
# y, X = patsy.dmatrices(formula, data=diamonds, return_type='dataframe')

In [2]:
# imports the training dataset for lasso modeling
df = pd.read_csv('./data/train_clean.csv')
df

Unnamed: 0,Id,PID,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Veenker,YearBuilt_squ,Hip,Other_roof,BsmtFinType1*BsmtFinSF1,BsmtFinType2*BsmtFinSF2,ln_1stFlrSF,HeatingQC*CentralAir,TotalBsmtSF*BsmtUnfSF,BedroomAbvGr*FullBath
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,3904576,0,0,3198.0,0.0,6.586172,5,139200.0,6
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,3984016,0,0,3822.0,0.0,6.816736,5,251988.0,8
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,3814209,0,0,4386.0,0.0,6.963190,3,344582.0,3
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,4024036,0,0,0.0,0.0,6.612041,4,147456.0,6
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,3610000,0,0,0.0,0.0,6.722630,3,456976.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,4028049,0,0,6066.0,0.0,7.454720,5,1644732.0,6
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,3763600,0,0,1048.0,0.0,6.758095,5,515739.0,1
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,3717184,0,0,0.0,0.0,7.066467,4,802816.0,3
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,3825936,0,0,465.0,1500.0,7.090077,3,354000.0,3


In [3]:
# Instantiates the X and y variables to be inputted into lasso regression 
X = df[['ln_LotArea',
 'ln_GrLivArea',
 'GarageArea',
 'OverallQual',
 'TotalBsmtSF',
 'BrDale',
 'BrkSide',
 'ClearCr',
 'CollgCr',
 'Crawfor',
 'Edwards',
 'Gilbert',
 'IDOTRR',
 'MeadowV',
 'Mitchel',
 'NAmes',
 'NPkVill',
 'NWAmes',
 'NoRidge',
 'NridgHt',
 'OldTown',
 'Other_nhood',
 'SWISU',
 'Sawyer',
 'SawyerW',
 'Somerst',
 'StoneBr',
 'Timber',
 'Veenker',
 'YearBuilt_squ',
 'YearRemod-Built',
 'ExterQual',
 'BsmtQual',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType1*BsmtFinSF1',
 '1stFlrSF',
 'HeatingQC',
 'Hip',
 'Other_roof']]
y = df['SalePrice']

In [4]:
# Instantiates the train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# Instantiates a standard scaler and fits it to the X_train and transforms the X_train and X_test
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [6]:
# Creates the alphas to be used in cross validation, instantiates the lass cross validation regression, fits it to the scaled train data, and
# prints the r2 score
l_alphas = np.logspace(-3, 0, 100)

lasso_cv = LassoCV(alphas= l_alphas, cv=5, max_iter=5_000)
lasso_cv.fit(X_train_sc, y_train)

lasso_cv.coef_
lasso_cv.score(X_train_sc, y_train)

0.8599588091657129

In [7]:
lasso_cv.score(X_test_sc, y_test)

0.829605219331235

In [8]:
# prints the lasso cross validation model's coefficients
lasso_cv.coef_

array([15113.00898457, 18150.83363594,  4859.4637994 , 19344.54461003,
       -1457.05827636,  2635.99866067,  1519.87732637,   671.25659741,
       -2751.80667619,  3504.64754534, -2392.85572699, -2715.41950545,
         218.25827872,  2498.22838902,  -790.03826208,  -400.87326147,
        1727.50806755, -1712.78671823,  5602.91067674,  8462.15648309,
         137.67001269,  1823.40663814,   -30.94256573,  -734.64489809,
       -3338.90569907,  2064.70361084,  8235.88341652,  -447.43811841,
         594.75963981, 11100.36299495,  4591.06936884, 10514.98902684,
        4809.56103517,  1125.33882028, -2007.2996643 , 11054.91639199,
        5181.98681506,  2288.80441249,  4298.82141856, -1024.09212034])

In [10]:
# defines a function to output the linear evaluation metrics
def lin_metrics(model_fit, X, y):
    """Prints the model's r2_score, adj_r2, rmse, mse, and 
    cross val r2 score given the fitted model, a matrix X, and a vector y"""
    
    y_preds = model_fit.predict(X)
    r2 = r2_score(y, y_preds)
    cross_val = cross_val_score(model_fit, X, y, cv=10).mean()
    rmse = np.sqrt(mean_squared_error(y, y_preds))
    mse = mean_squared_error(y, y_preds)
    adj_r2 = 1 - (1-r2)*(len(y)-1)/(len(y)-X.shape[1]-1)
    
    print(f'r2_score = {r2}')
    print('=====================')
    print(f'cross_val_score = {cross_val}')
    print('=====================')
    print(f'RMSE = {rmse}')
    print('=====================')
    print(f'MSE = {mse}')
    print('=====================')
    print(f'adj_r2 = {adj_r2}')

In [11]:
# prints out the linear evaluation metrics for the test data set
lin_metrics(lasso_cv, X_test_sc, y_test)

r2_score = 0.829605219331235
cross_val_score = 0.8096756398463667
RMSE = 29896.613653488734
MSE = 893807507.9459689
adj_r2 = 0.8151649836813397


In [12]:
# Creates the coefficient dictionary
coef_dict = {'feature':X.columns, 'coefficient':lasso_cv.coef_}
coef_df = pd.DataFrame(coef_dict)

In [13]:
coef_df

Unnamed: 0,feature,coefficient
0,ln_LotArea,15113.008985
1,ln_GrLivArea,18150.833636
2,GarageArea,4859.463799
3,OverallQual,19344.54461
4,TotalBsmtSF,-1457.058276
5,BrDale,2635.998661
6,BrkSide,1519.877326
7,ClearCr,671.256597
8,CollgCr,-2751.806676
9,Crawfor,3504.647545
