# LASSO
Rachael An

# Implement the following steps:

- Rescale predictor variables
- Fit LASSO L1 regularization models in python
- Demonstrate the effect of underfit models on the relationship between training and test scores
- Show what happens when $n$ is small relative to $p$
- Use `GridSearchCV()` to find the complexity parameter $\lambda$ that yields the best out-of-sample predictive error

# Import libraries and data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Import all additional libraries/functions here:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

In [2]:
# Load test training, test, and sample_submission CSV's as pandas data frames here
# Set the row indices to be the "ID" variable
training = pd.read_csv("data/train.csv", index_col = "Id")
training_50 = pd.read_csv("data/train_50.csv", index_col = "Id")
test = pd.read_csv("data/test.csv", index_col = "Id")
submission = pd.read_csv("data/sample_submission.csv", index_col = "Id")

# Variables and model considered

We'll only consider the following numerical variables: 
    
   

In [3]:
features = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
            'MoSold', 'YrSold']

# Subset training, training_50 and test data frames to just relevant variables:
training = training[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
            'MoSold', 'YrSold', 'SalePrice']]

training_50 = training_50[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
            'MoSold', 'YrSold', 'SalePrice']]

test = test[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
            'MoSold', 'YrSold']]

In [4]:
# Inspect info on all 3 data frames. Note number of missing values
print(training.info())
print(training_50.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   OverallQual    1460 non-null   int64  
 4   OverallCond    1460 non-null   int64  
 5   YearBuilt      1460 non-null   int64  
 6   YearRemodAdd   1460 non-null   int64  
 7   MasVnrArea     1452 non-null   float64
 8   BsmtFinSF1     1460 non-null   int64  
 9   BsmtFinSF2     1460 non-null   int64  
 10  BsmtUnfSF      1460 non-null   int64  
 11  TotalBsmtSF    1460 non-null   int64  
 12  1stFlrSF       1460 non-null   int64  
 13  2ndFlrSF       1460 non-null   int64  
 14  LowQualFinSF   1460 non-null   int64  
 15  GrLivArea      1460 non-null   int64  
 16  BsmtFullBath   1460 non-null   int64  
 17  BsmtHalfBath   1460 non-null   int64  
 18  FullBath

In [5]:
# Fill in missing values for predictor variables with mean value
datasets = [training, training_50, test]

for df in datasets:
    for column in df.columns:
        if df[column].isna().any():
            mean_value = df[column].mean()
            df[column].fillna(mean_value, inplace=True)

# check if data has been filled in 
training.info()
training_50.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   LotFrontage    1460 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   OverallQual    1460 non-null   int64  
 4   OverallCond    1460 non-null   int64  
 5   YearBuilt      1460 non-null   int64  
 6   YearRemodAdd   1460 non-null   int64  
 7   MasVnrArea     1460 non-null   float64
 8   BsmtFinSF1     1460 non-null   int64  
 9   BsmtFinSF2     1460 non-null   int64  
 10  BsmtUnfSF      1460 non-null   int64  
 11  TotalBsmtSF    1460 non-null   int64  
 12  1stFlrSF       1460 non-null   int64  
 13  2ndFlrSF       1460 non-null   int64  
 14  LowQualFinSF   1460 non-null   int64  
 15  GrLivArea      1460 non-null   int64  
 16  BsmtFullBath   1460 non-null   int64  
 17  BsmtHalfBath   1460 non-null   int64  
 18  FullBath

# Phase 1 MVP: Run LASSO on full training data

Using the full training data

In [6]:
# Create variables for model fitting
# Define y_train, X_train, X_test
y_train = training["SalePrice"].values
X_train = training[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
            'MoSold', 'YrSold']].values
X_test = test[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
            'MoSold', 'YrSold']].values

# Instantiate rescaler
scaler = StandardScaler()

# Rescale and recenter training data using the above means and sd's. 
X_train_scaled = scaler.fit_transform(X_train)

# Rescale and recenter test data using the **same** means and sd's of training data.
X_test_scaled = scaler.transform(X_test)

print(np.mean(X_train_scaled, axis = 0))
print(np.std(X_train_scaled, axis = 0))

[-8.45594523e-17  4.07588727e-16 -5.84007728e-17  1.38701835e-16
  3.54054685e-16  1.04634718e-15  4.49685951e-15 -3.40671175e-17
 -2.43336553e-17 -3.40671175e-17 -6.60050401e-17  2.45769919e-16
  6.50925280e-17 -1.82502415e-17  1.21668277e-17 -1.27751691e-16
  2.31169726e-17  2.43336553e-17  1.18018228e-16  2.08356924e-17
  2.14136167e-16  4.50172624e-16 -1.02201352e-16 -4.86673107e-18
  3.93901046e-15  1.21668277e-16 -1.21668277e-17  5.59674073e-17
  3.04170692e-17 -2.31169726e-17  4.86673107e-18  5.47507245e-17
  1.94669243e-17 -2.67670209e-17  7.54343315e-17  3.56743554e-14]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [53]:
print(X_train_scaled)

[[ 0.07337496 -0.22937175 -0.20714171 ... -0.08768781 -1.5991111
   0.13877749]
 [-0.87256276  0.4519361  -0.09188637 ... -0.08768781 -0.48911005
  -0.61443862]
 [ 0.07337496 -0.09311018  0.07347998 ... -0.08768781  0.99089135
   0.13877749]
 ...
 [ 0.30985939 -0.18395123 -0.14781027 ...  4.95311151 -0.48911005
   1.64520971]
 [-0.87256276 -0.09311018 -0.08016039 ... -0.08768781 -0.8591104
   1.64520971]
 [-0.87256276  0.22483348 -0.05811155 ... -0.08768781 -0.1191097
   0.13877749]]


## 1.a) Run LASSO for a large value of $\lambda$
i.e. the fitted slope coefficients are _very_ much penalized for complexity, thus should equal 0,
thus you end up with the simplest model: the mean model

Hints:
- Check out the functions below:
    - log1p(x) = log(x + 1) and 
    - it's inverse expm1(x) = exp(x) - 1
- Transform the outcome variable $y$ to (log + 1)-space and vice-versa to avoid issue of negative fitted values 

In [7]:
# Hint: log1p(x) = log(x + 1) and it's inverse expm1(x) = exp(x) - 1
training['log_SalePrice'] = np.log1p(training['SalePrice'])
#print(example)
#print(np.expm1(example))

training

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,log_SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,61,0,0,0,0,0,2,2008,208500,12.247699
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,0,0,0,5,2007,181500,12.109016
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,42,0,0,0,0,0,9,2008,223500,12.317171
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,35,272,0,0,0,0,2,2006,140000,11.849405
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,84,0,0,0,0,0,12,2008,250000,12.429220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,40,0,0,0,0,0,8,2007,175000,12.072547
1457,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,0,0,0,0,0,0,2,2010,210000,12.254868
1458,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,60,0,0,0,0,2500,5,2010,266500,12.493133
1459,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,0,112,0,0,0,0,4,2010,142125,11.864469


In [9]:
# Instantiate and fit large lambda LASSO model
model_LASSO = Lasso(alpha = 10000000)
model_LASSO.fit(X_train_scaled, training['log_SalePrice'])

# Print model coefficients
print(model_LASSO.coef_)

training['log_SalePrice_hat'] = model_LASSO.predict(X_train_scaled)

training

# Print RMSLE score on training data
rmsle = np.sqrt((sum((training['log_SalePrice_hat'] - training['log_SalePrice'])**2))/(len(training)))
print(rmsle)

# Write to CSV and make note of your Kaggle score
# (no need to take Kaggle screenshot, graders will look at your output .csv)
test['log_SalePrice_hat'] = model_LASSO.predict(X_test_scaled)
test['SalePrice_hat'] = np.expm1(test['log_SalePrice_hat'])
test

submission = pd.DataFrame(test.index,
                          test['SalePrice_hat'])

submission.set_index(test.index, inplace = True)

submission['SalePrice'] = test['SalePrice_hat']

del submission['Id']

submission

submission.to_csv("data/submission_LASSO_training_full_large_lambda.csv")

# Kaggle score: 0.41637

[-0.  0.  0.  0. -0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -0.  0.  0. -0.
  0.  0.  0. -0.  0.  0.  0.  0.  0.  0.  0. -0.  0.  0.  0. -0.  0. -0.]
0.39931245219387457


## 1.b) Run LASSO for a small value of $\lambda$
i.e. the fitted slope coefficients are _very_ much _not_ penalized for complexity, thus should equal to coefficients from multiple regression, thus you end up with the most complex model: the same as multiple regression

In [10]:
# Instantiate and fit small lambda LASSO model
model_LASSO_simple = Lasso(alpha = 0)
model_LASSO_simple.fit(X_train_scaled, training['log_SalePrice'])

# Print model coefficients
print(model_LASSO_simple.coef_)

training['log_SalePrice_complex_hat'] = model_LASSO_simple.predict(X_train_scaled)

training

# Print RMSLE score on training data
rmsle = np.sqrt((sum((training['log_SalePrice_complex_hat'] - training['log_SalePrice'])**2))/(len(training)))
print(rmsle)

# Write to CSV and make note of your Kaggle score
# (no need to take Kaggle screenshot, graders will look at your output .csv)
test['log_SalePrice_simple_hat'] = model_LASSO_simple.predict(X_test_scaled)
test['SalePrice_simple_hat'] = np.expm1(test['log_SalePrice_simple_hat'])
test

submission = pd.DataFrame(test.index,
                          test['SalePrice_simple_hat'])

submission.set_index(test.index, inplace = True)

submission['SalePrice'] = test['SalePrice_simple_hat']

del submission['Id']

submission

submission.to_csv("data/submission_LASSO_training_full_small_lambda.csv")

# Kaggle score: 0.14779

[-0.02725046 -0.00316201  0.01869051  0.11677195  0.05403081  0.09046824
  0.02371397  0.000215    0.02609385  0.00733644  0.01483703  0.00916351
  0.06850501  0.06632851  0.00778879  0.00956018  0.03292193  0.00459448
  0.02224568  0.01108469 -0.00180134 -0.01106886  0.02537526  0.02907506
 -0.00598026  0.04952651  0.00687626  0.01556996 -0.00223711  0.01029204
  0.00649832  0.0202051  -0.01516488 -0.0017476   0.00096757 -0.00934482]
0.14489915330361017


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


# Phase 2: Run LASSO on n=50 training data
Using the n=50 row training data

In [11]:
# Create variables for model fitting
# Define y_train, X_train, X_test
y_train = training_50["SalePrice"].values
X_train = training_50[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
            'MoSold', 'YrSold']].values
X_test = test[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
            'MoSold', 'YrSold']].values

# Instantiate rescaler
scaler = StandardScaler()

# Rescale and recenter training data using the above means and sd's. 
X_train_scaled = scaler.fit_transform(X_train)

# Rescale and recenter test data using the **same** means and sd's of training data.
X_test_scaled = scaler.transform(X_test)

print(np.mean(X_train_scaled, axis = 0))
print(np.std(X_train_scaled, axis = 0))

[ 5.55111512e-17  8.83737528e-16 -2.45359288e-16  2.10942375e-16
 -2.81996648e-16 -6.06181771e-16  2.47357690e-15  3.33066907e-17
 -1.77635684e-17 -4.44089210e-18  3.33066907e-17 -2.22044605e-16
  1.53210777e-16 -1.99840144e-17  4.02455846e-17 -1.35447209e-16
  2.88657986e-17 -1.27675648e-17  1.68753900e-16 -1.17683641e-16
 -1.77635684e-16 -1.59317004e-16 -1.56541446e-16  2.22044605e-17
  4.44089210e-18 -1.57374114e-16 -1.39888101e-16  4.88498131e-17
  2.77555756e-17 -9.99200722e-18  6.38378239e-18  7.10542736e-17
 -4.71844785e-18 -2.49800181e-17  1.37667655e-16 -4.02378131e-14]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


## 2.a) Run LASSO for a large value of $\lambda$
i.e. the fitted slope coefficients are _very_ much penalized for complexity, thus should equal 0,
thus you end up with the simplest model: the mean model

In [12]:
training_50['log_SalePrice'] = np.log1p(training_50['SalePrice'])

# Instantiate and fit large lambda LASSO model
model_LASSO = Lasso(alpha = 10000000)
model_LASSO.fit(X_train_scaled, training_50['log_SalePrice'])

# Print model coefficients
print(model_LASSO.coef_)

training_50['log_SalePrice_hat'] = model_LASSO.predict(X_train_scaled)

training_50

# Print RMSLE score on training data
rmsle = np.sqrt((sum((training_50['log_SalePrice_hat'] - training_50['log_SalePrice'])**2))/(len(training_50)))
print(rmsle)

# Write to CSV and make note of your Kaggle score
# (no need to take Kaggle screenshot, graders will look at your output .csv)

test['log_SalePrice_small_hat'] = model_LASSO.predict(X_test_scaled)
test['SalePrice_small_hat'] = np.expm1(test['log_SalePrice_small_hat'])
test

submission = pd.DataFrame(test.index,
                          test['SalePrice_small_hat'])

submission.set_index(test.index, inplace = True)

submission['SalePrice'] = test['SalePrice_small_hat']

del submission['Id']

submission

submission.to_csv("data/submission_LASSO_training_50_large_lambda.csv")

# Kaggle score: 0.42575

[-0.  0.  0.  0. -0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -0.  0.  0. -0.
  0.  0.  0. -0.  0.  0.  0.  0.  0.  0.  0. -0.  0.  0.  0. -0.  0. -0.]
0.45926799259835493


## 1.b) Run LASSO for a small value of $\lambda$
i.e. the fitted slope coefficients are _very_ much _not_ penalized for complexity, 
thus should equal to coefficients from multiple regression, thus you end up with the most complex model: 
the same as multiple regression

In [59]:
# Instantiate and fit large lambda LASSO model
model_LASSO_simple = Lasso(alpha = 0)
model_LASSO_simple.fit(X_train_scaled, training_50['log_SalePrice'])

# Print model coefficients
print(model_LASSO_simple.coef_)

training_50['log_SalePrice_small_complex_hat'] = model_LASSO_simple.predict(X_train_scaled)

training_50

# Print RMSLE score on training data
rmsle = np.sqrt((sum((training_50['log_SalePrice_small_complex_hat'] - training_50['log_SalePrice'])**2))/(len(training_50)))
print(rmsle)

# Write to CSV and make note of your Kaggle score
# (no need to take Kaggle screenshot, graders will look at your output .csv)

test['log_SalePrice_small_complex_hat'] = model_LASSO_simple.predict(X_test_scaled)
test['SalePrice_small_complex_hat'] = np.expm1(test['log_SalePrice_small_complex_hat'])
test

submission = pd.DataFrame(test.index,
                          test['SalePrice_small_complex_hat'])

submission.set_index(test.index, inplace = True)

submission['SalePrice'] = test['SalePrice_small_complex_hat']

del submission['Id']

submission

submission.to_csv("data/submission_LASSO_training_50_small_lambda.csv")

# Kaggle score: 0.17480

[-0.00921919  0.0525041  -0.00225779  0.14477921  0.05057082  0.08643744
  0.02042349 -0.02271167  0.09229736  0.01192975  0.06017714  0.03760304
  0.09598726  0.13824481 -0.02188611  0.01377728 -0.0007833   0.00632475
  0.04633903 -0.00065763  0.01814597  0.0039986  -0.0224194  -0.01167026
  0.00379261  0.01635566  0.01076636  0.01922473 -0.01998843  0.01928563
  0.00938999  0.00923785 -0.04873755  0.00635661 -0.00700959 -0.01332101]
0.09227842555297872


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


# Comparing Scores

Using full training data

| RMSLE                | $\lambda=$ 10000000 | $\lambda=$ 0        |
|----------------------|---------------------|---------------------|
| Score on training    | 0.39931             | 0.14490             |
| Score on test Kaggle | 0.41637             | 0.14779             |


Using n=50 training data

| RMSLE                | $\lambda=$ 10000000 | $\lambda=$ 0        |
|----------------------|---------------------|---------------------|
| Score on training    | 0.45927             | 0.09228             |
| Score on test Kaggle | 0.42575             | 0.17480             |

Explore the following questions:

1. What is the lambdas used
1. Which value of $\lambda$ addresses potentially underfit models? The large or small one? The large one
1. Which value of $\lambda$ addresses potentially overfit models? The large or small one? The small one
1. Argue using the values above why when $n$ is really small relative to $p$, you are very vulnerable to overfitting your models. 

When the number of observations (n) is small relative to the number of predictors (p) in the LASSO model, it is more vulnerable to overfitting. The reasons are as follows:

There's less information available to estimate the coefficients accurately. Therefore, the estimates of coefficients can be very unreliable.

Many predictors may have little or no association with the response variable. LASSO tends to reduce the coefficients of these predictors towards zero. However, the algorithm may not accurately identify which predictors should have their coefficients reduced to zero, leading to potential overfitting.

Taken together, the minimization of coefficients may not be reliable and accurate when n is small relative to p in the LASSO model.

# Phase 3: Use `GridSearchCV()`
- Find optimal $\lambda$ for n=50 training set that returns the lowest estimated RMSLE using `GridSearchCV()`
- Make a Kaggle submission using this $\lambda$ value and report score

In [22]:
# Instantiate Lasso model with random_state = 76, tol=0.001
model_LASSO_cv = Lasso(random_state = 76, tol=0.001)

# Instantiate the GridSearchCV object and run the search on a grid of 
# alpha values from 10^{-4} = 0.0001 to 10^{-1} = 0.1
parameters = {'alpha':pow(10, np.linspace(-4, -1, num=100))}

searcher = GridSearchCV(model_LASSO_cv, parameters, scoring = 'neg_mean_squared_error')
searcher.fit(X_train_scaled, np.log1p(y_train))

print(searcher.best_params_)

{'alpha': 0.007054802310718645}


In [23]:
model_LASSO_optimal_lambda = Lasso(alpha = 0.0070548)

model_LASSO_optimal_lambda.fit(X_train_scaled, training_50['log_SalePrice'])

# Print model coefficients
print(model_LASSO_optimal_lambda.coef_)

test['log_SalePrice_optimal_lambda'] = model_LASSO_optimal_lambda.predict(X_test_scaled)

test['SalePrice_optimal_lambda'] = np.expm1(test['log_SalePrice_optimal_lambda'])

submission = pd.DataFrame(test.index,
                          test['SalePrice_optimal_lambda'])

submission.set_index(test.index, inplace = True)

submission['SalePrice'] = test['SalePrice_optimal_lambda']

del submission['Id']

submission

submission.to_csv("data/submission_LASSO_optimal_lambda.csv")

# Kaggle score: 0.16140

[-0.00000000e+00  3.51650408e-02  1.00979432e-02  1.24121928e-01
  3.15721488e-02  6.13390316e-02  3.37638662e-02 -0.00000000e+00
  3.47488085e-02 -0.00000000e+00 -0.00000000e+00  7.20642492e-02
  0.00000000e+00  0.00000000e+00 -1.73836961e-02  1.24669905e-01
  0.00000000e+00  0.00000000e+00  5.93744911e-02  7.57175305e-03
  3.61546778e-03 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  2.91001627e-02  0.00000000e+00  7.56064583e-03
 -0.00000000e+00  0.00000000e+00  9.20964832e-05  3.65221009e-03
 -1.19858719e-02  0.00000000e+00 -0.00000000e+00 -1.06373298e-03]
