## <font color='green'> Application of the LASSO to Boston Data

In [None]:
import os

# Directory holding the course data files. The original machine-specific
# location stays the default, so behaviour is unchanged on that machine; set the
# ECON_DATA_DIR environment variable to run the notebook elsewhere without
# editing this cell.
DATA_DIR = os.environ.get(
    'ECON_DATA_DIR',
    '/Users/hj020/Desktop/2022/EconomicAnalytics-master/Python_/Data',
)
# The working directory is switched (instead of only prefixing the CSV path)
# because the notebook also saves figures using relative file names.
os.chdir(DATA_DIR)

import numpy as np
import pandas as pd
import math

# Print arrays with 3 decimals and without scientific notation
np.set_printoptions(precision=3, suppress=True)

# Load the Boston data from the working directory
raw0 = pd.read_csv('Boston.csv')

In [None]:
# Preview the first rows of the loaded data
raw0.head()

In [None]:
# Store the regressor names for labeling later.
# The first column is skipped and the last column is the response, so the
# regressor names are the column labels in between.
varname = list(raw0.columns[1:-1])

# Define Y and X (including all the regressors).
# The numeric array gets its own name instead of overwriting `raw0`: the
# DataFrame stays available, and this cell (as well as any cell that displays
# `raw0`) can be re-run without an AttributeError.
boston_values = raw0.iloc[:, 1:].values
Y = boston_values[:, -1]    # response: last column
X = boston_values[:, :-1]   # regressors: every remaining column

In [None]:
# Show the stored regressor names
varname

### <font color='green'> 1) Lasso Estimation with a Preselected Tuning Parameter Value ($\lambda$)

Parameters in linear_model.Lasso: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

In [None]:
from sklearn import linear_model

# Lasso with a preselected penalty; scikit-learn calls the tuning parameter `alpha`
las = linear_model.Lasso(alpha=0.5)
las = las.fit(X, Y)

In [None]:
# Coefficient estimates of the Lasso fitted above
print(las.coef_)

### <font color='green'> 2) Computation of a Lasso Solution Path
Parameters and returns in lasso_path: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.lasso_path.html

In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import lasso_path
from itertools import cycle

In [None]:
# Use "eps" to specify the length and density of the grid (eps = alpha_min / alpha_max)
# lasso_path returns three values here; only the alpha grid and the coefficient
# paths are kept, the third one is discarded
eps = 1e-10  
alphas_lasso, coefs_lasso, _ = lasso_path(X, Y, eps = eps)

In [None]:
# Grid of alpha values returned by lasso_path
print(alphas_lasso) 

In [None]:
# Coefficient estimates returned by lasso_path over the alpha grid
print(coefs_lasso)

In [None]:
# Each row of "coefs_lasso" contains the series of estimates for one coefficient over the grid
# Each column contains the coefficient estimates at one lambda value
# Index 2 selects the third row, i.e. the path of the third coefficient
print(coefs_lasso[2]) 

In [None]:
# Display the solution path
plt.figure(figsize=(8, 7), dpi=80)

# Take the log of the alpha values to adjust the scale of X-axis
log_alphas_lasso = np.log10(alphas_lasso)

# If there are more regressors than base colors, the colors wrap around; the
# line style changes on every wrap so that no two legend entries look alike.
colors = ['b', 'r', 'g', 'c', 'm', 'y', 'k']
line_styles = ['-', '--', ':', '-.']

# Plot one path per regressor on the same figure
for idx, (coef_l, vn) in enumerate(zip(coefs_lasso, varname)):
    wraps, color_idx = divmod(idx, len(colors))
    plt.plot(log_alphas_lasso, coef_l, c=colors[color_idx],
             linestyle=line_styles[wraps % len(line_styles)], label=vn)

plt.xlabel('Log(alpha)')
plt.ylabel('Coefficient Estimates')
plt.title('Lasso Solution Path')
plt.legend()
# Save before show(): the figure is written to disk first and displayed afterwards
plt.savefig('lassopath.png')
plt.show()


### <font color='green'> 3) Selection of a Tuning Parameter Value (= Selection of a Model) in the LASSO using CV/BIC/AIC

In [None]:
from sklearn.linear_model import LassoLarsCV, LassoLarsIC
import time

#### <font color='green'> i) Cross Validation

In [None]:
# Time the cross-validated fit. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed time, whereas time.time() is the wall
# clock and can jump if the system time is adjusted.
t1 = time.perf_counter()
lascv = LassoLarsCV(cv=5).fit(X, Y)  # 5-fold cross validation
t_lasso_lars_cv = time.perf_counter() - t1  # running time in seconds

In [None]:
# Display the results
# The smallest value in lascv.cv_alphas_ is 0, so a small offset is added before
# taking logs to avoid log(0).
# NOTE: this rebinds `eps`, which the lasso_path cell used for a different
# quantity (alpha_min / alpha_max); the name is kept because code further down
# may read it.
eps = 5e-10
lascv_log_alphas = np.log10(lascv.cv_alphas_ + eps)
# Caution: lascv.alpha_ is the single alpha selected by CV, whereas
# lascv.cv_alphas_ is the set of alphas whose MSE curve is plotted below.
# The selected alpha gets the same offset as the curve: without it, a selected
# alpha of 0 would give log10(0) = -inf and the vertical marker would not be drawn.
lascv_log_alpha = np.log10(lascv.alpha_ + eps)

plt.figure(figsize=(8, 7), dpi=80)
# Average the MSE path over axis 1 to get one value per alpha
plt.plot(lascv_log_alphas, lascv.mse_path_.mean(axis=1), 'k',
         label='Average of the MSEs over the Folds', linewidth=2)
plt.axvline(lascv_log_alpha, linestyle='--', color='k',
            label='alpha selected by CV')
plt.legend()

plt.xlabel('Log(alpha)')
plt.ylabel('Mean Square Error')
plt.title('Model Selection by Cross Validation (train time: %.2fs)'
          % t_lasso_lars_cv)
plt.show()

In [None]:
# Access the Lasso estimates at the alpha selected by CV
# (coefficients of the cross-validated model fitted above)
print(lascv.coef_)

#### <font color='green'> ii) BIC and AIC

In [None]:
# Fit the Lasso twice on the same data: alpha chosen by BIC first, then by AIC
lasic_bic, lasic_aic = (
    LassoLarsIC(criterion=ic_name).fit(X, Y) for ic_name in ('bic', 'aic')
)

In [None]:
# Display results
# make a fn to produce figures with the same features repeatedly
def plot_ic_criterion(model, name, color, eps=5e-10):
    """Plot an information-criterion curve and mark the selected penalty.

    Draws on the current matplotlib figure: a dashed curve of the criterion
    values against log10(alpha), a vertical line at the log10 of the selected
    alpha, and the axis labels. The figure itself, the legend and the title are
    left to the caller.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``alpha_`` (selected penalty), ``alphas_``
        (penalty values along the path) and ``criterion_`` (criterion value at
        each of them).
    name : str
        Criterion name used in the legend labels (e.g. 'AIC' or 'BIC').
    color : str
        Matplotlib color shared by the curve and the vertical line.
    eps : float, optional
        Small offset added to the penalties before taking log10, so a penalty
        of exactly zero does not produce log10(0). It is an explicit argument
        so the function no longer depends on a global ``eps`` assigned in
        another cell; the default equals the offset used for the
        cross-validation figure.
    """
    alpha_ = model.alpha_ + eps
    alphas_ = model.alphas_ + eps
    criterion_ = model.criterion_  # BIC or AIC values over the alpha values
    plt.plot(np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s' % name)
    # Raw strings keep the TeX backslash without an invalid '\l' escape sequence
    plt.axvline(np.log10(alpha_), color=color, linewidth=3,
                label=r'$\lambda$ selected by %s ' % name)
    plt.xlabel(r'Log($\lambda$)')
    plt.ylabel('Criterion Value')
    

# Overlay the AIC and BIC criterion curves on a single figure
plt.figure(figsize=(8, 7), dpi=80)
for ic_model, ic_name, ic_color in ((lasic_aic, 'AIC', 'b'), (lasic_bic, 'BIC', 'r')):
    plot_ic_criterion(ic_model, ic_name, ic_color)
plt.legend()
plt.title('Model Selection by Information Criteria')
# plt.show() is not called in this cell; the figure is written to 'lasso.png'
plt.savefig('lasso.png')

In [None]:
# Access the Lasso estimates at the alpha selected by AIC and BIC
# (AIC model first, BIC model second)
print(lasic_aic.coef_)
print(lasic_bic.coef_)

### <font color='darkred'> HW5
    
* Use the dataset, "Hitters.csv", posted on BB to explain/predict a baseball player’s salary <u> using a subset of covariates in the dataset </u>.
    
* In order to select a subset of covariates, do the following:
    - Forward and backward stepwise selections based on AIC and BIC
    - LASSO Estimations with CV, AIC and BIC
    - Produce tables or figures or both to summarize your results
    
* For this exercise, you need to take care of missing values and also generate dummies for some variables