# ISLP - Chapter 6 - Exercise 9
### Author: pzuehlke

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error

In [100]:
college = pd.read_csv("College.csv")
print(college.info())
college.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   777 non-null    object 
 1   Private      777 non-null    object 
 2   Apps         777 non-null    int64  
 3   Accept       777 non-null    int64  
 4   Enroll       777 non-null    int64  
 5   Top10perc    777 non-null    int64  
 6   Top25perc    777 non-null    int64  
 7   F.Undergrad  777 non-null    int64  
 8   P.Undergrad  777 non-null    int64  
 9   Outstate     777 non-null    int64  
 10  Room.Board   777 non-null    int64  
 11  Books        777 non-null    int64  
 12  Personal     777 non-null    int64  
 13  PhD          777 non-null    int64  
 14  Terminal     777 non-null    int64  
 15  S.F.Ratio    777 non-null    float64
 16  perc.alumni  777 non-null    int64  
 17  Expend       777 non-null    int64  
 18  Grad.Rate    777 non-null    int64  
dtypes: float

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [101]:
college["Private"] = (college["Private"] == "Yes").astype(int)
college = college.rename(columns={"Unnamed: 0": "College"})
college = college.set_index("College")
college.info()
college.head()

<class 'pandas.core.frame.DataFrame'>
Index: 777 entries, Abilene Christian University to York College of Pennsylvania
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Private      777 non-null    int64  
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: 

Unnamed: 0_level_0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
College,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


__9 (a):__ Here we chose an $ 80\% $ train / $ 20 \% $ test split for the data:

In [102]:
X = college.drop("Apps", axis=1)
y = college["Apps"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

__9 (b):__ The test MSE is approximately $ 649\,901$.

In [110]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

linear_reg = LinearRegression()
linear_reg.fit(X_train_scaled, y_train)

test_predictions = linear_reg.predict(X_test_scaled)
lr_mse = mean_squared_error(y_test, test_predictions)
print(round(lr_MSE, 4))

649900.82


__9 (c):__ The chosen $ \lambda $ is $ \lambda = 7.5646 $, and the corresponding
regression test MSE is $ 664\,495 $, which is larger than the MSE obtained
through multiple linear regression.

In [104]:
ridge_cv = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=5)
ridge_cv.fit(X_train_scaled, y_train)
ridge_pred = ridge_cv.predict(X_test_scaled)
ridge_mse = mean_squared_error(y_test, ridge_pred)
print(f"ridge regression test MSE: {ridge_mse:.4f}")
print(f"lambda: {ridge_cv.alpha_:.4f}")

ridge regression test MSE: 664495.2535
lambda: 7.5646


__9 (d):__ The value of $ \lambda $ chosen by the lasso regularization is $ \lambda
= 26.56 $, and the corresponding test MSE is $ 695\,332 $, which
is greater than that for ridge regression and least squares regression. However,
using the lasso we reduced the number of nonzero coefficients from $ 17 $ to $ 13 $.

In [105]:
lasso_cv = LassoCV(alphas=np.logspace(-3, 3, 100), cv=5)
lasso_cv.fit(X_train_scaled, y_train)
lasso_pred = lasso_cv.predict(X_test_scaled)
lasso_mse = mean_squared_error(y_test, lasso_pred)
nonzero_coef = np.sum(lasso_cv.coef_ != 0)
print(f"lasso test MSE: {lasso_mse:.4f}")
print(f"lasso lambda: {lasso_cv.alpha_:.4f}")
print(f"number of non-zero coefficients: {nonzero_coef}")

lasso test MSE: 695331.9270
lasso lambda: 26.5609
number of non-zero coefficients: 13


In [106]:
lasso_coef = pd.Series(lasso_cv.coef_, index=X.columns)
print(lasso_coef[lasso_coef == 0].index.tolist())

['F.Undergrad', 'P.Undergrad', 'Books', 'Personal']


We can see that the four variables that were excluded by the lasso were: the
number of full-time and part-time undergraduates, the estimated book costs and
the personal spending by students.

__9 (e):__ The optimal value $ M $ of components of PCR is $ M = 17 $, which
equals the number of variables in the original model. Hence, in this case PCR is
equivalent to multiple linear regression on these variables. Indeed, we can
check that the test MSE obtained here is the same as that
obtained in item (b): $ 649\,901 $.

In [107]:
def pcr_cv(X_train, y_train, X_test, y_test, max_components):
    mse_scores = []
    for m in range(1, max_components + 1):
        pca = PCA(n_components=m)
        X_train_pca = pca.fit_transform(X_train_scaled)
        X_test_pca = pca.transform(X_test_scaled)
        
        linear_reg = LinearRegression()
        linear_reg.fit(X_train_pca, y_train)
        predictions = linear_reg.predict(X_test_pca)
        mse = mean_squared_error(y_test, predictions)
        mse_scores.append(mse)
    
    best_m = np.argmin(mse_scores) + 1
    return best_m, min(mse_scores)

n_components = X_train.shape[1]
best_m_pcr, pcr_mse = pcr_cv(X_train_scaled, y_train, X_test_scaled, y_test, n_components)
print(f"PCR test MSE: {pcr_mse:.2f}")
print(f"best number of components (M) for PCR: {best_m_pcr}")


PCR test MSE: 649900.82
best number of components (M) for PCR: 17


__9 (f):__

In [108]:
def pls_cv(X_train, y_train, X_test, y_test, max_components):
    mse_scores = []
    for n in range(1, max_components + 1):
        pls_reg = PLSRegression(n_components=n)
        pls_reg.fit(X_train_scaled, y_train)
        predictions = pls_reg.predict(X_test_scaled)
        mse = mean_squared_error(y_test, predictions.ravel())
        mse_scores.append(mse)
    
    best_n = np.argmin(mse_scores) + 1
    return best_n, min(mse_scores)

best_m_pls, pls_mse = pls_cv(X_train_scaled, y_train, X_test_scaled, y_test, n_components)
print(f"PLS test MSE: {pls_mse:.2f}")
print(f"best number of components (M) for PLS: {best_m_pls}")

PLS test MSE: 647155.25
best number of components (M) for PLS: 10


__9 (g):__ There isn't much difference in this case between the resulting test
MSE among the models. The largest difference occurs between lasso and partial
least squares, and is equal to
$$
    695\,332 - 647\,155 = 48\,177
$$
or just
$$
    \frac{48\,177}{647\,155} = 7.44\%\,.
$$
The values of the root MSE tell us that for each of the models, on average the
predictions are within around $ 800 $ applicants of the true number of
applicants, which is a reasonable level of accuracy.

In [119]:
print("\nSummary of the MSE estimates:")
print("-" * 50)
methods = {
    "Linear regression": lr_mse,
    "Ridge regression": ridge_mse,
    "Lasso": lasso_mse,
    "PCR": pcr_mse,
    "PLS": pls_mse
}

for method, mse in methods.items():
    print(f"{method:20} MSE: {round(mse):<12} RMSE: {round(np.sqrt(mse))}")


Summary of the MSE estimates:
--------------------------------------------------
Linear regression    MSE: 649901       RMSE: 806
Ridge regression     MSE: 664495       RMSE: 815
Lasso                MSE: 695332       RMSE: 834
PCR                  MSE: 649901       RMSE: 806
PLS                  MSE: 647155       RMSE: 804
