In [88]:
import numpy as np
import pandas as pd

from ISLP import load_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [89]:
College = load_data('College')
College

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,No,2197,1515,543,4,26,3089,2029,6797,3900,500,1200,60,60,21.0,14,4469,40
773,Yes,1959,1805,695,24,47,2849,1107,11520,4960,600,1250,73,75,13.3,31,9189,83
774,Yes,2097,1915,695,34,61,2793,166,6900,4200,617,781,67,75,14.4,20,8323,49
775,Yes,10705,2453,1317,95,99,5217,83,19840,6510,630,2115,96,96,5.8,49,40386,99


In [90]:

y = College['Apps']

In [91]:
X_columns = College.columns.drop('Apps')

X = College[X_columns].copy()
X['Private'] = X['Private'].map({ 'Yes': 1, 'No': 0 })


In [92]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### (a)

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=101)

### (b)

In [94]:
from sklearn.linear_model import LinearRegression

regr = LinearRegression()
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("R-squired: %.2f" % r2_score(y_test, y_pred))

Mean squared error: 816981.79
R-squired: 0.94


### (c)

In [95]:
from sklearn.linear_model import  RidgeCV

lambdas = 10**np.linspace(8,-2, 100) / y_train.std()

ridge = RidgeCV(alphas=lambdas).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print(f"R-squired: {ridge.score(X_test, y_test):.2f}")
print(f"selected alpha: {ridge.alpha_}")

Mean squared error: 816983.40
R-squired: 0.94
selected alpha: 2.5697943105459096e-06


### (d)

In [96]:
from sklearn.linear_model import  LassoCV

lasso = LassoCV(alphas=lambdas).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print(f"R-squired: {lasso.score(X_test, y_test):.2f}")
print(f"Selected alpha: {lasso.alpha_}")
print(f"Number of non-zero coefficient estimates: {(lasso.coef_ != 0).sum()}")


Mean squared error: 853598.87
R-squired: 0.94
Selected alpha: 18.992635432580833
Number of non-zero coefficient estimates: 14
