# Importing Libraries

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.cross_decomposition import PLSRegression

# Loading Data

In [41]:
college = pd.read_csv("../data/College.csv", index_col=0)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


# Cleaning Data

In [42]:
college["Private"] = [1 if item == "Yes" else 0 for item in college["Private"]]
college["Private"].describe()

count    777.000000
mean       0.727156
std        0.445708
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: Private, dtype: float64

In [43]:
X = college.drop("Apps", axis=1)
y = college["Apps"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Linear Regression

In [44]:
lm = LinearRegression()
lm.fit(X_train, y_train)
lm.score(X_test, y_test)

0.9086043157818349

In [45]:
lm.coef_

array([-6.71494581e+02,  1.26084659e+00, -3.78676178e-01,  5.26189920e+01,
       -1.61543998e+01,  9.89482066e-02,  3.76517628e-02, -4.28434296e-02,
        2.17593133e-01,  3.55265903e-02, -4.53107082e-02, -9.46608275e+00,
       -4.68766853e+00,  4.57273871e+00, -7.54108110e+00,  6.44367603e-02,
        1.05837554e+01])

# Ridge Regression

In [46]:
rcv = RidgeCV(alphas=(np.e ** np.linspace(-10, 5, 100)), cv=5)
rcv.fit(X_train, y_train)
rcv.score(X_test, y_test)

0.9092522298932009

In [47]:
rcv.coef_

array([-5.83400825e+02,  1.26452343e+00, -3.85233645e-01,  5.27026687e+01,
       -1.62563610e+01,  1.01022333e-01,  3.93234301e-02, -4.75113007e-02,
        2.13967823e-01,  3.19989029e-02, -4.51266779e-02, -9.16435885e+00,
       -4.25750060e+00,  5.92760252e+00, -7.70839423e+00,  6.51712391e-02,
        1.04509966e+01])

# Lasso Regression

In [48]:
lcv = LassoCV(alphas=(np.e ** np.linspace(-10, 5, 100)), cv=5, max_iter=100000)
lcv.fit(X_train, y_train)
lcv.score(X_test, y_test)

0.9087674310536485

In [49]:
lcv.coef_

array([-6.50735016e+02,  1.26171653e+00, -3.80030298e-01,  5.25827304e+01,
       -1.61381198e+01,  9.94345587e-02,  3.80002723e-02, -4.39757899e-02,
        2.16751018e-01,  3.46979034e-02, -4.53118780e-02, -9.38232355e+00,
       -4.58634781e+00,  4.71634797e+00, -7.56146756e+00,  6.45626498e-02,
        1.05375204e+01])

# Principal Components Regression (PCR)

In [50]:
pca = PCA()
X_train_PCA = pca.fit_transform(scale(X_train))
X_test_PCA = pca.transform(scale(X_test))

np.cumsum(pca.explained_variance_ratio_)

array([0.31240358, 0.5694867 , 0.64172412, 0.69808782, 0.75310894,
       0.80411059, 0.84078342, 0.87591999, 0.90602142, 0.93118968,
       0.95209374, 0.96974588, 0.9806435 , 0.98907396, 0.99460556,
       0.99843487, 1.        ])

In [51]:
# Find optimal number of principal components using cross-validation
best_M = 0
max_score = 0
pcr = LinearRegression()

for M in range(1, X_train.shape[1] + 1):
    pcr.fit(X_train_PCA[:,:M], y_train)
    score = np.mean(cross_val_score(pcr, X_train_PCA[:, :M], y_train, cv=5))
    if score > max_score:
        max_score = score
        best_M = M

pcr.fit(X_train_PCA[:, :best_M], y_train)
pcr.score(X_test_PCA[:, :best_M], y_test)

0.9003980191025094

In [52]:
# Optimal value of M selected
best_M

17

# Partial Least Squares (PLS)

In [53]:
# Find optimal number of principal components using cross-validation
best_M = 0
max_score = 0

for M in range(1, X_train.shape[1] + 1):
    pls = PLSRegression(n_components=M, max_iter=100000)
    pls.fit(X_train, y_train)
    score = np.mean(cross_val_score(pls, X_train, y_train, cv=5))
    if score > max_score:
        max_score = score
        best_M = M

pls = PLSRegression(n_components=best_M, max_iter=100000)
pls.fit(X_train, y_train)
pls.score(X_test, y_test)

0.9083013629152908

In [54]:
# Optimal value of M selected
best_M

9