# 5.3 Lab: Cross-Validation and the Bootstrap

## 5.3.1 The Validation Set Approach

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd 
import math
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import *
from sklearn import datasets, linear_model
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from collections import OrderedDict

In [None]:
# Read the Auto data ('?' marks a missing value), discard the rows that
# contain missing values, and renumber the remaining observations from 0.
Auto = (
    pd.read_csv('data/Auto.csv', header=0, na_values='?')
    .dropna()
    .reset_index(drop=True)
)
print(Auto.shape)
print(Auto.head())

In [None]:
# Split the data: draw 196 distinct row positions for the training set
# (presumably half of the data -- confirm against the printed Auto.shape).
# `train` holds the drawn positions; `select` is the matching boolean mask
# over all rows of Auto (True = training row, False = validation row).
np.random.seed(1)  # fixed seed so the split is reproducible
train = np.random.choice(Auto.shape[0], 196, replace=False)
# np.isin is the supported replacement for np.in1d, which is deprecated
# as of NumPy 2.0; for a 1-D input the resulting mask is the same.
select = np.isin(np.arange(Auto.shape[0]), train)

In [None]:
# Fit mpg on horsepower by ordinary least squares, using only the rows
# flagged as training samples by `select`.
train_data = Auto[select]
lm = smf.ols(formula='mpg~horsepower', data=train_data).fit()
print(lm.summary())

In [None]:
# Following the book, predict for every observation in the data set and
# then keep only the squared errors of the held-out rows: ~select drops
# the training samples from the average.
preds = lm.predict(Auto)
square_error = (Auto['mpg'] - preds) ** 2
held_out_error = square_error[~select]
print('-------- Test error for 1st order model --------')
print(held_out_error.mean())

In [None]:
# Quadratic model: add a squared horsepower term, fit on the training rows,
# and report the mean squared error over the held-out rows.
quadratic_formula = 'mpg~horsepower + I(horsepower ** 2.0)'
lm2 = smf.ols(formula=quadratic_formula, data=Auto[select]).fit()
preds = lm2.predict(Auto)
square_error = (Auto['mpg'] - preds) ** 2
print('--------Test error for 2nd order--------')
print(np.mean(square_error[~select]))

In [None]:
# Cubic model: add squared and cubed horsepower terms, fit on the training
# rows, and report the mean squared error over the held-out rows.
lm3 = smf.ols('mpg~horsepower + I(horsepower ** 2.0) + I(horsepower ** 3.0)', data=Auto[select]).fit()
preds = lm3.predict(Auto)
square_error = (Auto['mpg'] - preds) ** 2
# Label fixed: it previously printed "Test rror".
print('--------Test error for 3rd order--------')
print(np.mean(square_error[~select]))

""" 
These results are consistent with our previous findings: a model that predicts mpg using a quadratic function of 
horsepower performs better than a model that involves only a linear function of horsepower, 
and there is little evidence in favor of a model that uses a cubic function of horsepower.
"""

In [None]:
# Print the full regression summary of the 3rd-order model.
# The author's reading of this summary: the coefficient of the 3rd-order term
# is not statistically significant, which is used as supporting evidence for
# the claim above that a cubic term adds little.
print(lm3.summary())

## 5.3.2 Leave-One-Out Cross-Validation
LOOCV holds out a single observation as the validation set and trains the model on all the remaining observations. As a result, each training set is almost identical to the full data set, so each fitted model is very close to the model trained on the entire data set.

In [None]:
# Ordinary least squares on the full data set; show the fitted coefficients.
ols_model = smf.ols(formula='mpg~horsepower', data=Auto)
ols_fit = ols_model.fit()
print(ols_fit.params)

In [None]:
# The same regression fitted as a GLM. No family is passed, so the library
# default applies (Gaussian per the statsmodels docs -- confirm); the
# coefficients match the OLS fit.
glm_model = smf.glm(formula='mpg~horsepower', data=Auto)
glm_fit = glm_model.fit()
print(glm_fit.params)

In [None]:
# Cross-validation takes more manual work in Python than in R.
# To reuse existing implementations, we fit the linear model with scikit-learn (imported at the top of the notebook):
# from sklearn.model_selection import KFold, cross_val_score
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# from sklearn.pipeline import Pipeline

In [None]:
# Refit the same simple regression with scikit-learn.
# x is a one-column DataFrame holding the predictor; y is the response.
x = Auto.horsepower.to_frame()
y = Auto['mpg']

model = LinearRegression().fit(x, y)
print(model.intercept_)
print(model.coef_)

In [None]:
# LOOCV is k-fold CV with as many folds as there are observations;
# any other number of folds could be passed the same way.
n_obs = x.shape[0]
k_fold = KFold(n_splits=n_obs)
# The scorer reports negated MSE, so the sign is flipped before averaging.
test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
print((-test).mean())

In [None]:
# For higher-order polynomial fits we chain PolynomialFeatures and
# LinearRegression in a Pipeline.
# The loop runs LOOCV for the odd polynomial orders 1, 3, ..., 19
# (range(1, 21, 2)) and stores the mean squared error of each order in A,
# keyed by the order as a string.
# This step may take a few minutes.
A = OrderedDict()
n_split = x.shape[0]
# LOOCV: one fold per observation. The splitter does not depend on the
# polynomial order, so it is built once instead of on every iteration.
k_fold = KFold(n_splits=n_split)
for porder in range(1, 21, 2):
    model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])
    # The scorer reports negated MSE; flip the sign before averaging.
    test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
    A[str(porder)] = np.mean(-test)

print(A)

## 5.3.3 k-Fold Cross-Validation

In [None]:
# k-fold CV uses the same machinery as LOOCV, only with a much smaller
# n_splits, so it runs far faster.
# The global seed below does not drive KFold (which takes its own
# random_state); it is kept so that later cells drawing from np.random
# start from the same state as before.
np.random.seed(2)
A = OrderedDict()
n_split = 10
# Shuffle before splitting: without shuffle=True, KFold cuts the rows into
# consecutive blocks in file order, so the folds are not random subsets.
# random_state pins the shuffle, and because the splitter is built once,
# every polynomial order is scored on the same 10 folds.
k_fold = KFold(n_splits=n_split, shuffle=True, random_state=2)
for porder in range(1, 21, 2):
    model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])
    # The scorer reports negated MSE; flip the sign before averaging.
    test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
    A[str(porder)] = np.mean(-test)

print(A)

## 5.3.4 The Bootstrap
Bootstrap means sampling with replacement. To eliminate the effect of sample size, the usual practice is to draw, with replacement, a sample of the same size as the original data set.

The bootstrap can be used in many other settings, such as estimating the accuracy of linear regression model coefficients, conducting non-parametric tests (permutation test), or estimating complicated probabilities.

In [None]:
# Load the Portfolio data set; the first line of the file is the header row.
# The cells below read its X and Y columns.
Portfolio = pd.read_csv('data/Portfolio.csv', header=0)

In [None]:
# To illustrate the use of the bootstrap on this data, we first create a function, alpha_fn(),
# which takes the (X, Y) data and a vector indicating which observations should be used to estimate alpha.
def alpha_fn(data, index):
    """Estimate alpha from the rows of ``data`` selected by ``index``.

    alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))

    Parameters
    ----------
    data : object with ``X`` and ``Y`` attributes supporting ``.iloc``
        (e.g. a pandas DataFrame with columns ``X`` and ``Y``).
    index : positional row selector passed to ``.iloc``; positions may
        repeat, as they do in a bootstrap resample.

    Returns
    -------
    The alpha estimate as a NumPy float.
    """
    X = data.X.iloc[index]
    Y = data.Y.iloc[index]
    # Read both variances and the covariance from a single np.cov matrix so
    # that all three use the same (n - 1) normalisation. Mixing np.var
    # (divides by n by default) with np.cov (divides by n - 1 by default)
    # does not cancel in this ratio and biases the estimate.
    moments = np.cov(X, Y)
    var_x = moments[0, 0]
    var_y = moments[1, 1]
    cov_xy = moments[0, 1]
    return (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)

In [None]:
# alpha estimated from row positions 0-99 of Portfolio
# (the whole data set if it has exactly 100 rows -- TODO confirm).
alpha_fn(Portfolio, range(0,100))

In [None]:
# Generate one set of 100 random indices drawn with replacement from 0-99.
# The array is sorted only to make the repeated elements easy to see.
np.sort(np.random.choice(range(0, 100), size=100, replace=True))

In [None]:
# Call alpha_fn again, this time on one bootstrap resample:
# 100 row positions drawn with replacement from 0-99.
alpha_fn(Portfolio, np.random.choice(range(0, 100), size=100, replace=True))

In [None]:
# Since I am not aware of a boot()-like function in Python, I defined an ad-hoc function called boot_python().
def boot_python(data, input_fun, iteration):
    """Bootstrap a statistic and summarise its replicates.

    Parameters
    ----------
    data : object with a ``shape`` attribute whose first entry is the
        number of observations (e.g. a pandas DataFrame).
    input_fun : callable ``input_fun(data, index)`` returning a number,
        where ``index`` is a 1-D integer array of row positions.
    iteration : int, number of bootstrap resamples to draw.

    Returns
    -------
    dict with keys ``'Mean'`` and ``'STD'``: the mean and the standard
    deviation (``np.std`` with its default normalisation) of the
    ``iteration`` replicate values.

    Resamples are drawn from NumPy's global random state, so seed it
    beforehand for reproducible results.
    """
    # Use the size of the data actually passed in; this previously read the
    # global Portfolio, which breaks for any other data set.
    n = data.shape[0]
    # One row of n positions per resample, drawn with replacement.
    idx = np.random.randint(0, n, (iteration, n))
    stat = np.zeros(iteration)
    for i, resample in enumerate(idx):
        stat[i] = input_fun(data, resample)

    return {'Mean': np.mean(stat), 'STD': np.std(stat)}
    

In [None]:
# Bootstrap alpha_fn on Portfolio with 1000 resamples; the result is a dict
# holding the mean and standard deviation of the replicate values.
boot_python(Portfolio, alpha_fn, 1000)

In [None]:
# End of Chapter 5