# 5.3 Lab: Cross-Validation and the Bootstrap

## 5.3.1 The Validation Set Approach

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd 
import math
import random

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import *
from sklearn import datasets, linear_model

In [None]:
# Read the raw Auto CSV; '?' entries are parsed as missing values.
auto_raw = pd.read_csv('data/Auto.csv', header=0, na_values='?')
# Drop every row that contains a missing value and renumber the rows from 0.
Auto = auto_raw.dropna().reset_index(drop=True)
Auto.shape

### Python and R use different random number generators, so we may see slightly different results in this chapter.

In [None]:
# Fix the seed so the random train/validation split is reproducible.
np.random.seed(1)
n_obs = Auto.shape[0]
# Row positions of the 196 training observations, drawn without replacement.
train = np.random.choice(n_obs, 196, replace=False)
# Boolean mask over all rows: True for training rows, False for validation rows.
# Built by direct assignment rather than np.in1d, which NumPy 2.0 deprecates.
select = np.zeros(n_obs, dtype=bool)
select[train] = True

In [None]:
import statsmodels.formula.api as smf
# Fit mpg as a linear function of horsepower on the training rows only.
lm = smf.ols('mpg~horsepower', data=Auto[select]).fit()
# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print(lm.summary())
# Predict for every row, then average the squared error over the held-out rows.
preds = lm.predict(Auto)
square_error = (Auto['mpg'] - preds)**2
print('--------Test Error for 1st order--------')
print(np.mean(square_error[~select]))

In [None]:
# Fit mpg on horsepower and horsepower squared, using the training rows only.
lm2 = smf.ols('mpg~horsepower + I(horsepower ** 2.0)', data=Auto[select]).fit()
# Predict for every row, then average the squared error over the held-out rows.
preds = lm2.predict(Auto)
square_error = (Auto['mpg'] - preds)**2
# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print('--------Test Error for 2nd order--------')
print(square_error[~select].mean())

In [None]:
# Fit mpg on horsepower, its square and its cube, using the training rows only.
lm3 = smf.ols('mpg~horsepower + I(horsepower ** 2.0) + I(horsepower ** 3.0)', data=Auto[select]).fit()
# Predict for every row, then average the squared error over the held-out rows.
preds = lm3.predict(Auto)
square_error = (Auto['mpg'] - preds)**2
# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print('--------Test Error for 3rd order--------')
print(np.mean(square_error[~select]))

### These results are consistent with our previous findings: a model that predicts mpg using a quadratic function of horsepower performs better than a model that involves only a linear function of horsepower, and there is little evidence in favor of a model that uses a cubic function of horsepower.

### If we look at the summary for the 3rd-order regression, the coefficient of the 3rd-order term is not statistically significant. I will use this as supporting evidence for the above claim.

In [None]:
# Show the full regression table for the cubic fit.
# print(...) with one argument runs unchanged on Python 2 and Python 3.
print(lm3.summary())

## 5.3.2 Leave-One-Out Cross-Validation

### OLS Fit

In [None]:
# Fit mpg ~ horsepower by ordinary least squares on the full data set.
ols_fit = smf.ols('mpg~horsepower', data=Auto).fit()
# print(...) with one argument runs unchanged on Python 2 and Python 3.
print(ols_fit.params)

### GLM fit. Compared with the OLS fit, the coefficients are the same.

In [None]:
# Fit the same formula through the GLM interface, with no family argument given.
glm_fit = sm.GLM.from_formula('mpg~horsepower', data=Auto).fit()
# print(...) with one argument runs unchanged on Python 2 and Python 3.
print(glm_fit.params)

### Cross-validation in Python is not as easy as in R; it requires some manual coding.

### To take advantage of functions already implemented in Python, we use scikit-learn for the linear model.

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
# Wrap the single predictor in a DataFrame so the estimator is given a
# 2-D feature table; the response stays a 1-D Series.
x = pd.DataFrame(Auto.horsepower)
y = Auto.mpg

# Fit a plain least-squares line and show its intercept and slope.
model = LinearRegression()
model.fit(x, y)
# print(...) with one argument runs unchanged on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)

In [None]:
# Leave-one-out CV: one fold per observation.
k_fold = KFold(n_splits=x.shape[0])
# The scorer returns negated MSE (higher is better), so flip the sign before averaging.
test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
# print(...) with one argument runs unchanged on Python 2 and Python 3.
print(np.mean(-test))

### For higher-order polynomial fits, we use the Pipeline tool. The cell below fits polynomials of order 1 to 5 and shows the LOOCV results.

In [None]:
# LOOCV mean squared error for polynomial degrees 1 through 5.
A = []
# Leave-one-out CV: one fold per observation. The splitter does not depend
# on the degree, so it is built once outside the loop.
k_fold = KFold(n_splits=x.shape[0])
# range (not xrange) and print(...) run unchanged on Python 2 and Python 3.
for porder in range(1, 6):
    # Expand horsepower into polynomial features, then fit least squares.
    model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])
    # The scorer returns negated MSE, so flip the sign before averaging.
    test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
    A.append(np.mean(-test))

print(A)

## 5.3.3 k-Fold Cross-Validation

### k-fold cross-validation works exactly like LOOCV, just with a different n_splits setting. The computation time is much shorter than that of LOOCV.

In [None]:
# Kept so that later cells drawing from NumPy's global RNG see the same stream.
np.random.seed(2)
# 10-fold CV mean squared error for polynomial degrees 1 through 10.
A = []
# Shuffle before splitting: without shuffle=True, KFold cuts the rows into
# consecutive chunks in file order and the seed above has no effect on the
# folds. A fixed random_state gives every degree the same reproducible folds.
k_fold = KFold(n_splits=10, shuffle=True, random_state=2)
# range (not xrange) and print(...) run unchanged on Python 2 and Python 3.
for porder in range(1, 11):
    # Expand horsepower into polynomial features, then fit least squares.
    model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])
    # The scorer returns negated MSE, so flip the sign before averaging.
    test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
    A.append(np.mean(-test))

print(A)

### We still see little evidence that using cubic or higher-order polynomial terms leads to lower test error than simply using a quadratic fit.

## 5.3.4 The Bootstrap

### Bootstrap means sampling with replacement. To eliminate the effect of sample size, the normal practice is to draw a sample of the same size as the original dataset, with replacement.

In [None]:
# Load the Portfolio data used by the bootstrap examples; the first CSV row
# supplies the column names.
portfolio_csv = 'data/Portfolio.csv'
Portfolio = pd.read_csv(portfolio_csv, header=0)

### To illustrate the use of the bootstrap on this data, we must first create a function, alpha_fn(), which takes as input the (X, Y) data as well as a vector indicating which observations should be used to estimate alpha.

In [None]:
def alpha_fn(data, index):
    """Estimate alpha from the selected observations of ``data``.

    alpha = (Var(Y) - Cov(X, Y)) / (Var(X) + Var(Y) - 2 * Cov(X, Y)),
    i.e. the weight ``a`` on X that minimises Var(a * X + (1 - a) * Y).

    Parameters
    ----------
    data : object with ``X`` and ``Y`` attributes (e.g. a pandas DataFrame)
        Holds the paired observations.
    index : sequence of int
        Row positions to use; repeated positions are allowed, as in a
        bootstrap resample.

    Returns
    -------
    float
        The alpha estimate for the selected rows.
    """
    rows = np.asarray(index)
    # Select by position so resampled (repeated) indices behave the same
    # regardless of how the frame happens to be labelled.
    x_sample = np.asarray(data.X)[rows]
    y_sample = np.asarray(data.Y)[rows]
    # Take all three terms from one covariance matrix so variances and the
    # covariance share the same (n - 1) normalisation. Mixing np.var
    # (divides by n) with np.cov (divides by n - 1) biases the estimate.
    cov_xy = np.cov(x_sample, y_sample)
    var_x, var_y, covar = cov_xy[0, 0], cov_xy[1, 1], cov_xy[0, 1]
    return (var_y - covar) / (var_x + var_y - 2 * covar)

In [None]:
# Estimate alpha using indices 0 through 99.
all_rows = range(0, 100)
alpha_fn(Portfolio, all_rows)

### Generate one set of random indices with 100 elements. The array is sorted to show that there are repeated elements.

In [None]:
# Draw 100 indices from 0-99 with replacement, then sort them so the
# repeated values sit next to each other in the displayed array.
resampled_idx = np.random.choice(range(0, 100), size=100, replace=True)
np.sort(resampled_idx)

### Call the previous function again with a random set of indices.

In [None]:
# Draw 100 indices from 0-99 with replacement and estimate alpha on that resample.
boot_idx = np.random.choice(range(0, 100), size=100, replace=True)
alpha_fn(Portfolio, boot_idx)

### Since I am not aware of a function similar to R's boot() in Python, I define an ad hoc function called boot_python().

In [None]:
def boot_python(data, input_fun, iteration):
    """Bootstrap a statistic by resampling the rows of ``data`` with replacement.

    Parameters
    ----------
    data : sized collection of observations (e.g. a pandas DataFrame)
        Passed through unchanged to ``input_fun``.
    input_fun : callable
        Called as ``input_fun(data, index)``, where ``index`` is a 1-D integer
        array of ``len(data)`` row positions; must return a scalar.
    iteration : int
        Number of bootstrap resamples to draw.

    Returns
    -------
    dict
        ``{'Mean': ..., 'STD': ...}``: the mean and standard deviation of the
        statistic over all resamples.
    """
    # Size the resamples from the data actually passed in, not from a global frame.
    n = len(data)
    # One row of resampled positions per bootstrap iteration.
    idx = np.random.randint(0, n, (iteration, n))
    stat = np.array([input_fun(data, rows) for rows in idx], dtype=float)

    return {'Mean': np.mean(stat), 'STD': np.std(stat)}
    

In [None]:
# Bootstrap the alpha estimate with 1000 resamples of Portfolio.
boot_python(data=Portfolio, input_fun=alpha_fn, iteration=1000)

### A similar idea (the bootstrap) can be used in many other places, such as estimating the accuracy of linear regression coefficients, conducting non-parametric tests (permutation tests), or estimating complicated probabilities.