# 5.3 Lab: Cross-Validation and the Bootstrap

## 5.3.1 The Validation Set Approach

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd 
import math
import random

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import *
from sklearn import datasets, linear_model



In [2]:
import pandas as pd 
# Read the Auto data ('?' marks a missing value), drop every row that
# contains an NA, and renumber the remaining observations from 0.
Auto = (
    pd.read_csv('data/Auto.csv', header=0, na_values='?')
    .dropna()
    .reset_index(drop=True)
)
Auto.shape

(392, 9)

### Python and R use different random number generators, so we may see slightly different results in this chapter.

In [3]:
# Fix the seed so the train/validation split is reproducible.
np.random.seed(1)
# Draw 196 distinct row positions (sampling without replacement) for training.
train = np.random.choice(Auto.shape[0], 196, replace=False)
# Boolean mask over all rows: True for training rows, False for held-out rows.
# np.isin replaces np.in1d, which is deprecated in recent NumPy releases.
select = np.isin(np.arange(Auto.shape[0]), train)

In [4]:
import statsmodels.formula.api as smf
# Fit mpg on horsepower (first order) using only the training rows.
lm = smf.ols ('mpg~horsepower', data = Auto[select]).fit()
# print() with a single argument behaves the same on Python 2 and Python 3.
print(lm.summary())
# Predict for every observation, then average the squared error over the
# held-out rows (~select) to get the validation-set MSE.
preds = lm.predict(Auto)
square_error = (Auto['mpg'] - preds)**2
print('--------Test Error for 1st order--------')
print(np.mean(square_error[~select]))

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.620
Model:                            OLS   Adj. R-squared:                  0.618
Method:                 Least Squares   F-statistic:                     316.4
Date:                Wed, 08 Mar 2017   Prob (F-statistic):           1.28e-42
Time:                        16:58:23   Log-Likelihood:                -592.07
No. Observations:                 196   AIC:                             1188.
Df Residuals:                     194   BIC:                             1195.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     40.3338      1.023     39.416      0.0

In [5]:
# Fit a quadratic in horsepower on the training rows only.
lm2 = smf.ols ('mpg~horsepower + I(horsepower ** 2.0)', data = Auto[select]).fit()
# Validation-set MSE: squared prediction error averaged over the held-out rows.
preds = lm2.predict(Auto)
square_error = (Auto['mpg'] - preds)**2
# print() with a single argument behaves the same on Python 2 and Python 3.
print('--------Test Error for 2nd order--------')
print(np.mean(square_error[~select]))

--------Test Error for 2nd order--------
20.2526908584


In [6]:
# Fit a cubic in horsepower on the training rows only.
lm3 = smf.ols ('mpg~horsepower + I(horsepower ** 2.0) + I(horsepower ** 3.0)', data = Auto[select]).fit()
# Validation-set MSE: squared prediction error averaged over the held-out rows.
preds = lm3.predict(Auto)
square_error = (Auto['mpg'] - preds)**2
# print() with a single argument behaves the same on Python 2 and Python 3.
print('--------Test Error for 3rd order--------')
print(np.mean(square_error[~select]))

--------Test Error for 3rd order--------
20.3256093659


### These results are consistent with our previous findings: a model that predicts mpg using a quadratic function of horsepower performs better than a model that involves only a linear function of horsepower, and there is little evidence in favor of a model that uses a cubic function of horsepower.

### If we look at the summary for the 3rd order regression, the coefficient of the 3rd order term is not statistically significant. I will use this as supporting evidence for the above claim.

In [7]:
# Show the full regression table for the cubic fit (print() works on Python 2 and 3).
print(lm3.summary())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.722
Model:                            OLS   Adj. R-squared:                  0.717
Method:                 Least Squares   F-statistic:                     165.9
Date:                Wed, 08 Mar 2017   Prob (F-statistic):           4.60e-53
Time:                        16:58:23   Log-Likelihood:                -561.56
No. Observations:                 196   AIC:                             1131.
Df Residuals:                     192   BIC:                             1144.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               66.5200 

## 5.3.2 Leave-One-Out Cross-Validation

### OLS Fit

In [8]:
# Ordinary least squares fit of mpg on horsepower using all observations.
ols_fit = smf.ols ('mpg~horsepower', data = Auto).fit()
# print() with a single argument behaves the same on Python 2 and Python 3.
print(ols_fit.params)

Intercept     39.935861
horsepower    -0.157845
dtype: float64


### GLM Fit. Compared with the OLS fit, the coefficients are the same.

In [9]:
# Same model fitted through the GLM interface, using all observations.
glm_fit = sm.GLM.from_formula('mpg~horsepower', data = Auto).fit()
# print() with a single argument behaves the same on Python 2 and Python 3.
print(glm_fit.params)

Intercept     39.935861
horsepower    -0.157845
dtype: float64


### Doing CV in Python is not as easy as it is in R; it requires some manual coding.

### To take advantage of functions that are already implemented in Python, we use scikit-learn for the linear model.

In [30]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [31]:
# Single-column feature frame (horsepower) and response series (mpg).
x = pd.DataFrame(Auto.horsepower)
y = Auto.mpg

# Fit a plain linear regression on all observations and show its estimates.
model = LinearRegression()
model.fit(x, y)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)

39.9358610212
[-0.15784473]


In [32]:
# Leave-one-out CV expressed as K-fold with one fold per observation.
k_fold = KFold(n_splits=x.shape[0])
# The 'neg_mean_squared_error' scorer yields negated MSE values, so the sign
# is flipped back before averaging across folds.
test = cross_val_score(model, x, y, cv=k_fold,  scoring = 'neg_mean_squared_error', n_jobs=-1)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(np.mean(-test))

24.2315135179


### For higher order polynomial fits, we use the Pipeline tool. The cell below fits polynomials of order 1 to 5 to the data and shows the LOOCV results.

In [43]:
# LOOCV estimate of the test MSE for polynomial fits of degree 1 through 5.
# The splitter does not depend on the degree, so it is built once up front.
k_fold = KFold(n_splits=x.shape[0]) # loo use folds equal to # of observations
A = []
# range() replaces the Python 2-only xrange(); both yield 1..5 here.
for porder in range(1, 6):
    # The polynomial step expands horsepower into powers up to `porder`; the
    # linear step is fit without its own intercept term.
    model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression(fit_intercept=False))])
    test = cross_val_score(model, x, y, cv=k_fold,  scoring = 'neg_mean_squared_error', n_jobs=-1)
    # Scores are negated MSEs; flip the sign before averaging across folds.
    A.append(np.mean(-test))

# print() with a single argument behaves the same on Python 2 and Python 3.
print(A)

[24.231513517929226, 19.248213124490029, 19.334984064022926, 19.424430301371874, 19.033232559685043]
