### Preprocessing

In [1]:
# import relevant statistical packages
import numpy as np
import pandas as pd

In [2]:
# import relevant data visualisation packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Read the Auto data set from a CSV file into the DataFrame `df`.
# NOTE(review): this is an absolute path on one particular machine; the
# notebook will not run elsewhere until it points at a local copy of Auto.csv.
url = "/Users/arpanganguli/Documents/Professional/Finance/ISLR/Datasets/Auto.csv"
df = pd.read_csv(url)

In [6]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [7]:
df.horsepower.dtype

dtype('int64')

In [8]:
# Keep the integer `horsepower` column and add a float64 copy named `hp`,
# which is the predictor used in the regressions below.
df['hp'] = df['horsepower'].astype('float64')

In [9]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,hp
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,130.0
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,165.0
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,150.0
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,150.0
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,140.0


In [10]:
df.hp.dtype

dtype('float64')

*Okay cool!*

### Using simple linear regression (validation set approach)

In [258]:
# Predictor as a one-column DataFrame (2-D), response as a Series (1-D).
X = df.loc[:, ['hp']]
y = df.loc[:, 'mpg']

In [259]:
# Validation-set approach: hold out half of the rows for testing, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

In [265]:
from sklearn.linear_model import LinearRegression

In [266]:
# Fit mpg ~ hp on the training half only.
lmfit = LinearRegression().fit(X=X_train, y=y_train)

In [267]:
# Predict mpg for the held-out half.
lmpred = lmfit.predict(X=X_test)

In [268]:
from sklearn.metrics import mean_squared_error

In [269]:
# Mean squared error of the predictions on the held-out half.
MSE = mean_squared_error(y_true=y_test, y_pred=lmpred)

In [270]:
round(MSE, 2)

24.23

### Using Leave-One-Out-Cross-Validation (LOOCV)

In [28]:
from sklearn.model_selection import LeaveOneOut

In [29]:
X = df[['hp']]
y = df['mpg']

In [30]:
loo = LeaveOneOut()

In [62]:
total_sets = loo.get_n_splits(X)

In [63]:
from sklearn.linear_model import LinearRegression

In [64]:
from sklearn.metrics import mean_squared_error

In [65]:
MSE = 0

In [66]:
# Leave-one-out CV: for every row, fit on all the other rows, predict the
# held-out row, and add its squared error to the running total `MSE`.
#
# The accumulator is reset here, in the same cell as the loop, so that
# re-running this cell starts from zero instead of adding to a stale total.
MSE = 0
for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lmfit = LinearRegression().fit(X_train, y_train)
    lmpred = lmfit.predict(X_test)
    # The test fold has a single row, so this "mean" is that row's squared error.
    MSE += mean_squared_error(y_test, lmpred)

In [67]:
MSE

9708.79138971706

In [68]:
MSE_mean = MSE/total_sets

In [70]:
round(MSE_mean, 2)

24.46

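**As we can see, LOOCV gives an estimate of the test MSE (24.46) that is close to the one from the single 50/50 validation split above (24.23). Both fit the same simple linear regression of `mpg` on `hp`; they differ only in how the held-out error is estimated. Therefore, in the absence of a separate test dataset, we can resample the existing dataset through LOOCV to estimate the test error.**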

### Using Leave-One-Out-Cross-Validation (LOOCV) for polynomial regressions (order: 1-5)

In [255]:
from sklearn.preprocessing import PolynomialFeatures as PF

In [256]:
X = df[['hp']]
y = df['mpg']

In [257]:
loo = LeaveOneOut()

In [258]:
total_sets = loo.get_n_splits(X)

In [259]:
MSE_all = pd.DataFrame()

In [260]:
# LOOCV estimate of the test MSE for polynomial fits of degree 1 through 5.
# The per-degree means are collected in a list and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0, and building
# the frame in one go also makes this cell safe to re-run.
mse_means = []  # mse_means[k] is the mean LOOCV MSE for degree k + 1
for i in range(1,6):
    MSE = 0
    X = df[['hp']]
    # PolynomialFeatures puts a constant column first (label 0); drop it and
    # keep hp, hp^2, ..., hp^i as the predictors.
    X_ = pd.DataFrame(PF(i).fit_transform(X)).drop(columns=0)
    y = df[['mpg']]
    for train_index, test_index in loo.split(X):
        X_train, X_test = X_.iloc[train_index], X_.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        lmfit = LinearRegression().fit(X_train, y_train)
        lmpred = lmfit.predict(X_test)
        # Single held-out row, so this is that row's squared error.
        MSE += mean_squared_error(y_test, lmpred)
    MSE_mean = MSE/total_sets
    mse_means.append(MSE_mean)
# One row per degree, in a single column labelled 0.
MSE_all = pd.DataFrame(mse_means)

In [262]:
# Name the single column. A flat list yields an ordinary column Index; the
# nested form [['MSE']] makes pandas build a one-level MultiIndex instead.
MSE_all.columns = ['MSE']
# Renumber the rows 0..4 (row k holds the result for polynomial degree k + 1).
MSE_all = MSE_all.reset_index(drop=True)
MSE_all.round(2)

Unnamed: 0,MSE
0,24.46
1,19.44
2,19.52
3,19.61
4,19.2


**As we can see, there is a sharp drop in MSE between the linear and quadratic regressions. The remaining higher-order regressions
have MSEs similar to the quadratic fit, so there is no clear benefit from going beyond degree 2.**