### Preprocessing - I

In [1]:
# import relevant statistical packages
import numpy as np
import pandas as pd

In [2]:
# import relevant data visualisation packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
url = "/Users/arpanganguli/Documents/Professional/Finance/ISLR/Datasets/Portfolio.csv"
Portfolio = pd.read_csv(url, index_col = 'Unnamed: 0')

In [5]:
Portfolio.head()

Unnamed: 0,X,Y
1,-0.895251,-0.234924
2,-1.562454,-0.885176
3,-0.41709,0.271888
4,1.044356,-0.734198
5,-0.315568,0.841983


In [6]:
covmatrix = Portfolio.cov()

In [7]:
covmatrix

Unnamed: 0,X,Y
X,1.128642,0.626358
Y,0.626358,1.308237


In [8]:
covmatrix.iloc[0][1]

0.6263582921063724

*Okay cool!*

### Estimating the Accuracy of a Statistic of Interest through Bootstrap

In [9]:
def alphafn(data, index):
    X = data.X[index]
    Y = data.Y[index]
    return round((Y.var() - covmatrix.iloc[0][1]) / (X.var() + Y.var() - 2*covmatrix.iloc[0][1]), 3)

In [10]:
idx = list(range(1,101))

In [38]:
true_alpha = alphafn(Portfolio, idx)
true_alpha

0.576

In [39]:
from sklearn.utils import resample

In [45]:
estimate_alpha = pd.DataFrame()
for i in range(1,1001): # this is equivalent to constructing a new bootstrap data set and recomputing alpha_hat based on the new data set and generating 1,000 bootstrap estimates 
    bootstrap_estimate = alphafn(Portfolio, resample(idx, replace=True))
    estimate_alpha = estimate_alpha.append([bootstrap_estimate])    

In [46]:
estimate_alpha.reset_index(drop=True, inplace=True)
estimate_alpha.columns = ['Alpha Estimate']
estimate_alpha.head()

Unnamed: 0,Alpha Estimate
0,0.458
1,0.622
2,0.632
3,0.565
4,0.675


In [138]:
estimate_alpha.shape

(1000, 1)

*We see here that we have generated 1,000 estimates of alpha.*

In [79]:
std_err = np.sqrt(np.sum(pow(estimate_alpha - (np.sum(estimate_alpha) / 1000), 2))/999) # check formula for standard error in page 189 of ISLR

In [80]:
std_err

Alpha Estimate    0.091405
dtype: float64

**Therefore, the estimate of $\alpha$-hat using the original data is 0.576, and that the bootstrap estimate for SE($\alpha$-hat)
is 0.091405.**

### Preprocessing - II

In [81]:
# import relevant statistical packages
import numpy as np
import pandas as pd

In [82]:
# import relevant data visualisation packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [83]:
from sklearn.model_selection import train_test_split

In [90]:
url = "/Users/arpanganguli/Documents/Professional/Finance/ISLR/Datasets/Auto.csv"
Auto = pd.read_csv(url)

In [91]:
Auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [92]:
Auto.horsepower.dtype

dtype('int64')

In [96]:
Auto['hp'] = Auto.horsepower.astype(float)

In [97]:
Auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,hp
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,130.0
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,165.0
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,150.0
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,150.0
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,140.0


In [98]:
Auto.hp.dtype

dtype('float64')

*Okay cool!*

### Estimating the Accuracy of a Linear Regression Model through Bootstrap

In [99]:
from sklearn.linear_model import LinearRegression

In [100]:
from sklearn.model_selection import train_test_split

In [128]:
def bootfn(data, index):
    X = data[['hp']]
    y = data['mpg']
    X_train = X.iloc[index]
    y_train = y.iloc[index]
    lmfit = LinearRegression().fit(X_train, y_train)
    return round(lmfit.intercept_, 3), np.round(lmfit.coef_, 3)

In [129]:
idx = list(range(1,393))

In [130]:
bootfn(Auto, idx)

(39.83, array([-0.157]))

**Creating bootstrap estimates**

In [133]:
bootfn(Auto, resample(idx, replace=True))

(40.524, array([-0.163]))

In [134]:
bootfn(Auto, resample(idx, replace=True))

(38.913, array([-0.155]))

*As we can see, bootstrapping generates different estimates for both the intercept and slope each time through random
sampling.*

In [135]:
estimate_coefficients = pd.DataFrame()
for i in range(1,1001): # this is equivalent to constructing a new bootstrap data set and recomputing alpha_hat based on the new data set and generating 1,000 bootstrap estimates 
    coef_estimate = bootfn(Auto, resample(idx, replace=True))
    estimate_coefficients = estimate_coefficients.append([coef_estimate])   

In [141]:
estimate_coefficients.reset_index(drop=True, inplace=True)
estimate_coefficients.columns = ['Intercept', 'Slope Term']
estimate_coefficients.head()

Unnamed: 0,Intercept,Slope Term
0,40.156,[-0.16]
1,40.344,[-0.16]
2,40.435,[-0.161]
3,39.938,[-0.153]
4,40.613,[-0.164]


In [137]:
estimate_coefficients.shape

(1000, 2)

*We see here that we have generated 1,000 estimates of intercepts and slope terms.*

In [143]:
std_err_st = np.sqrt(np.sum(pow(estimate_coefficients['Slope Term'] - (np.sum(estimate_coefficients['Slope Term']) / 1000), 2))/999)

In [144]:
std_err_intercept = np.sqrt(np.sum(pow(estimate_coefficients['Intercept'] - (np.sum(estimate_coefficients['Intercept']) / 1000), 2))/999)

In [146]:
print("Bootsrapped Intercept Standard Error: ", round(std_err_intercept, 4), "Bootstrapped Slope Term Standard Error: ", np.round(std_err_st, 4))

Bootsrapped Intercept Standard Error:  0.8595 Bootstrapped Slope Term Standard Error:  [0.0074]


**Conducting simple linear regression**

In [153]:
import statsmodels.api as sm

**Note:** scikit-learn does not have modules for inference. Hence, I am importing statsmodels to generate inferential statistics to get standard errors from simple linear regression

In [154]:
import statsmodels.api as sm

In [156]:
X1 = Auto[['hp']]
X1 = sm.add_constant(X1)
y1 = Auto['mpg']
lmfit1 = LinearRegression().fit(X1, y1)

In [157]:
ols = sm.OLS(y1, X1).fit()

In [158]:
ols.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.604
Model:,OLS,Adj. R-squared:,0.603
Method:,Least Squares,F-statistic:,603.4
Date:,"Mon, 14 Jan 2019",Prob (F-statistic):,1.5e-81
Time:,11:40:36,Log-Likelihood:,-1195.5
No. Observations:,397,AIC:,2395.0
Df Residuals:,395,BIC:,2403.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,40.0426,0.717,55.862,0.000,38.633,41.452
hp,-0.1586,0.006,-24.565,0.000,-0.171,-0.146

0,1,2,3
Omnibus:,16.479,Durbin-Watson:,0.925
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.349
Skew:,0.494,Prob(JB):,0.000171
Kurtosis:,3.271,Cond. No.,322.0


**Interestingly, note here that the standard errors generated through the bootstrap model seem way off than those from 
the simple linear regression. This is because the bootstrap model does not presume assumptions for standard errors like
they do in simple linear regression. Therfore, bootstrap estimates of standard errors are more likely to be accurate than
those of simple linear regression. For more information, I recommend reading page 196 of ISLR.**

### Estimating the Accuracy of a Quadratic Regression Model through Bootstrap

In [162]:
from sklearn.preprocessing import PolynomialFeatures as PF

In [170]:
def bootfn(data, index):
    for i in range(1, 3):
        MSE = 0
        X = df[['hp']]
        X_ = pd.DataFrame(PF(i).fit_transform(X))
        X_.drop(columns=0, inplace=True)
        y = df[['mpg']]
        X_train = X_.iloc[index]
        y_train = y.iloc[index]
        lmfit = LinearRegression().fit(X_train, y_train)
    return lmfit.intercept_, lmfit.coef_

In [171]:
bootfn(Auto, idx)

(array([56.6803142]), array([[-0.4630276 ,  0.00121982]]))

In [180]:
estim = bootfn(Auto, resample(idx, replace=True))

In [181]:
estim

(array([57.3956903]), array([[-0.47625463,  0.00127303]]))

In [221]:
df1 = pd.DataFrame()
df1 = pd.concat([pd.DataFrame([estim[1].reshape(2,-1)[0]]), pd.DataFrame([estim[1].reshape(2,-1)[1]])], axis=1)

In [173]:
estimate_coef = pd.DataFrame()

In [227]:
for i in range(1,1001):
    coefs = bootfn(Auto, resample(idx, replace=True))
    estimate_coef = estimate_coef.append(pd.concat([pd.DataFrame(coefs[0]), pd.DataFrame([coefs[1].reshape(2,-1)[0]]), pd.DataFrame([coefs[1].reshape(2,-1)[1]])], axis=1))

In [233]:
estimate_coef.reset_index(drop=True, inplace=True)
estimate_coef.columns = ['Intercept', 'hp', 'hp^2']

In [235]:
estimate_coef.head()

Unnamed: 0,Intercept,hp,hp^2
0,57.949797,-0.477463,0.00125
1,55.765245,-0.449997,0.001177
2,57.635393,-0.482477,0.001297
3,56.605658,-0.453925,0.001176
4,59.085845,-0.492528,0.001305


In [236]:
estimate_coef.shape

(1001, 3)

In [237]:
std_err_intercept = np.sqrt(np.sum(pow(estimate_coef['Intercept'] - (np.sum(estimate_coef['Intercept']) / 1000), 2))/999)

In [238]:
std_err_hp = np.sqrt(np.sum(pow(estimate_coef['hp'] - (np.sum(estimate_coef['hp']) / 1000), 2))/999)

In [239]:
std_err_hp2 = np.sqrt(np.sum(pow(estimate_coef['hp^2'] - (np.sum(estimate_coef['hp^2']) / 1000), 2))/999)

In [308]:
print("SE Intercept: ", round(std_err_intercept, 4), "SE HP: ", round(std_err_hp, 4), "SE HP^2: ", round(std_err_hp2, 4))

SE Intercept:  2.073 SE HP:  0.0328 SE HP^2:  0.0001


**Linear Regression**

In [241]:
import statsmodels.api as sm

In [304]:
X1 = Auto[['hp']]
X2 = pow(X1, 2)
X3 = pd.concat([X1, X2], axis = 1)
X3 = sm.add_constant(X3)
Y1 = Auto[['mpg']]
X3.columns = ['const', 'hp', 'hp2']
X3.head()

Unnamed: 0,const,hp,hp2
0,1.0,130.0,16900.0
1,1.0,165.0,27225.0
2,1.0,150.0,22500.0
3,1.0,150.0,22500.0
4,1.0,140.0,19600.0


In [305]:
ols = sm.OLS(Y1, X3).fit()

In [306]:
ols.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.686
Model:,OLS,Adj. R-squared:,0.684
Method:,Least Squares,F-statistic:,430.5
Date:,"Mon, 14 Jan 2019",Prob (F-statistic):,7.48e-100
Time:,12:56:47,Log-Likelihood:,-1149.6
No. Observations:,397,AIC:,2305.0
Df Residuals:,394,BIC:,2317.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,57.1036,1.802,31.688,0.000,53.561,60.646
hp,-0.4689,0.031,-15.039,0.000,-0.530,-0.408
hp2,0.0012,0.000,10.126,0.000,0.001,0.001

0,1,2,3
Omnibus:,15.411,Durbin-Watson:,1.091
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.438
Skew:,0.211,Prob(JB):,6.68e-07
Kurtosis:,4.241,Cond. No.,128000.0


**As with simple linear regression, the standard errors generated through bootstrap is larger than the usual quadratic
regression, because it does not hold any assumptions that regressions need to calculate standard errors.**