### Preprocessing

In [1]:
# import relevant statistical packages
import numpy as np
import pandas as pd

In [2]:
# import relevant data visualisation packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# load Default dataset
url = "/Users/arpanganguli/Documents/Professional/Finance/ISLR/Datasets/Default.csv"
Default = pd.read_csv(url, index_col = 'Unnamed: 0')

In [4]:
Default.head()

Unnamed: 0,default,student,balance,income
1,No,No,729.526495,44361.625074
2,No,Yes,817.180407,12106.1347
3,No,No,1073.549164,31767.138947
4,No,No,529.250605,35704.493935
5,No,No,785.655883,38463.495879


In [5]:
Default.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 4 columns):
default    10000 non-null object
student    10000 non-null object
balance    10000 non-null float64
income     10000 non-null float64
dtypes: float64(2), object(2)
memory usage: 390.6+ KB


In [6]:
dfX = Default[['student', 'balance','income']]
dfX = pd.get_dummies(data = dfX, drop_first=True)
dfy = Default['default']

In [57]:
dfX.head()

Unnamed: 0,balance,income,student_Yes
1,729.526495,44361.625074,0
2,817.180407,12106.1347,1
3,1073.549164,31767.138947,0
4,529.250605,35704.493935,0
5,785.655883,38463.495879,0


In [8]:
dfy.head()

1    No
2    No
3    No
4    No
5    No
Name: default, dtype: object

### 6.a. Estimated standard errors for coefficients associated with income and balance in a mulitple logit model

In [10]:
import statsmodels.api as sm

In [36]:
X = dfX[['balance', 'income']]
X = sm.add_constant(X)
y = pd.get_dummies(dfy, drop_first=True)

In [37]:
glmfit = sm.GLM(y, X, family=sm.families.Binomial()).fit()

In [38]:
glmfit.summary()

0,1,2,3
Dep. Variable:,Yes,No. Observations:,10000
Model:,GLM,Df Residuals:,9997
Model Family:,Binomial,Df Model:,2
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-789.48
Date:,"Mon, 14 Jan 2019",Deviance:,1579.0
Time:,17:12:39,Pearson chi2:,6.95e+03
No. Iterations:,9,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-11.5405,0.435,-26.544,0.000,-12.393,-10.688
balance,0.0056,0.000,24.835,0.000,0.005,0.006
income,2.081e-05,4.99e-06,4.174,0.000,1.1e-05,3.06e-05


In [43]:
estimated_std_err = np.array(glmfit.params / glmfit.tvalues)

In [44]:
estimated_std_err

array([4.34772469e-01, 2.27381384e-04, 4.98524546e-06])

### 6.b. Writing function bootfn() which inputs Default data and index of the observations and returns estimates of the standard errors of the regression coefficients

In [52]:
def bootfn(data, index):
    X = data[['balance', 'income']]
    X = sm.add_constant(X)
    y = pd.get_dummies(data['default'], drop_first=True)
    X_train = X.iloc[index]
    y_train = y.iloc[index]
    glmfit = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()
    estimated_std_err = np.array(glmfit.params / glmfit.tvalues)
    return estimated_std_err

In [53]:
bootfn(Default, list(range(1,10000)))

array([4.34775145e-01, 2.27384181e-04, 4.98521904e-06])

### 6.c. Bootstrapping the bootfn() function to estimate the standard errors of the regression coefficients

In [60]:
from sklearn.utils import resample

In [61]:
std_err_df = pd.DataFrame()
idx = list(range(10000))

In [63]:
for i in range(1000):
    std_temp = bootfn(Default, resample(idx, replace=True))
    std_err_df = std_err_df.append([std_temp])

In [68]:
std_err_df.reset_index(drop=True, inplace=True)
std_err_df.columns = ['intercept', 'balance', 'income']

In [69]:
std_err_df.head()

Unnamed: 0,intercept,balance,income
0,0.479226,0.000248,5e-06
1,0.423227,0.000218,5e-06
2,0.428759,0.000225,5e-06
3,0.450969,0.000231,5e-06
4,0.417985,0.000219,5e-06


In [70]:
std_err_df.shape

(1000, 3)

In [71]:
std_err_df.mean()

intercept    0.437193
balance      0.000228
income       0.000005
dtype: float64

### 6.d. As we can see, the bootstrap of the standard error estimates are close to standard errors of logistic regression coefficients.