# ISLP - Chapter 5 - Exercise 6
### Author: pzuehlke

In [24]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [95]:
# Setup mirrors exercise 5: read the raw Default data set from disk.
default_data = pd.read_csv("Default.csv")

# Recode the response as an integer indicator (1 for "Yes", 0 otherwise):
is_default = default_data["default"].eq("Yes")
default_data["default"] = is_default.astype(int)

# Rescale the two monetary predictors to thousands of dollars, so that the
# fitted regression coefficients come out larger:
money_cols = ["income", "balance"]
default_data[money_cols] = default_data[money_cols] / 1000

# Quick look at the frame: column dtypes, class counts and the first rows.
print(default_data.info(), end="\n\n")
print(default_data["default"].value_counts(), end="\n\n")
default_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   default  10000 non-null  int64  
 1   student  10000 non-null  object 
 2   balance  10000 non-null  float64
 3   income   10000 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 312.6+ KB
None

default
0    9667
1     333
Name: count, dtype: int64



Unnamed: 0,default,student,balance,income
0,0,No,0.729526,44.361625
1,0,Yes,0.81718,12.106135
2,0,No,1.073549,31.767139
3,0,No,0.529251,35.704494
4,0,No,0.785656,38.463496


__6 (a):__ The coefficients associated with `income` and `balance` (here
denominated in thousands of dollars) are $ 0.0208 $ and $ 5.6471 $, respectively,
while the intercept is $ -11.5405 $. All of them appear to be statistically significant.

The estimates provided for the standard errors are:
* $ 0.005 $ for `income`;
* $ 0.227 $ for `balance`;
* $ 0.435 $ for the intercept.


In [30]:
# Design matrix: the two predictors plus an explicit intercept column.
predictors = default_data[["income", "balance"]]
X = sm.add_constant(predictors)
y = default_data["default"]

# Binomial-family GLM (logit link), i.e. a logistic regression.
model = sm.GLM(y, X, family=sm.families.Binomial())
results = model.fit()

# Class counts first, then the full regression table.
print(default_data["default"].value_counts())

print(results.summary())

default
0    9667
1     333
Name: count, dtype: int64
                 Generalized Linear Model Regression Results                  
Dep. Variable:                default   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -789.48
Date:                Sun, 09 Feb 2025   Deviance:                       1579.0
Time:                        21:16:11   Pearson chi2:                 6.95e+03
No. Iterations:                     9   Pseudo R-squ. (CS):             0.1256
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------

__6 (b):__ Basically, all that we have to do is to encode the solution of part
(a) as a separate function:

In [94]:
def boot_fn(data, indices):
    """Fit the binomial GLM of `default` on `income` and `balance` on a subset.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns `default`, `income` and `balance`.
    indices : array-like of int
        Positional row indices (passed to `.iloc`) selecting the rows to fit
        on.

    Returns
    -------
    np.ndarray
        Length-2 array holding the fitted coefficients of `income` and
        `balance`, in that order. The intercept is fitted but not returned.
    """
    sample = data.iloc[indices]
    design = sm.add_constant(sample[["income", "balance"]])
    fit = sm.GLM(sample["default"], design, family=sm.families.Binomial()).fit()
    # Look the coefficients up by label, so the result does not depend on the
    # position of the constant within `fit.params`.
    return np.array([fit.params[name] for name in ("income", "balance")])

Let's test the function using as sample all of the indices. We can check that
the results are the same as in part (a):

In [77]:
# Fit on every row exactly once (indices 0, 1, ..., n - 1), i.e. on the
# original data set itself.
n = default_data.shape[0]
all_rows = np.arange(n)
coefs = boot_fn(default_data, all_rows)
# NOTE(review): `:4f` sets a minimum field width of 4, not 4 decimal places,
# so the default 6 decimals are printed; `.4f` may have been intended.
print(f"income coefficient: {coefs[0]:4f}")
print(f"balance coefficient: {coefs[1]:4f}")

income coefficient: 0.020809
balance coefficient: 5.647103


For another test, now let's use a bootstrap sample to estimate the coefficients.
Note that there is no advantage to doing this: these estimates are very likely
worse than the ones using the entire dataset without replacement.

In [78]:
# Seeded generator, so the resampled indices are reproducible.
rng = np.random.default_rng(0)
# One bootstrap sample: n row indices drawn with replacement from 0..n-1.
resampled_rows = rng.choice(n, size=n, replace=True)
boot_estimates = boot_fn(default_data, resampled_rows)
print(f"income coefficient: {boot_estimates[0]:4f}")
print(f"balance coefficient: {boot_estimates[1]:4f}")

income coefficient: 0.018778
balance coefficient: 5.738776


__6 (c):__

In [90]:
def boot_se(data, B, rng):
    """Estimate bootstrap standard errors of the coefficients from `boot_fn`.

    Parameters
    ----------
    data : DataFrame
        Data set to resample; it is passed unchanged to `boot_fn`.
    B : int
        Number of bootstrap replicates.
    rng : np.random.Generator
        Random generator used to draw the resampled row indices.

    Returns
    -------
    np.ndarray
        Length-2 array with the standard deviation (NumPy default, `ddof=0`)
        across the `B` replicates of each of the two values returned by
        `boot_fn`.
    """
    # Take the sample size from `data` itself instead of relying on a
    # notebook-level `n`: the function then works on a fresh kernel and for
    # any data set, not only the one `n` happened to be computed from.
    n_obs = len(data)

    bootstrap_estimates = np.zeros((B, 2))  # one row per replicate
    for i in range(B):
        # Draw n_obs row positions with replacement from 0..n_obs-1.
        indices = rng.choice(n_obs, size=n_obs, replace=True)
        bootstrap_estimates[i, :] = boot_fn(data, indices)

    return np.std(bootstrap_estimates, axis=0)  # estimates for the standard errors

In [93]:
# NOTE(review): B = 10 replicates is very few for a stable standard-error
# estimate; a much larger value (e.g. 1000) is the usual choice.
B = 10
# Number of observations in the data set.
n = len(default_data)
# Re-seed so the standard-error estimates below are reproducible.
rng = np.random.default_rng(0)
bootstrap_std_errors = boot_se(default_data, B, rng)
print(f"standard error for coef. of income: {bootstrap_std_errors[0]:.4f}")
print(f"standard error for coef. of balance: {bootstrap_std_errors[1]:.4f}")

standard error for coef. of income: 0.0056
standard error for coef. of balance: 0.2353


__6 (d):__ Here is a summary of the estimates for the standard errors obtained by the two methods:

|            | GLM std err  | bootstrap std err |
|------------|--------------|-------------------|
| _Income_   | 0.005        | 0.0056            |
| _Balance_  | 0.227        | 0.2353            |

We see that the bootstrap standard errors are quite close to those obtained
from GLM, which suggests that the model-based formulas are reasonable for this
data and that the model is not badly misspecified. Both bootstrap estimates are
slightly larger, but with only $ B = 10 $ replicates this difference is well
within the Monte Carlo noise of the bootstrap itself, so it should not be
over-interpreted. In general, bootstrap standard errors do not depend on the
model's assumptions being correct, which makes them more robust to potential
violations of those assumptions.