# Exercise 5.4.5

In [None]:
import sys
!{sys.executable} -m pip install "ISLP"

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import statsmodels.formula.api as smf

from ISLP import load_data
# Load the Default data set through ISLP's load_data helper. The cells below
# read its income, balance, default and student columns; default and student
# are compared against the string "Yes" to build 0/1 indicators.
Default = load_data('Default')

In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach.

a. & b. Fit a logistic regression model that uses income and balance to predict default. Using the validation set approach, estimate the test error of this model.

In [2]:
# Exercise 5 (a), (b) i-ii: split the data and fit the logistic regression.
np.random.seed(2)  # legacy global seed; the split below is controlled by random_state

# Validation-set approach: hold out one third of the observations.
train, test = train_test_split(Default, test_size=1/3, random_state=1)

# Way 1: array interface. The response is coded 1 = "Yes" (default), 0 = "No",
# and the intercept column has to be added explicitly.
X_train = train[["income", "balance"]]
y_train = (train["default"] == "Yes").astype(int)
X_train = sm.add_constant(X_train)

model1 = sm.Logit(y_train, X_train).fit()
print(model1.summary())

# Way 2: formula interface. The response must be numeric 0/1 here as well:
# passing the "Yes"/"No" column directly makes the formula machinery expand it
# into two indicator columns, and the Binomial family then treats the first
# one ("No") as the success count, i.e. it would model P(default = No) and
# report coefficients with the opposite sign to model1.
train_glm = train.assign(default_yes=y_train)
model2 = smf.glm("default_yes ~ income + balance", data=train_glm, family=sm.families.Binomial(link=sm.families.links.Logit()))
result2 = model2.fit()
print(result2.summary())



Optimization terminated successfully.
         Current function value: 0.080110
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:                 6666
Model:                          Logit   Df Residuals:                     6663
Method:                           MLE   Df Model:                            2
Date:                Tue, 20 May 2025   Pseudo R-squ.:                  0.4714
Time:                        00:43:14   Log-Likelihood:                -534.01
converged:                       True   LL-Null:                       -1010.3
Covariance Type:            nonrobust   LLR p-value:                1.416e-207
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -11.7722      0.534    -22.060      0.000     -12.818     -10.726
income      2.299e-05   6.08

In [3]:
# Exercise 5 (b) iii: predict default status on the validation set.
X_test = test[["income", "balance"]]
y_test = (test["default"] == "Yes").astype(int)
X_test = sm.add_constant(X_test)

# Posterior probability of default for each individual in the validation set.
pred_probs = model1.predict(X_test)

# Classify as "default" when the predicted probability exceeds 0.5.
predictions = pred_probs > 0.5
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)

# Exercise 5 (b) iv: validation-set error = share of misclassified observations.
miss_class = np.mean(predictions != y_test)
print(f"Misclassification Rate: {miss_class * 100:.2f}%")

# Note the class imbalance: a trivial classifier that always predicts
# default = No already gets a low error rate. The baseline gets its own
# variable so the model's `predictions` are not overwritten.
always_no_predictions = np.zeros(len(y_test), dtype=int)
miss_classAlwaysNo = np.mean(always_no_predictions != y_test)
print(confusion_matrix(y_test, always_no_predictions))
print(f"Misclassification Rate: {miss_classAlwaysNo * 100:.2f}%")


[[3216   18]
 [  63   37]]
Misclassification Rate: 2.43%
[[3234    0]
 [ 100    0]]
Misclassification Rate: 3.00%


c. Repeat the fitting and splitting process three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained. 

In [4]:
# Exercise 5 (c): repeat the split / fit / evaluate cycle on three different
# splits and collect the validation-set error (in percent) of each.
results = []
for i in range(3):
    np.random.seed(i+10)  # legacy global seed; the split is controlled by random_state
    # train, test, y_train and y_test are intentionally left at notebook scope:
    # the part (d) cell reuses the values from the final iteration.
    train, test = train_test_split(Default, test_size=1/3, random_state=i)

    # Design matrices (with intercept) and 0/1 responses for both subsets.
    X_train = sm.add_constant(train[["income", "balance"]])
    y_train = (train["default"] == "Yes").astype(int)
    X_test = sm.add_constant(test[["income", "balance"]])
    y_test = (test["default"] == "Yes").astype(int)

    # Separate name so the model fitted in part (a) (`model1`) is not clobbered.
    split_model = sm.Logit(y_train, X_train).fit(disp=False)

    # Classify at the 0.5 threshold and record the misclassification rate.
    pred_probs = split_model.predict(X_test)
    predictions = pred_probs > 0.5
    miss_class = np.mean(predictions != y_test)
    results.append(miss_class * 100)
print(results)

[2.7594481103779245, 2.4295140971805638, 2.489502099580084]


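So we see that the estimated misclassification rate varies from split to split. The validation-set estimate is itself a random variable: it depends on which observations happen to fall in the training and validation sets, so it carries some uncertainty about the true (infinite-population) test error.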

d. Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.

In [5]:
#5d
# Build the split explicitly instead of relying on variables left over from
# the loop in part (c). random_state=2 reproduces that loop's final split, so
# the error printed below is directly comparable with the last entry of the
# part (c) results.
train, test = train_test_split(Default, test_size=1/3, random_state=2)
y_train = (train["default"] == "Yes").astype(int)
y_test = (test["default"] == "Yes").astype(int)

# Predictors: income, balance and a 0/1 dummy for student status, plus intercept.
X_train = train[["income", "balance"]].copy()
X_train["student"] = (train["student"] == "Yes").astype(int)
X_train = sm.add_constant(X_train)
X_test = test[["income", "balance"]].copy()
X_test["student"] = (test["student"] == "Yes").astype(int)
X_test = sm.add_constant(X_test)

model3 = sm.Logit(y_train, X_train).fit(disp=False)
print(model3.summary())

# Predicted default probability for each individual in the validation set,
# classified at the 0.5 threshold.
pred_probs = model3.predict(X_test)
predictions = pred_probs > 0.5

# Validation-set misclassification rate, in percent.
miss_class = np.mean(predictions != y_test)
print(miss_class*100)



                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:                 6666
Model:                          Logit   Df Residuals:                     6662
Method:                           MLE   Df Model:                            3
Date:                Tue, 20 May 2025   Pseudo R-squ.:                  0.4760
Time:                        00:43:37   Log-Likelihood:                -546.67
converged:                       True   LL-Null:                       -1043.3
Covariance Type:            nonrobust   LLR p-value:                5.439e-215
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -10.5802      0.574    -18.424      0.000     -11.706      -9.455
income     -3.281e-06   9.62e-06     -0.341      0.733   -2.21e-05    1.56e-05
balance        0.0058      0.000     21.003      0.0

The coefficient of the student variable is significant. Student status is correlated with income, whose coefficient is no longer significant once student is included. This suggests that student carries useful information. However, the misclassification error on our particular validation set is slightly higher than for the model without student, so adding the variable does not appear to reduce the test error.