## Chapter 5: Resampling Methods - Applied Exercises

In [22]:
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly,
                         sklearn_sm)
from sklearn.model_selection import train_test_split
from functools import partial
from sklearn.model_selection import (train_test_split,
                                     cross_validate,
                                     KFold,
                                     ShuffleSplit)
from sklearn.base import clone
from sklearn.metrics import confusion_matrix

In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the `Default` data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.

Fit a logistic regression model that uses `income` and `balance` to predict `default`.

In [2]:
Default = load_data('default')

In [5]:
Default_train, Default_valid = train_test_split(Default, test_size=5000, random_state=0)

In [14]:
# Fit a linear regression using the training set
default_mm = MS(['income', 'balance'])
X_train = default_mm.fit_transform(Default_train)
y_train = Default_train['default'] == 'Yes'
model = sm.Logit(y_train, X_train)
results = model.fit()

Optimization terminated successfully.
         Current function value: 0.072956
         Iterations 10


In [18]:
# Perform predictions on validation set
X_valid = default_mm.transform(Default_valid)
y_valid = Default_valid['default'] == 'Yes'
valid_pred = results.predict(X_valid)

# Overall fraction of misclassified observations
1 - np.mean(y_valid == (valid_pred > 0.5))

0.029000000000000026

In [16]:
valid_pred

9394    0.000014
898     0.000247
2398    0.008043
5906    0.002046
2343    0.000298
          ...   
3996    0.001859
5889    0.000034
4577    0.008793
8600    0.000735
847     0.000131
Length: 5000, dtype: float64

Repeat the process three times, using three diferent splits of the observations into a training set and a validation set. Comment on the results obtained.

In [20]:
# Create a vector to store the confusion matrices for the splits
C = []

In [23]:
for i in range(1,4):
    Default_train, Default_valid = train_test_split(Default, test_size=5000)
    X_train = default_mm.fit_transform(Default_train)
    y_train = Default_train['default'] == 'Yes'
    model = sm.Logit(y_train, X_train)
    results = model.fit()

    X_valid = default_mm.transform(Default_valid)
    y_valid = Default_valid['default'] == 'Yes'
    valid_pred = results.predict(X_valid)

    C.append(confusion_matrix(valid_pred > 0.5, y_valid))

Optimization terminated successfully.
         Current function value: 0.078193
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.078611
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.073792
         Iterations 10


In [24]:
tpr, fpr, ppv, npv, acc = ([] for i in range(5))

for c in C:
    tn = c[0,0] 
    fp = c[0,1]
    fn = c[1,0]
    tp = c[1,1]
    tpr.append((tp / (tp + fn + 0.)))
    fpr.append((fp / (fp + tn + 0.)))
    ppv.append((tp / (tp + fp + 0.)))
    npv.append((1- f n / (fn + tn + 0.)))
    acc.append(((tp + tn + 0.) / (tn + fp + fn + tp)))

In [25]:
def line(l):
    return " ".join( '{:06.4f}'.format(a) for a in l) + ', Average: ' +'{:06.4f}'.format(sum(l)/ len(l))

print('TPR: ')
print(line(tpr))
print('FPR: ')
print(line(fpr))
print('PPV: ')
print(line(ppv))
print('NPV: ')
print(line(npv))
print('ACC: ')
print(line(acc))

TPR: 
0.7432 0.8060 0.6471, Average: 0.7321
FPR: 
0.0229 0.0259 0.0248, Average: 0.0246
PPV: 
0.3274 0.2967 0.3107, Average: 0.3116
NPV: 
0.9961 0.9973 0.9938, Average: 0.9957
ACC: 
0.9736 0.9718 0.9696, Average: 0.9717


Now consider a logistic regression model that predicts the probability of `default` using `income`, `balance`, and a dummy variable for `student`. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.

In [26]:
Default_train, Default_valid = train_test_split(Default, test_size=5000, random_state=0)

In [36]:
Default_train['student_yes'] = (Default_train['student'] == 'Yes').astype('int')
Default_valid['student_yes'] = (Default_valid['student'] == 'Yes').astype('int')
Default_train['default_yes'] = (Default_train['default'] == 'Yes').astype('int')
Default_valid['default_yes'] = (Default_valid['default'] == 'Yes').astype('int')

In [37]:
# Fit a linear regression using the training set
default_mm = MS(['income', 'balance', 'student_yes'])
X_train = default_mm.fit_transform(Default_train)
y_train = Default_train['default_yes']
model = sm.Logit(y_train, X_train)
results = model.fit()

Optimization terminated successfully.
         Current function value: 0.072293
         Iterations 10


In [38]:
# Perform predictions on validation set
X_valid = default_mm.transform(Default_valid)
y_valid = Default_valid['default_yes']
valid_pred = results.predict(X_valid)

# Overall fraction of misclassified observations
1 - np.mean(y_valid == (valid_pred > 0.5))

0.029200000000000004

It seems that adding the dummy student variable doesn't help with validation error rate, in fact, it makes it slightly worse.