# ISLP - Chapter 5 - Exercise 5
### Author: pzuehlke

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

__5 (a):__

In [30]:
default_data = pd.read_csv("Default.csv")
print(default_data.info(), end="\n\n")
default_data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   default  10000 non-null  object 
 1   student  10000 non-null  object 
 2   balance  10000 non-null  float64
 3   income   10000 non-null  float64
dtypes: float64(2), object(2)
memory usage: 312.6+ KB
None



Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [31]:
# Convert 'default' to numeric (Yes = 1, No = 0):
le = LabelEncoder()
default_data["default"] = le.fit_transform(default_data["default"])

In [32]:
X = default_data[["income", "balance"]]
y = default_data["default"]

model = LogisticRegression(random_state=0)
model.fit(X, y)

print(f"Income: {model.coef_[0][0]:.4f}")
print(f"Balance: {model.coef_[0][1]:.4f}")
print(f"Intercept: {model.intercept_[0]:.4f}")

Income: 0.0000
Balance: 0.0056
Intercept: -11.5405


__5 (b):__ We divide the original dataset in half, use one of these halves to
train the model and the other to estimate the error rate (or rather, the
accuracy). By default the model uses $ 0.5 $ as the probability threshold already.

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)

train_score = model.score(X_train, y_train)  # accuracy, not error rate
test_score = model.score(X_test, y_test)

print(f"Training error rate: {1 - train_score:.4f}")
print(f"Validation error rate: {1 - test_score:.4f}")

Training error rate: 0.0240
Validation error rate: 0.0290


__5 (c):__ We repeat the same code as above, but use a different random seed to
split the dataset in two:

In [34]:
for split_seed in range(3):
    print(f"\nsplit {split_seed + 1}:")
    print("-" * 20)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=split_seed
    )

    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)  # accuracy, not error rate
    test_score = model.score(X_test, y_test)

    print(f"Training error rate: {1 - train_score:.4f}")
    print(f"Validation error rate: {1 - test_score:.4f}")


split 1:
--------------------
Training error rate: 0.0240
Validation error rate: 0.0290

split 2:
--------------------
Training error rate: 0.0282
Validation error rate: 0.0250

split 3:
--------------------
Training error rate: 0.0278
Validation error rate: 0.0248


We see that the validation and training error rates differ by $ 10 $ to $ 20\% $ from
each other for all three splits, and that for two of the splits, the test error
rate is actually lower. However, all of the error rates fall in the $ 2.4 $ to $
3\% $ range, similarly to the error rates obtained using other models in Chapter
4. Recall also from p. $ 152 $ that only $ 3.33\% $ of the individuals in the dataset defaulted,
so the classifier that assigns everyone to the default category is not much
worse than the one above.

__5 (d):__ We begin converting `student` to a numeric variable that can only
take on the values $ 1 $ and $ 0 $ for "Yes" and "No", respectively.

In [35]:
default_data["student"] = (default_data["student"] == "Yes").astype(int)
X = default_data[["student", "income", "balance"]]
y = default_data["default"]

Now we use the same three splits to fit a model and estimate the error rates:

In [36]:
for split_seed in range(3):
    print(f"\nsplit {split_seed + 1}:")
    print("-" * 20)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=split_seed
    )

    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)  # accuracy, not error rate
    test_score = model.score(X_test, y_test)

    print(f"Training error rate: {1 - train_score:.4f}")
    print(f"Validation error rate: {1 - test_score:.4f}")


split 1:
--------------------
Training error rate: 0.0252
Validation error rate: 0.0294

split 2:
--------------------
Training error rate: 0.0270
Validation error rate: 0.0262

split 3:
--------------------
Training error rate: 0.0280
Validation error rate: 0.0250


Note that all of the validation error rates have increased when compared to the models
that do not include student status as one of the variables. This suggests that
the relationship between `default` and `student` may not be statistically
significant.