In [1]:
# general
import numpy as np
import pandas as pd

# sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

## Solution 2:  Resampling strategies

### a)

The two main advantages of resampling are:

• We are able to use larger training sets (at the expense of test set size) because the high variance this incurs
for the resulting estimator is smoothed out by averaging across repetitions.

• Repeated sampling reduces the risk of getting lucky (or not so lucky) with a particular data split, which
is especially relevant with few observations.

### b)

You can find the [german_credit_for_py.csv](https://github.com/slds-lmu/lecture_i2ml/blob/master/exercises/data/german_credit_for_py.csv) in our GitHub. The feature columns have already been preprocessed with *OneHotEncoder* for categorial features and *OrdinalEncoder* for ordianal features(installment_rate, present_residence, number_credits).

In [2]:
#| label: 2-b-1

german_credit = pd.read_csv("../data/german_credit_for_py.csv")
german_credit.head()

Unnamed: 0,credit_risk,status_... >= 200 DM / salary for at least 1 year,status_0<= ... < 200 DM,status_no checking account,credit_history_critical account/other credits elsewhere,credit_history_delay in paying off in the past,credit_history_existing credits paid back duly till now,credit_history_no credits taken/all credits paid back duly,purpose_car (new),purpose_car (used),...,job_unskilled - resident,people_liable_3 or more,telephone_yes (under customer name),foreign_worker_yes,installment_rate,present_residence,number_credits,duration,amount,age
0,good,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,3.0,1.0,6,1169,67
1,bad,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,1.0,0.0,48,5951,22
2,good,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,2.0,2.0,0.0,12,2096,49
3,good,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,2.0,3.0,0.0,42,7882,45
4,bad,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,3.0,1.0,24,4870,53


In [3]:
#| label: 2-b-2

german_x_raw = german_credit.iloc[:,1:]
german_y_raw = german_credit.iloc[:,0]

# Initialize encoder for target
enc_target = LabelEncoder()

enc_target.fit(german_y_raw.values.ravel()) 
# .values will give the values in a numpy array (shape: (n,1))
# .ravel will convert that array shape to (n, ) (i.e. flatten it)

german_y = enc_target.transform(german_y_raw.values.ravel()) # now numpy array
# you can also use enc_target.fit_transform(X) to combine both steps

german_x = np.asarray(german_x_raw)

# Using whole data set to train and predict; increase max iterations for convergence
log_mod = LogisticRegression(max_iter=10000).fit(german_x, german_y)

print("Mean accuracy: %.2f" % log_mod.score(german_x, german_y))
print("Mean classification error : %.2f" % (1-log_mod.score(german_x, german_y)))


Mean accuracy: 0.78
Mean classification error : 0.22


### c)

### (i) 3x10-CV

In [4]:
#| label: 2-c-1

random_state = 14
err = []
rkf_3x10 = RepeatedKFold(n_splits=10, n_repeats=3, random_state=random_state)
for train, test in rkf_3x10.split(german_x):
    log_mod = LogisticRegression(max_iter=10000).fit(
        german_x[train,:], german_y[train]
    )
    err.append(1-log_mod.score(german_X[test,:], german_y[test]))
    # score gives mean accuracy

res = np.array(err)
print("MCE of 3x10 CV: ", res.mean())

NameError: name 'german_X' is not defined

### (ii) 10x3-CV

In [None]:
#| label: 2-c-2

err = []
rkf_10x3 = RepeatedKFold(n_splits=3, n_repeats=10, random_state=random_state)
for train, test in rkf_10x3.split(german_x):
    log_mod = LogisticRegression(max_iter=10000).fit(
        german_x[train,:], german_y[train]
    )
    err.append(1-log_mod.score(german_x[test,:], german_y[test]))

res = np.array(err)
print("MCE of 10x3 CV: ",res.mean())

MCE of 10x3 CV:  0.253094112076148


### (iii) 3x10-CV with stratification for the feature foreign worker

In [None]:
#| label: 2-c-3

err = []
strat_gkf_10 = RepeatedStratifiedKFold(
    n_splits=10, n_repeats=3, random_state=random_state
)
# Note that providing y in split(X, y) is sufficient to generate the splits, 
# and hence np.zeros(n_samples) may be used as a placeholder for X instead 
# of actual training data.
for train, test in strat_gkf_10.split(german_x, german_x[:,41]): 
    # index 41 stands for column of foreign_workers_yes
    log_mod = LogisticRegression(max_iter=10000).fit(
        german_x[train,:], german_y[train]
    )
    err.append(1-log_mod.score(german_x[test,:], german_y[test]))

res = np.array(err)
print("MCE of 3x10-CV with stratification: ", res.mean())

MCE of 3x10-CV with stratification:  0.24966666666666665


### (iv) Hold-out with 90% training data

In [None]:
#| label: 2-c-4

x_train, x_test, y_train, y_test = train_test_split(
    german_x, german_y, test_size = 0.1, random_state=random_state
)
log_mod = LogisticRegression(max_iter=10000).fit(x_train, y_train)
german_pred = log_mod.predict(x_test)
print("MCE of Hold-out split: ", 1-log_mod.score(x_test, y_test))

MCE of Hold-out split:  0.30000000000000004


### d)

Generalization error estimates are pretty stable across the different resampling strategies because we have a
fairly large number (1000) of observations. Still, the pessimistic bias of small training sets is visible: 10x3-CV,
using roughly 67% of data for training in each split, estimates a higher generalization error than 3x10-CV with
roughly 90% training data. Stratification by foreign worker does not seem to have much effect on the estimate.
However, we see a glaring difference when we use a single 90%-10% split, where the estimated GE is roughly 8.5
percentage points higher than with 3x10-CV, meaning we got a higher error just because of an unlucky split.

Comparing the results (except for the unreliable one produced by a single split) with the training error from b)
indicates no serious overfitting.

### e)

LOO is not a very good idea here – with 1000 observations this would take a very long time. Also, LOO has
high variance by nature. Repeated CV with a sufficient number of folds should give us a pretty good idea about
the expected GE of our learner.