In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import pickle
import os 
from glob import glob
import matplotlib as mpl 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import permutation_test
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from methods import * 

%load_ext autoreload
%autoreload 2

In [2]:
# Spreadsheet of the "default of credit card clients" data, hosted on the UCI archive.
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"

# header=1: column labels are taken from the sheet's second row.
raw_sheet = pd.read_excel(io=data_url, header=1)
column_renames = {"PAY_0": "PAY_1", "default payment next month": "default"}
dataset = raw_sheet.drop(columns=["ID"]).rename(columns=column_renames)

# Seed NumPy's global RNG so the same 10000 row positions are drawn on every run.
np.random.seed(42)
n_rows = dataset.shape[0]
train_inds = np.random.choice(n_rows, 10000, replace=False)
df_train = dataset.iloc[train_inds]

# Target is the `default` column; every other column is a feature.
y_train = df_train["default"]
X_train = df_train.drop(columns="default")

In [4]:
# Fit a random forest on the sampled training rows; random_state=42 pins the
# estimator's own randomness so a refit gives the same model.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Held-out rows = every row whose *position* was not drawn for training.
# `train_inds` holds positions (it is consumed by `.iloc` when df_train is
# built), so the exclusion is done with a positional boolean mask rather than
# by comparing those positions against index labels, which only coincide while
# `dataset` keeps a default RangeIndex.
held_out = np.ones(len(dataset), dtype=bool)
held_out[train_inds] = False
df_test = dataset[held_out]

# Two comparison groups among held-out rows whose `default` label is 0:
# EDUCATION <= 1 versus EDUCATION >= 3.
label_is_zero = df_test.default == 0
df1 = df_test[(df_test.EDUCATION <= 1) & label_is_zero]
df2 = df_test[(df_test.EDUCATION >= 3) & label_is_zero]

# Drop the target so both groups carry only the model's feature columns.
X1 = df1.drop(columns='default')
X2 = df2.drop(columns='default')
X1.shape, X2.shape

((5755, 23), (2714, 23))

In [7]:
# Random-forest class predictions for each of the two groups.
y1, y2 = clf.predict(X1), clf.predict(X2)

# Build a mean-matched pair: z1 is y1 translated by the gap between the two
# group means, z2 is y2 unchanged, so mean(z1) equals mean(y2) up to rounding.
mean_y1, mean_y2 = np.mean(y1), np.mean(y2)
z1 = y1 + mean_y2 - mean_y1
z2 = y2

# Show the raw group means followed by the matched means.
print(mean_y1, mean_y2, np.mean(z1), np.mean(z2))

0.03770634231103388 0.07184966838614591 0.07184966838614593 0.07184966838614591


In [42]:
# Grid of 20 alpha values from 0.005 to 0.1, and the `iters` argument passed to
# every experiment below.
alphas = np.linspace(0.005, 0.1, 20)
iters = 20


def _perm_tau_and_fdr(k):
    """Run the Bonferroni sequential permutation experiment for one value of k.

    The (y1, y2) run supplies the first return value and the (z1, z2) run
    supplies the second; the other half of each result is discarded.
    """
    tau, _ = seq_perm_test_experiment(y1, y2, alphas, iters, k=k, bonferroni=True)
    _, fdr = seq_perm_test_experiment(z1, z2, alphas, iters, k=k, bonferroni=True)
    return tau, fdr


# Betting experiment: (y1, y2) is the alternative pair, (z1, z2) the null pair.
betting_tau, _ = betting_experiment(y1, y2, alphas, iters)
_, betting_fdr = betting_experiment(z1, z2, alphas, iters)

# Sequential permutation experiment at k = 250, 500 and 1000, in that order.
permb_250_tau, permb_250_fdr = _perm_tau_and_fdr(250)
permb_500_tau, permb_500_fdr = _perm_tau_and_fdr(500)
permb_1000_tau, permb_1000_fdr = _perm_tau_and_fdr(1000)

# Persist every result under its own name.
for result_name, result in [
    ('betting_loan_tau', betting_tau),
    ('betting_loan_fdr', betting_fdr),
    ('permb_250_loan_tau', permb_250_tau),
    ('permb_250_loan_fdr', permb_250_fdr),
    ('permb_500_loan_tau', permb_500_tau),
    ('permb_500_loan_fdr', permb_500_fdr),
    ('permb_1000_loan_tau', permb_1000_tau),
    ('permb_1000_loan_fdr', permb_1000_fdr),
]:
    save_results(result_name, result)

In [38]:
# Quick size check on betting_tau.
# NOTE(review): this cell's execution count (38) is lower than that of the
# experiment cell before it (42), so the displayed value may predate the
# latest run of that cell -- re-run top to bottom to confirm.
len(betting_tau)

190

# Distribution Shift

In [44]:
# Position at which each sequence switches from one prediction source to the other.
shift_time = 400

# Second model: standardise the features, then logistic regression. Fitting the
# pipeline fits both the scaler and the classifier on the training rows.
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
pipe.fit(X_train, y_train)
y1_lr, y2_lr = pipe.predict(X1), pipe.predict(X2)

# The first `shift_time` entries come from the mean-matched pair (z1, z2); every
# later entry is the logistic-regression prediction for the same position.
seq1 = np.concatenate([z1[:shift_time], y1_lr[shift_time:]])
seq2 = np.concatenate([z2[:shift_time], y2_lr[shift_time:]])

In [48]:
# This section uses a different `iters` value than the earlier experiments.
iters = 30

# Betting experiment on the shifted sequences; only the first return value is kept.
# NOTE(review): unlike the permutation runs below, this call is not given
# shift_time -- confirm against betting_experiment's signature that this is intended.
betting_shift, _ = betting_experiment(seq1, seq2, alphas, iters)

# Keyword arguments shared by all three permutation runs; only k varies.
perm_kwargs = dict(bonferroni=True, shift_time=shift_time)
permb_250_shift, _ = seq_perm_test_experiment(seq1, seq2, alphas, iters, k=250, **perm_kwargs)
permb_500_shift, _ = seq_perm_test_experiment(seq1, seq2, alphas, iters, k=500, **perm_kwargs)
permb_1000_shift, _ = seq_perm_test_experiment(seq1, seq2, alphas, iters, k=1000, **perm_kwargs)

# Persist every result under its own name.
for result_name, result in [
    ('betting_shift_loan', betting_shift),
    ('permb_250_shift_loan', permb_250_shift),
    ('permb_500_shift_loan', permb_500_shift),
    ('permb_1000_shift_loan', permb_1000_shift),
]:
    save_results(result_name, result)