In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import pickle
import os 
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import permutation_test
from methods import * 

%load_ext autoreload
%autoreload 2

In [None]:
# UCI "default of credit card clients" spreadsheet, fetched over HTTP.
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"

# header=1 takes the column names from the second row of the sheet.
# PAY_0 is renamed to PAY_1, the target column is shortened to "default",
# and the ID column is removed.
dataset = (
    pd.read_excel(data_url, header=1)
    .rename(columns={"default payment next month": "default", "PAY_0": "PAY_1"})
    .drop(columns="ID")
)

dataset.shape

In [None]:
# Preview the first five rows of the cleaned frame.
dataset.head(n=5)

In [None]:
# Draw 10000 distinct row positions for the training subset. A dedicated,
# seeded Generator makes the split identical on every Restart & Run All
# without touching NumPy's global random state.
TRAIN_SEED = 0
train_inds = np.random.default_rng(TRAIN_SEED).choice(
    dataset.shape[0], size=10000, replace=False
)
# train_inds are row positions, hence positional indexing with .iloc.
df_train = dataset.iloc[train_inds]

# Features are every column except the target; the target is "default".
X_train = df_train.drop(columns='default')
y_train = df_train['default']

In [None]:
# Sanity check: feature-matrix and target shapes of the training subset.
(X_train.shape, y_train.shape)

In [None]:
# Fit a random forest on the training subset. random_state is fixed so that
# the fitted model, and therefore the predictions derived from it, are the
# same on every run given the same training data.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

In [None]:
# List the distinct codes present in the EDUCATION column.
np.unique(dataset["EDUCATION"])

In [None]:
# Held-out rows: everything that was not selected for training. Membership is
# checked against df_train's own index labels rather than the positional
# train_inds, so the split stays correct even if dataset's index is not the
# default 0..n-1 range.
df_test = dataset[~dataset.index.isin(df_train.index)]

In [None]:
# Two held-out groups, both restricted to rows whose true label is 0:
#   df1 -> EDUCATION code <= 1, df2 -> EDUCATION code >= 3.
# NOTE(review): the meaning of the EDUCATION codes is not shown here;
# confirm against the dataset description.
df1 = df_test.loc[(df_test["default"] == 0) & (df_test["EDUCATION"] <= 1)]
df2 = df_test.loc[(df_test["default"] == 0) & (df_test["EDUCATION"] >= 3)]

# Strip the target column to get the feature matrices passed to the classifier.
X1 = df1.drop(columns=["default"])
X2 = df2.drop(columns=["default"])
(X1.shape, X2.shape)

In [None]:
# Classifier predictions for each of the two groups.
y1, y2 = clf.predict(X1), clf.predict(X2)

In [None]:
# Mean predicted label in each group.
print(y1.mean(), y2.mean())

In [None]:
# Grid of 20 alpha values from 0.005 to 0.1 (inclusive).
alphas = np.linspace(0.005, 0.1, num=20)
# Passed to every experiment helper; presumably the number of repetitions
# per alpha (confirm in methods.py).
iters = 20

# Betting-based experiment on the two prediction streams.
betting_results = betting_experiment(y1, y2, alphas, iters)
save_results('betting', betting_results)

# Sequential permutation-test experiment for several values of k, each with
# bonferroni=True. Run order is 500, 250, 1000, 1500, and every result is
# saved under its own tag right after it is computed.
perm_results = {}
for tag, k in (
    ('perm_500', 500),
    ('perm_250', 250),
    ('perm_1000', 1000),
    ('perm_1500', 1500),
):
    perm_results[k] = seq_perm_test_experiment(
        y1, y2, alphas, iters, k=k, bonferroni=True
    )
    save_results(tag, perm_results[k])

# Individual names used by the plotting cell.
perm_500_results = perm_results[500]
perm_250_results = perm_results[250]
perm_1000_results = perm_results[1000]
perm_1500_results = perm_results[1500]

In [None]:
# Overlay one curve per method against alpha on the current pyplot figure.
# plot_std=False is passed for every curve (presumably it hides the spread
# band; confirm in methods.py). Legend labels use the same "Perm. Test"
# spelling for every k.
plt_mean_std(plt, betting_results, alphas, 'Betting', color='navy', plot_std=False)
plt_mean_std(plt, perm_500_results, alphas, 'Perm. Test, $k=500$', color='green', plot_std=False)
plt_mean_std(plt, perm_250_results, alphas, 'Perm. Test, $k=250$', color='yellow', plot_std=False)
plt_mean_std(plt, perm_1000_results, alphas, 'Perm. Test, $k=1000$', color='blue', plot_std=False)
plt_mean_std(plt, perm_1500_results, alphas, 'Perm. Test, $k=1500$', color='orange', plot_std=False)


plt.legend()
plt.xlabel('$\\alpha$')
# Trailing semicolon keeps the Text(...) repr out of the cell output.
plt.ylabel('$\\tau$');

# Distribution shift 