In [1]:
from folktables import ACSDataSource, ACSHealthInsurance
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import pickle
import os 
from glob import glob
import matplotlib as mpl 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import permutation_test
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from methods import * 

%load_ext autoreload
%autoreload 2

In [2]:
# Load the 2018 1-Year ACS person-level survey for California and convert it
# into the arrays of the ACSHealthInsurance task.
data_source = ACSDataSource(
    survey_year=2018,
    horizon='1-Year',
    survey='person',
)
acs_data = data_source.get_data(
    states=["CA"],
    download=True,  # presumably fetches the raw files when missing -- confirm in folktables docs
)

# Three values come back; `group` is not referenced by the cells visible here.
features, labels, group = ACSHealthInsurance.df_to_numpy(acs_data)


In [3]:
# Fit a random forest on a seeded random subsample of 2000 distinct rows.
# Seeding the global NumPy RNG makes the drawn row indices reproducible.
np.random.seed(42)
inds = np.random.choice(features.shape[0], size=2000, replace=False)

model = RandomForestClassifier(random_state=42)
# Kept as the last expression so the cell still echoes the fitted estimator.
model.fit(features[inds, :], labels[inds])

RandomForestClassifier(random_state=42)

In [4]:
# Split the rows whose label equals 1 by whether feature column 11 equals 1.
# NOTE(review): column 11 is a magic index -- confirm which attribute it
# denotes in the ACSHealthInsurance feature ordering.
label_is_one = labels == 1
col11_is_one = features[:, 11] == 1
col11_not_one = features[:, 11] != 1

# Model predictions for each group, stored as plain Python lists of 0/1 ints.
y1 = [int(bool(pred)) for pred in model.predict(features[col11_is_one & label_is_one])]
y2 = [int(bool(pred)) for pred in model.predict(features[col11_not_one & label_is_one])]

mean_y1 = np.mean(y1)
mean_y2 = np.mean(y2)

# Mean-matched pair: z1 is y1 itself (same list object), while z2 is y2 shifted
# by a constant so that both sequences share the same mean. Adding a NumPy
# scalar to the list turns z2 into a float ndarray.
z1 = y1
z2 = y2 + mean_y1 - mean_y2

# Displayed as the cell output: the last two entries should agree up to
# floating-point rounding.
mean_y1, mean_y2, np.mean(z1), np.mean(z2)

(0.13895962168061113,
 0.04567485090851068,
 0.13895962168061113,
 0.13895962168061116)

In [20]:
# Grid of 20 evenly spaced significance levels and the repetition count that
# are passed to every experiment below.
alphas = np.linspace(0.005, 0.1, 20)
iters = 30


def _census_perm_pair(k):
    """Run the Bonferroni sequential permutation experiment twice for one `k`.

    The first run uses the raw predictions (y1, y2) and keeps only the first
    returned value; the second run uses the mean-matched pair (z1, z2) and
    keeps only the second returned value.
    NOTE(review): judging by the result names these are stopping times (tau)
    and false-detection rates (fdr) -- confirm against `methods`.
    """
    tau, _ = seq_perm_test_experiment(y1, y2, alphas, iters, k=k, bonferroni=True)
    _, fdr = seq_perm_test_experiment(z1, z2, alphas, iters, k=k, bonferroni=True)
    return tau, fdr


# Same call order as before: k = 50, 100, 200, and within each k the (y1, y2)
# run precedes the (z1, z2) run.
permb_50_tau, permb_50_fdr = _census_perm_pair(50)
permb_100_tau, permb_100_fdr = _census_perm_pair(100)
permb_200_tau, permb_200_fdr = _census_perm_pair(200)

# Persist every result under its own key once all experiments have finished.
_census_results = (
    ('permb_50_census_tau', permb_50_tau),
    ('permb_50_census_fdr', permb_50_fdr),
    ('permb_100_census_tau', permb_100_tau),
    ('permb_100_census_fdr', permb_100_fdr),
    ('permb_200_census_tau', permb_200_tau),
    ('permb_200_census_fdr', permb_200_fdr),
)
for _key, _value in _census_results:
    save_results(_key, _value)


In [15]:
# Index at which each sequence switches from the mean-matched data (z1, z2)
# to the raw predictions (y1, y2).
shift_time = 400

# seq1/seq2: the first `shift_time` entries come from z1/z2, the remainder
# from y1/y2.
seq1, seq2 = (
    np.concatenate((matched[:shift_time], raw[shift_time:]))
    for matched, raw in ((z1, y1), (z2, y2))
)



In [19]:
# Repetition count for the shift experiments; `alphas` and `shift_time` come
# from earlier cells.
iters = 30


def _perm_shift(k):
    """Run the Bonferroni sequential permutation experiment on (seq1, seq2).

    Only the first of the two returned values is kept.
    """
    first, _ = seq_perm_test_experiment(
        seq1, seq2, alphas, iters, k=k, bonferroni=True, shift_time=shift_time
    )
    return first


# Alternative: betting-based experiment on the same shifted sequences.
betting_shift, _ = betting_experiment(seq1, seq2, alphas, iters, shift_time=shift_time)

# Permutation experiments in the same order as before: k = 50, 100, 200.
permb_50_shift = _perm_shift(50)
permb_100_shift = _perm_shift(100)
permb_200_shift = _perm_shift(200)

# Persist every result under its own key once all experiments have finished.
_shift_results = (
    ('betting_shift_census', betting_shift),
    ('permb_50_shift_census', permb_50_shift),
    ('permb_100_shift_census', permb_100_shift),
    ('permb_200_shift_census', permb_200_shift),
)
for _key, _value in _shift_results:
    save_results(_key, _value)