In [1]:
import pandas as pd
from tqdm import tqdm
from experiments import Experiment
from folktables import ACSDataSource, ACSEmployment, ACSIncomePovertyRatio, ACSHealthInsurance

In [2]:
# Directory handed to folktables as the root for the ACS files.
root_dir = 'data'

# 2018 1-Year person-level ACS survey.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
    root_dir=root_dir,
)

# download=False: presumably the files must already exist under `root_dir`
# (nothing is fetched) -- TODO confirm against the folktables docs.
# No `states` argument is passed, so folktables' default selection applies.
acs_data = data_source.get_data(download=False)

In [3]:
# Experiment configuration shared by every run below.

# Column holding the protected group membership in each task DataFrame.
protected_attribute = "race"

# Prediction tasks; these names double as the keys of the task dict and as
# the target-column names of the per-task DataFrames.
task_types = [
    "employment",
    "income_poverty",
    "health_insurance",
]

# Model families to train. A wider list is currently disabled:
#   ["logistic", "gbm", "nn", "svm", "tree"]
model_types = ["logistic"]

# Train / test sizes handed to Experiment. Together they sum to 3,236,107,
# i.e. roughly an 80/20 split -- presumably the row count of the loaded
# data; TODO confirm.
n_train, n_test = 2588885, 647222

In [4]:
# Convert the raw ACS frame into arrays once per folktables task.
# Each result is an indexable bundle that the next cell reads as
# [0] feature matrix, [1] target labels, [2] group attribute.
employment, income_poverty, health_insurance = (
    task.df_to_numpy(acs_data)
    for task in (ACSEmployment, ACSIncomePovertyRatio, ACSHealthInsurance)
)

In [5]:
def _build_task_frame(task_arrays, target_name):
    """Assemble one task's DataFrame from its folktables array bundle.

    Parameters
    ----------
    task_arrays : sequence
        Result of ``<Task>.df_to_numpy(...)``; element 0 is used as the
        feature matrix, element 1 as the target labels and element 2 as the
        group attribute.
    target_name : str
        Name of the column that receives the integer-cast target labels.

    Returns
    -------
    pandas.DataFrame
        The feature columns (default integer column labels), followed by
        ``target_name`` (labels cast to int) and ``"race"`` (1 where the
        group code equals 1, otherwise 0).
    """
    frame = pd.DataFrame(task_arrays[0])

    # Target labels, stored as integers.
    frame[target_name] = task_arrays[1]
    frame[target_name] = frame[target_name].astype(int)

    # Binarise the group attribute: code 1 -> 1, everything else -> 0.
    # NOTE(review): in folktables' ACS tasks the group is presumably RAC1P,
    # where code 1 would be "White alone" -- confirm.
    frame["race"] = task_arrays[2]
    frame["race"] = (frame["race"] == 1).astype(int)
    return frame


# One frame per task, all built by the same recipe so they cannot drift apart.
df_employment = _build_task_frame(employment, "employment")
df_income_poverty = _build_task_frame(income_poverty, "income_poverty")
df_health_insurance = _build_task_frame(health_insurance, "health_insurance")

In [6]:
# Task name -> prepared DataFrame. The keys are the same strings listed in
# `task_types`; this mapping is what gets passed to Experiment as its first
# argument.
df = {
    "employment": df_employment,
    "income_poverty": df_income_poverty,
    "health_insurance": df_health_insurance,
}

# Randomness Based on Distance to Threshold

In [7]:
# Choose between the full parameter sweep and the quick single-configuration
# run. The full grid used to be assigned and then immediately overwritten, so
# only the quick configuration ever ran; False keeps exactly that behaviour.
RUN_FULL_SWEEP = False

if RUN_FULL_SWEEP:
    random_thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
    data_scales = [2500, 1000, 750, 500, 250, 100, 75, 50, 25, 10]
    random_seeds = list(range(5))
else:
    random_thresholds = [0]
    data_scales = [25000]
    random_seeds = list(range(1))

# Per-experiment outputs, concatenated once after the sweep (avoids reusing
# one name for both the list and the final DataFrame).
result_frames = []

for data_scale in tqdm(data_scales):
    for random_seed in tqdm(random_seeds):
        exp = Experiment(df, protected_attribute, task_types,
                         model_types, random_thresholds, n_train, n_test, random_seed, data_scale)

        # Models are trained first; the experiment_* calls below then run in
        # a fixed order: baseline, tasks, models, partitions.
        exp.pretrain_models()
        result_frames.append(exp.experiment_baseline())
        result_frames.append(exp.experiment_tasks())
        result_frames.append(exp.experiment_models())

        # The partitions argument is capped at 5 and never exceeds data_scale.
        # NOTE(review): presumably the number of data partitions -- confirm
        # in experiments.Experiment.
        result_frames.append(exp.experiment_partitions(min(5, data_scale)))

# Each experiment_* call is assumed to return something pd.concat accepts
# (e.g. a DataFrame) -- confirm in experiments.Experiment.
results = pd.concat(result_frames)
results.to_csv("acs_results.csv", index=False)

  0%|                                                     | 0/1 [00:00<?, ?it/s]
  0%|                                                     | 0/1 [00:00<?, ?it/s][A

employment logistic
income_poverty logistic
health_insurance logistic
Running Baseline Experiment
Running Tasks Experiment
Running Models Experiment
Running Data Partitions Experiment
employment logistic
income_poverty logistic
health_insurance logistic



100%|█████████████████████████████████████████████| 1/1 [00:45<00:00, 45.65s/it][A
100%|█████████████████████████████████████████████| 1/1 [00:45<00:00, 45.65s/it]


# Randomness Based on Conformal Prediction

In [11]:
# Single-configuration run for the conformal-prediction variant.
random_thresholds = [0.2]
data_scales = [25000]
random_seeds = list(range(1))

# Passed as the trailing positional argument to Experiment.
# NOTE(review): presumably switches the randomisation to the conformal
# prediction scheme named in the section heading -- confirm in
# experiments.Experiment.
conformal_pred = True

results = []

for data_scale in tqdm(data_scales):
    # Argument for experiment_partitions: capped at 5, never above data_scale.
    partition_arg = min(5, data_scale)

    for random_seed in tqdm(random_seeds):
        exp = Experiment(
            df, protected_attribute, task_types, model_types,
            random_thresholds, n_train, n_test, random_seed, data_scale,
            conformal_pred,
        )

        # Train first, then run the experiments in a fixed order
        # (baseline, tasks, models, partitions) and keep every output.
        exp.pretrain_models()
        results.extend([
            exp.experiment_baseline(),
            exp.experiment_tasks(),
            exp.experiment_models(),
            exp.experiment_partitions(partition_arg),
        ])

# Stack all collected outputs and persist them without the index column.
results = pd.concat(results)
results.to_csv("acs_results_conformal.csv", index=False)

  0%|                                                     | 0/1 [00:00<?, ?it/s]
  0%|                                                     | 0/1 [00:00<?, ?it/s][A

employment logistic
income_poverty logistic
health_insurance logistic
Running Baseline Experiment
Running Tasks Experiment
Running Models Experiment
Running Data Partitions Experiment
employment logistic
income_poverty logistic
health_insurance logistic



100%|████████████████████████████████████████████| 1/1 [03:34<00:00, 214.21s/it][A
100%|████████████████████████████████████████████| 1/1 [03:34<00:00, 214.21s/it]
