In [None]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm, ensemble
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')
import itertools
from folktables import ACSDataSource, ACSEmployment, ACSIncomePovertyRatio, ACSHealthInsurance

# Load Data

In [None]:
# Split sizes. N_TEST is the default length of the per-individual outcome masks
# kept by Homogenization / ModelGroup below; N_TRAIN is not referenced in the
# code visible here.
# NOTE(review): presumably the 80% / 20% split sizes of the ACS sample loaded
# below — confirm against the actual data.
N_TRAIN = 2588885
N_TEST = 647222
# Prediction tasks; these are exactly the keys format_data() accepts.
TASK_TYPES = ["employment", "income-poverty", "health-insurance"]

# Load the 2018 1-Year ACS person survey through folktables into `root_dir`.
# NOTE(review): download=True presumably only fetches files that are missing — confirm.
root_dir = 'data'
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person', root_dir = root_dir)
acs_data = data_source.get_data(download=True)

# Experiment Setup

In [None]:
# Classifier families understood by train_model().
MODEL_TYPES = ["logistic", "gbm", "nn", "svm", "tree"]
# Half-widths of the band around a score of 0.5 inside which decisions are
# randomized (each value becomes one Homogenization tracker's `random_distance`);
# 0 means plain deterministic thresholding at 0.5.
RANDOM_THRESHOLDS = [0, 0.1, 0.2, 0.3, 0.4, 0.5]

In [None]:
def format_data(acs_data, application_name, seed):
    """Turn raw ACS data into train/test arrays for one prediction task.

    Parameters
    ----------
    acs_data
        Raw survey data handed to the task's ``df_to_numpy``.
    application_name : str
        One of 'employment', 'income-poverty', 'health-insurance'.
    seed : int
        ``random_state`` of the 80/20 train/test split.

    Returns
    -------
    dict
        Keys 'X_tr', 'X_test' (features), 'y_tr', 'y_test' (labels) and
        'z_tr', 'z_test' (0/1 indicator of the group array being equal to 1).
    """
    tasks = {
        'employment': ACSEmployment,
        'income-poverty': ACSIncomePovertyRatio,
        'health-insurance': ACSHealthInsurance,
    }
    matrices = tasks[application_name].df_to_numpy(acs_data)

    # One frame holding features, label and group so a single split keeps rows aligned.
    frame = pd.DataFrame(matrices[0])
    frame["y"] = matrices[1]
    frame["race"] = matrices[2]
    frame["race"] = frame["race"].eq(1).astype(int)

    train_part, test_part = sklearn.model_selection.train_test_split(
        frame, test_size=0.2, random_state=seed
    )

    def _to_arrays(part):
        # (features, labels, group indicator) of one side of the split.
        features = part.drop(columns=["y", "race"]).to_numpy()
        return features, part["y"].to_numpy(), part["race"].to_numpy()

    X_train, y_train, z_train = _to_arrays(train_part)
    X_test, y_test, z_test = _to_arrays(test_part)

    return {
        'X_tr': X_train,
        'X_test': X_test,
        'y_tr': y_train,
        'y_test': y_test,
        'z_tr': z_train,
        'z_test': z_test,
    }

In [None]:
def fixed_partition(data, seed, data_scale):
    """Keep one contiguous 1/data_scale block of the shuffled training set.

    The training rows are shuffled with a ``seed``-specific RandomState, cut
    into ``data_scale`` blocks of ``N // data_scale`` rows, and the block whose
    index is drawn from ``random.Random(seed)`` is kept. The test arrays are
    passed through untouched.

    Parameters
    ----------
    data : dict
        Output of format_data ('X_tr', 'y_tr', 'z_tr', 'X_test', 'y_test', 'z_test').
    seed : int
        Seeds both the block choice and the row shuffle.
    data_scale : number
        How many blocks the training set is divided into.

    Returns
    -------
    dict
        Same keys as ``data``; the training entries hold only the chosen block
        (taken from the stacked array, so they share its dtype).
    """
    # Which block to keep.
    block_index = random.Random(seed).randint(0, int(data_scale) - 1)

    # Stack features, labels and group as columns so one shuffle keeps rows aligned.
    label_column = np.reshape(data['y_tr'], (-1, 1))
    group_column = np.reshape(data['z_tr'], (-1, 1))
    stacked = np.concatenate((data['X_tr'], label_column, group_column), axis=1)
    np.random.RandomState(seed=seed).shuffle(stacked)

    block_length = int(len(stacked) // data_scale)
    first_row = block_length * block_index
    rows = slice(first_row, first_row + block_length)

    return {
        'X_tr': stacked[rows, :-2],
        'X_test': data['X_test'],
        'y_tr': stacked[rows, -2],
        'y_test': data['y_test'],
        'z_tr': stacked[rows, -1],
        'z_test': data['z_test'],
    }

In [None]:
def train_model(data, seed, method="logistic"):
    """Fit a calibrated classifier on the training partition and score the test set.

    The chosen estimator is wrapped in a StandardScaler pipeline and then in
    CalibratedClassifierCV (default settings) before fitting.

    Parameters
    ----------
    data : dict
        Must provide 'X_tr', 'y_tr' (training) and 'X_test' (rows to score).
    seed : int
        ``random_state`` of the underlying estimator.
    method : str
        One of 'logistic', 'gbm', 'svm', 'nn', 'tree'.

    Returns
    -------
    list
        Column 1 of ``predict_proba`` for every row of ``data['X_test']``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. This is checked
        before any training so a typo fails immediately and clearly.
    """
    if method == 'logistic':
        estimator_cls = LogisticRegression
    elif method == 'gbm':
        estimator_cls = ensemble.GradientBoostingClassifier
    elif method == 'svm':
        estimator_cls = svm.SVC
    elif method == 'nn':
        estimator_cls = MLPClassifier
    elif method == 'tree':
        estimator_cls = DecisionTreeClassifier
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected one of "
            "'logistic', 'gbm', 'svm', 'nn', 'tree'"
        )

    pipeline = make_pipeline(StandardScaler(), estimator_cls(random_state=seed))
    model = CalibratedClassifierCV(pipeline)
    model.fit(data['X_tr'], data['y_tr'])

    probabilities = model.predict_proba(data['X_test'])
    # Keep the original return type: a plain list of the column-1 scores.
    return list(probabilities[:, 1])

In [None]:
class Homogenization:
    """Accumulates the decisions of several models over one test population.

    Each call to ``update_metrics`` adds one model. The tracker keeps per-model
    accuracy / acceptance / fairness values and two boolean masks over the test
    individuals: ``systemic_success`` (success under every model so far) and
    ``systemic_failure`` (failure under every model so far). ``failure_rate``
    is the running product of the individual models' failure rates.

    Parameters
    ----------
    random_distance : float
        Half-width of the band around 0.5 in which decisions are randomized.
    num_models, data_scale, data_seed, model_type, application_type,
    model_seed, partition_seed
        Experiment metadata; stored and echoed back by ``final_metrics``.
    size : int
        Number of test individuals (length of the systemic masks).
    """

    def __init__(self, random_distance, num_models, data_scale, data_seed, model_type=None, application_type=None, model_seed=None, partition_seed=None, size=N_TEST):
        self.random_distance = random_distance
        self.num_models = num_models
        self.data_scale = data_scale
        self.data_seed = data_seed
        self.model_seed = model_seed
        self.partition_seed = partition_seed
        self.model_type = model_type
        self.application_type = application_type

        # One entry per model added through update_metrics.
        self.accuracy = []
        self.acceptance = []

        # Start all-True; every model can only switch entries off.
        self.systemic_success = np.ones(size, dtype=bool)
        self.systemic_failure = np.ones(size, dtype=bool)

        self.failure_rate = 1

        self.fairness_spd = []
        self.fairness_eop = []

    def get_predictions(self, risk_scores):
        """Convert risk scores into 0/1 decisions.

        Scores strictly inside (0.5 - random_distance, 0.5 + random_distance)
        get a Bernoulli(score) draw from the global numpy RNG; all other
        scores are thresholded (score >= 0.5 -> 1, else 0).

        Returns an integer numpy array with one decision per score.
        """
        scores = np.asarray(risk_scores, dtype=float)
        pred = (scores >= 0.5).astype(int)

        in_band = (scores > 0.5 - self.random_distance) & (scores < 0.5 + self.random_distance)
        if in_band.any():
            pred[in_band] = np.random.binomial(1, scores[in_band])
        return pred

    @staticmethod
    def _positive_rate_gap(pred, group):
        """Positive-decision rate of group == 1 minus that of group == 0."""
        return pred[group == 1].mean() - pred[group == 0].mean()

    def get_fairness_metrics(self, partition, pred):
        """Return (spd, eop) for the given decisions.

        spd is the gap in positive-decision rate between the z_test == 1 and
        z_test == 0 groups over all test rows; eop is the same gap restricted
        to rows whose y_test equals 1. An empty group yields nan.
        """
        group = np.ravel(partition["z_test"])
        y_true = np.ravel(partition["y_test"])
        pred = np.ravel(pred)

        spd = self._positive_rate_gap(pred, group)

        positives = (y_true == 1)
        eop = self._positive_rate_gap(pred[positives], group[positives])

        return spd, eop

    def update_metrics(self, partition, scores, method="lockout"):
        """Add one model's scores to the running metrics.

        method="lockout": failure means a 0 decision, success a 1 decision.
        method="inaccurate": failure means the decision differs from y_test.

        Raises
        ------
        ValueError
            If ``method`` is neither "lockout" nor "inaccurate" (previously
            silently ignored, leaving the systemic masks untouched), or if the
            number of scores does not match the tracker's ``size``.
        """
        # Validate before drawing random numbers or appending anything.
        if method not in ("lockout", "inaccurate"):
            raise ValueError(f"Unknown method {method!r}; expected 'lockout' or 'inaccurate'")

        pred = self.get_predictions(scores)
        if len(pred) != len(self.systemic_success):
            raise ValueError(
                f"got {len(pred)} predictions but this tracker was created with "
                f"size={len(self.systemic_success)}"
            )

        y_true = np.asarray(partition["y_test"])
        spd, eop = self.get_fairness_metrics(partition, pred)

        self.accuracy.append(np.mean(pred == y_true))
        self.acceptance.append(np.mean(pred))

        self.fairness_spd.append(spd)
        self.fairness_eop.append(eop)

        if method == "lockout":
            success, failure = (pred == 1), (pred == 0)
        else:
            success, failure = (pred == y_true), (pred != y_true)

        self.failure_rate *= np.mean(failure)
        self.systemic_success &= success
        self.systemic_failure &= failure

    def final_metrics(self):
        """Summarize everything collected so far as one result row (a dict).

        Besides the stored metadata and the per-model means, the row holds:
        systemic_success / systemic_failure (fraction of individuals that
        succeeded / failed under every model), multiplicity (the remainder),
        failure_rate (product of individual failure rates),
        homogenization_expected_failure (systemic_failure / failure_rate) and
        homogenization_avg_failure (systemic_failure / (1 - mean acceptance)).
        The two ratios are left to numpy's division rules, so a zero
        denominator gives inf/nan rather than an exception.
        """
        systemic_success = np.mean(self.systemic_success)
        systemic_failure = np.mean(self.systemic_failure)
        acceptance = np.mean(self.acceptance)

        return {
            "random_distance": self.random_distance,
            "num_models": self.num_models,
            "data_scale": self.data_scale,
            "data_seed": self.data_seed,
            "model_seed": self.model_seed,
            "partition_seed": self.partition_seed,
            "model_type": self.model_type,
            "application_type": self.application_type,
            "accuracy": np.mean(self.accuracy),
            "acceptance": acceptance,
            "fairness_spd": np.mean(self.fairness_spd),
            "fairness_eop": np.mean(self.fairness_eop),
            "systemic_success": systemic_success,
            "systemic_failure": systemic_failure,
            "multiplicity": 1 - systemic_success - systemic_failure,
            "failure_rate": self.failure_rate,
            "homogenization_expected_failure": systemic_failure / self.failure_rate,
            "homogenization_avg_failure": systemic_failure / (1 - acceptance),
        }

In [None]:
class ModelGroup:
    """A set of Homogenization trackers, one per randomization threshold.

    All trackers share the same metadata and receive the same scores; they
    differ only in their ``random_distance`` (taken from ``thresholds``).
    """

    def __init__(self, thresholds, method, num_models, data_scale, data_seed, model_type=None, application_type=None, model_seed=None, partition_seed=None, size=N_TEST):
        self.method = method
        self.models = [
            Homogenization(
                threshold,
                num_models,
                data_scale,
                data_seed,
                model_type,
                application_type,
                model_seed,
                partition_seed,
                size,
            )
            for threshold in thresholds
        ]

    def update_metrics(self, partition, risk_scores):
        """Feed one model's scores to every tracker, using the group's method."""
        for tracker in self.models:
            tracker.update_metrics(partition, risk_scores, self.method)

    def final_metrics(self):
        """Return one result dict per tracker, in threshold order."""
        return [tracker.final_metrics() for tracker in self.models]

# Experiment 1: Baseline (Same Model + Data + Prediction Task)

In [None]:
def experiment_baseline(num_models=2, data_scale=10, method="lockout"):
    """Baseline: the same model, data and task deployed ``num_models`` times.

    For every (model type, task, data seed) one classifier is trained and its
    scores are fed to a ModelGroup ``num_models`` times; repetitions can only
    differ through the randomized decisions near a score of 0.5.

    Parameters
    ----------
    num_models : int
        How many times the single set of scores is added to each group.
    data_scale : number
        Passed to fixed_partition (training data is cut into this many blocks).
    method : str
        Failure definition forwarded to ModelGroup ("lockout" or "inaccurate").

    Returns
    -------
    pandas.DataFrame
        One row per (model type, task, data seed, threshold).
    """
    data_seeds = list(range(5))
    partition_seed = 0
    model_seed = 0

    results = []

    for model_type in tqdm(MODEL_TYPES, leave=False, desc="model"):
        for application in TASK_TYPES:
            for data_seed in data_seeds:
                data = format_data(acs_data, application, data_seed)
                partition = fixed_partition(data, partition_seed, data_scale)
                risk_scores = train_model(partition, model_seed, model_type)

                # Record the task and seeds so rows can be attributed later, and
                # size the trackers from the real test set, not a constant.
                models = ModelGroup(
                    RANDOM_THRESHOLDS,
                    method,
                    num_models,
                    data_scale,
                    data_seed,
                    model_type=model_type,
                    application_type=application,
                    model_seed=model_seed,
                    partition_seed=partition_seed,
                    size=len(partition["y_test"]),
                )
                for _ in range(num_models):
                    models.update_metrics(partition, risk_scores)

                results.extend(models.final_metrics())

    return pd.DataFrame(results)

In [None]:
# Homogenization As Number of Models Grows

# NOTE(review): range(1, 2) contains only k = 1, so this "sweep" currently runs a
# single setting — confirm whether a wider range was intended.
NUM_MODELS = range(1, 2)

# "lockout" variant: failure means a 0 decision. One result frame per k, concatenated.
results = []
for k in tqdm(NUM_MODELS):
    exp = experiment_baseline(k, 10, "lockout")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("baseline_models_lockout.csv", index=False)

# "inaccurate" variant: failure means the decision differs from the true label.
# `results` / `results_df` are rebound, so the lockout frame only survives on disk.
results = []
for k in tqdm(NUM_MODELS):
    exp = experiment_baseline(k, 10, "inaccurate")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("baseline_models_inaccurate.csv", index=False)

In [None]:
# Homogenization As Data Scale Grows

# Values of `data_scale`: the shuffled training set is cut into this many blocks
# and one block is used, so larger values mean LESS training data per model.
NUM_DATA = [25000, 10000, 7500, 5000, 2500, 1000, 750, 500, 250, 100, 50, 10]

# "lockout" variant with two deployments of the same model.
results = []
for data_scale in tqdm(NUM_DATA):
    exp = experiment_baseline(2, data_scale, "lockout")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("baseline_data_lockout.csv", index=False)

# "inaccurate" variant; rebinds `results` / `results_df`.
results = []
for data_scale in tqdm(NUM_DATA):
    exp = experiment_baseline(2, data_scale, "inaccurate")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("baseline_data_inaccurate.csv", index=False)

# Experiment 2: Different Prediction Tasks

In [None]:
def experiment_tasks(num_models=2, data_scale=10, method="lockout"):
    """Same model family and individuals, one model per prediction task.

    For every (model type, data seed) a ModelGroup receives one set of scores
    per entry of TASK_TYPES, so each group always aggregates
    ``len(TASK_TYPES)`` models.

    Parameters
    ----------
    num_models : int
        Stored in the result rows only.
        NOTE(review): it does not control how many models are trained (that is
        always len(TASK_TYPES)); callers should pass the matching value.
    data_scale : number
        Passed to fixed_partition.
    method : str
        Failure definition forwarded to ModelGroup ("lockout" or "inaccurate").

    Returns
    -------
    pandas.DataFrame
        One row per (model type, data seed, threshold).
    """
    data_seeds = list(range(5))
    partition_seed = 0
    model_seed = 0

    results = []

    for model_type in tqdm(MODEL_TYPES, leave=False, desc="model"):
        for data_seed in data_seeds:
            # Record the seeds and the set of tasks the group spans (as a tuple,
            # the same way experiment_models records a tuple of model types).
            models = ModelGroup(
                RANDOM_THRESHOLDS,
                method,
                num_models,
                data_scale,
                data_seed,
                model_type=model_type,
                application_type=tuple(TASK_TYPES),
                model_seed=model_seed,
                partition_seed=partition_seed,
            )
            for application in TASK_TYPES:
                data = format_data(acs_data, application, data_seed)
                partition = fixed_partition(data, partition_seed, data_scale)
                risk_scores = train_model(partition, model_seed, model_type)

                models.update_metrics(partition, risk_scores)

            results.extend(models.final_metrics())

    return pd.DataFrame(results)

In [None]:
# Homogenization As Number of Models Grows

# Only k = 3 is run: experiment_tasks always trains one model per entry of
# TASK_TYPES (three), and k is just recorded in the output rows.
NUM_MODELS = [3]

# "lockout" variant: failure means a 0 decision.
results = []
for k in tqdm(NUM_MODELS):
    exp = experiment_tasks(k, 10, "lockout")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("tasks_models_lockout.csv", index=False)

# "inaccurate" variant: failure means the decision differs from the true label.
results = []
for k in tqdm(NUM_MODELS):
    exp = experiment_tasks(k, 10, "inaccurate")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("tasks_models_inaccurate.csv", index=False)

In [None]:
# Homogenization As Data Scale Grows

# Values of `data_scale` (number of blocks the training set is cut into; one
# block is used, so larger values mean less training data).
NUM_DATA = [25000, 10000, 7500, 5000, 2500, 1000, 750, 500, 250, 100, 50, 10]

# "lockout" variant across the three tasks.
results = []
for data_scale in tqdm(NUM_DATA):
    exp = experiment_tasks(3, data_scale, "lockout")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("tasks_data_lockout.csv", index=False)

# "inaccurate" variant; rebinds `results` / `results_df`.
results = []
for data_scale in tqdm(NUM_DATA):
    exp = experiment_tasks(3, data_scale, "inaccurate")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("tasks_data_inaccurate.csv", index=False)

# Experiment 3: Different Models

In [None]:
def experiment_models(num_models=2, data_scale=10, method="lockout"):
    """Same task and data, different model families deployed together.

    For every (task, data seed) each ``num_models``-sized combination of
    MODEL_TYPES forms one ModelGroup that receives the scores of its member
    model types.

    Parameters
    ----------
    num_models : int
        Size of the model-type combinations.
    data_scale : number
        Passed to fixed_partition.
    method : str
        Failure definition forwarded to ModelGroup ("lockout" or "inaccurate").

    Returns
    -------
    pandas.DataFrame
        One row per (task, data seed, model combination, threshold); the
        ``model_type`` column holds the combination tuple.
    """
    data_seeds = list(range(5))
    partition_seed = 0
    model_seed = 0

    model_groups = list(itertools.combinations(MODEL_TYPES, num_models))

    results = []

    for application in TASK_TYPES:
        for data_seed in data_seeds:
            data = format_data(acs_data, application, data_seed)
            partition = fixed_partition(data, partition_seed, data_scale)

            # Scores depend only on (partition, model_seed, model type), so train
            # each model type once here instead of once per combination.
            scores_by_model = {}

            for model_group in model_groups:
                models = ModelGroup(
                    RANDOM_THRESHOLDS,
                    method,
                    num_models,
                    data_scale,
                    data_seed,
                    model_type=model_group,
                    application_type=application,
                    model_seed=model_seed,
                    partition_seed=partition_seed,
                    size=len(partition["y_test"]),
                )

                for model_type in model_group:
                    if model_type not in scores_by_model:
                        scores_by_model[model_type] = train_model(partition, model_seed, model_type)
                    models.update_metrics(partition, scores_by_model[model_type])

                results.extend(models.final_metrics())

    return pd.DataFrame(results)

In [None]:
# Homogenization As Number of Models Grows

# k is the size of the model-type combinations; 5 equals len(MODEL_TYPES), i.e.
# the single combination containing every model family.
NUM_MODELS = [1, 2, 3, 4, 5]

# "lockout" variant: failure means a 0 decision.
results = []
for k in tqdm(NUM_MODELS):
    exp = experiment_models(k, 10, "lockout")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("models_models_lockout.csv", index=False)

# "inaccurate" variant: failure means the decision differs from the true label.
results = []
for k in tqdm(NUM_MODELS):
    exp = experiment_models(k, 10, "inaccurate")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("models_models_inaccurate.csv", index=False)

In [None]:
# Homogenization As Data Scale Grows

# Values of `data_scale` (number of blocks the training set is cut into; one
# block is used, so larger values mean less training data).
NUM_DATA = [25000, 10000, 7500, 5000, 2500, 1000, 750, 500, 250, 100, 50, 10]

# "lockout" variant over all pairs of model families.
results = []
for data_scale in tqdm(NUM_DATA):
    exp = experiment_models(2, data_scale, "lockout")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("models_data_lockout.csv", index=False)

# "inaccurate" variant; rebinds `results` / `results_df`.
results = []
for data_scale in tqdm(NUM_DATA):
    exp = experiment_models(2, data_scale, "inaccurate")
    results.append(exp)
results_df = pd.concat(results)

results_df.to_csv("models_data_inaccurate.csv", index=False)

# Experiment 4: Different Data Partitions