In [59]:
from folktables import ACSDataSource, ACSEmployment
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import math
import pickle

In [2]:
# State/territory postal codes handed to `ACSDataSource.get_data`.
# The position in this list is what `Teacher` uses as its id
# (`states[tchr_id]`), so the order must not be changed.
states = [
    "HI", "CA", "PR", "NV", "NM", "OK", "NY", "WA", "AZ", "MD",
    "TX", "VA", "MA", "GA", "CT", "OR", "IL", "RI", "NC", "CO",
    "DE", "LA", "UT", "FL", "MS", "SC", "AR", "SD", "AL", "MI",
    "KS", "ID", "MN", "MT", "OH", "IN", "TN", "PA", "NE", "MO",
    "WY", "ND", "WI", "KY", "NH", "ME", "IA", "VT", "WV",
]

In [66]:
# Index into `states` of the state used for the stand-alone baseline model
# trained in a later cell via `define_model(..., index=st_ind)`.
st_ind = 0
state = states[st_ind]
# `data_src` is also read as a global by `Teacher.get_dataset`.
data_src = ACSDataSource(survey_year="2018", horizon="1-Year", survey="person")
# download=True lets folktables fetch the files — presumably only when they
# are not already cached locally; confirm against the folktables docs.
acs_data = data_src.get_data(states=[state], download=True)
features, labels, group = ACSEmployment.df_to_numpy(acs_data)
# 80/20 split with a fixed seed; the `s_*` arrays carry the group column so
# per-group metrics can be computed on the test part.
x_train, x_test, y_train, y_test, s_train, s_test = train_test_split(
    features, labels, group, test_size=0.2, random_state=0
)
# Dirichlet concentration parameters, read as a global by
# `Teacher.get_dataset` when a Teacher is built with fair=False.
alpha = [100, 100]

In [67]:
class Teacher:
    """A per-state "teacher": one state's ACSEmployment data and a classifier trained on it.

    The class reads three notebook-level globals: ``data_src`` and ``states``
    (to fetch the data of ``states[id]``) and, for the ``fair=False`` variant,
    ``alpha`` (Dirichlet concentration used when resampling). It also calls the
    notebook-level helpers ``mean`` and ``fairness``.

    Attributes:
        tchr_id: index into ``states``; also used as the Keras random seed.
        status: the ``fair`` flag from the constructor; ``False`` enables the
            resampling step in ``get_dataset``.
        dataset: ``(features, labels, group)`` as NumPy arrays.
        splited_data: ``(x_train, x_test, y_train, y_test, s_train, s_test)``.
        local_s: one-row DataFrame with columns ``ID``, ``P_PLBLS``, ``UP_PLBLS``.
        local_m: scalar written by ``update_local_m`` (0 until then).
        metrics: dict returned by ``fairness`` once ``train_model`` has run.
        model: compiled Keras model; ``None`` until ``train_model`` has run.
        nk: number of test rows; set by ``update_local_m``.
    """

    def __init__(self, id: int, fair=True):
        """Load, optionally resample, and split the data of ``states[id]``.

        Args:
            id: position of the state in the global ``states`` list.
            fair: when ``False`` the positive rows of group codes 1 and 2 are
                resampled (see ``get_dataset``).
        """
        self.tchr_id = id
        self.local_s = []
        self.local_m = 0
        self.metrics = {}
        self.status = fair
        # Defined up front so "not trained yet" is an explicit state instead of
        # a missing attribute.
        self.model = None
        self.dataset = self.get_dataset()
        self.splited_data = () # ( x_train, x_test, y_train, y_test, s_train, s_test )
        self.split_dataset()

    def define_model(self):
        """Build and compile the dense binary classifier for this teacher.

        The Keras random seed is set to ``tchr_id`` before the layers are
        created.

        Returns:
            A compiled ``tf.keras`` Sequential model with a single sigmoid
            output, binary cross-entropy loss, and the metrics ``accuracy``
            and ``recall`` (in that order).
        """
        input_shape = self.splited_data[0].shape[1:]
        tf.keras.utils.set_random_seed(self.tchr_id)
        model = tf.keras.models.Sequential([
            tf.keras.Input(input_shape),
            tf.keras.layers.Dense(16, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(16, activation="relu"),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                loss=tf.keras.losses.BinaryCrossentropy(),
                metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy"), tf.keras.metrics.Recall(name="recall")])

        return model

    def get_dataset(self):
        """Fetch this teacher's state data and, if ``status`` is False, resample it.

        With ``status`` False, the rows with a positive ``ESR`` label and
        ``RAC1P`` equal to 1 or 2 are replaced by the same number of rows drawn
        with replacement, where the share coming from group 1 is the first
        component of one Dirichlet(``alpha``) draw. All other rows are kept.
        Neither the Dirichlet draw nor the row sampling is seeded, so repeated
        calls give different datasets.

        Returns:
            ``(features, labels, group)`` as NumPy arrays in both modes.
        """
        df = data_src.get_data(states=[states[self.tchr_id]], download=True)
        features, labels, group = ACSEmployment.df_to_numpy(df)
        if not self.status:
            df = pd.DataFrame(features, columns=ACSEmployment.features)
            df[ACSEmployment.target] = labels

            # The `p_` / `up_` prefixes presumably mean privileged / unprivileged.
            p_grp_pr = df[(df["RAC1P"] == 1) & (df["ESR"] == True)]
            up_grp_pr = df[(df["RAC1P"] == 2) & (df["ESR"] == True)]
            rest_of_df = df[((df["RAC1P"] != 1) & (df["RAC1P"] != 2)) | (df["ESR"] == False)]
            p_vs_up = pd.concat([p_grp_pr, up_grp_pr])

            # `alpha` is a notebook-level global.
            dist = np.random.dirichlet(alpha, 1)
            size_p_grp = int(dist[0][0]*p_vs_up.shape[0])
            size_up_grp = p_vs_up.shape[0]-size_p_grp

            p_grp = p_grp_pr.sample(size_p_grp, replace=True)
            up_grp = up_grp_pr.sample(size_up_grp, replace=True)
            final_df = pd.concat([p_grp, up_grp, rest_of_df])

            # Hand back plain arrays, as in the fair branch: sampling with
            # replacement leaves duplicated index labels that would otherwise
            # leak into every later boolean mask built from these objects.
            labels = np.array(final_df.pop("ESR"))
            features = final_df.to_numpy()
            group = final_df["RAC1P"].to_numpy()

        return features, labels, group

    def split_dataset(self):
        """Create the 80/20 train/test split and this teacher's ``local_s`` row.

        ``P_PLBLS`` / ``UP_PLBLS`` hold the fraction of all local samples that
        carry a positive label and belong to group code 1 / 2.
        """
        features, labels, group = self.dataset
        self.splited_data = train_test_split(
            features, labels, group, test_size=0.2, random_state=0
        )
        positive = np.asarray(labels) == 1
        groups = np.asarray(group)
        # Average the membership mask itself (a proportion), not the feature
        # values of the selected rows.
        # NOTE(review): a proportion is assumed here; confirm whether the
        # aggregation in update_local_m expects raw counts instead.
        p_plabels = mean((groups == 1) & positive)
        up_plabels = mean((groups == 2) & positive)
        self.local_s = pd.DataFrame(data={"ID": [self.tchr_id], "P_PLBLS": [p_plabels], "UP_PLBLS": [up_plabels]})

    def train_model(self, epochs=100):
        """Fit a freshly built model on the training split and record test metrics.

        Args:
            epochs: number of training epochs (default 100).

        Side effects:
            Sets ``self.model`` and ``self.metrics`` (the dict returned by the
            notebook-level ``fairness`` helper).
        """
        x_train, x_test, y_train, y_test, _, s_test = self.splited_data
        self.model = self.define_model()
        self.model.fit(x_train, y_train, epochs=epochs, verbose=False)

        self.metrics = fairness(self.model, x_test, y_test, s_test)

    def update_local_m(self, S, sum_n):
        """Compute this teacher's ``local_m`` from the pooled statistics ``S``.

        Args:
            S: DataFrame with columns ``ID``, ``P_PLBLS`` and ``UP_PLBLS``
                (the layout of ``local_s``) — presumably the ``local_s`` rows
                of all teachers stacked together; verify against the caller.
            sum_n: divisor applied to ``nk`` (the local test-set size).

        Side effects:
            Sets ``self.nk`` and ``self.local_m`` (a Python float). A zero
            denominator yields ``inf``/``nan`` rather than an exception.

        Raises:
            RuntimeError: if ``train_model`` has not been run yet.
        """
        model = getattr(self, "model", None)
        if model is None:
            raise RuntimeError("train_model() must be called before update_local_m()")

        _, x_test, _, y_test, _, s_test = self.splited_data
        yhat = np.round(model.predict(x_test))
        positive = np.asarray(y_test) == 1
        groups = np.asarray(s_test)
        p_tp = mean(yhat[(groups == 1) & positive])
        up_tp = mean(yhat[(groups == 2) & positive])

        # Reduce to NumPy scalars so local_m ends up a plain number instead of
        # a one-element Series carrying S's row label.
        own = (S["ID"] == self.tchr_id).to_numpy()
        p_col = S["P_PLBLS"].to_numpy()
        up_col = S["UP_PLBLS"].to_numpy()
        p_plabels = np.float64(p_col[own].sum())
        up_plabels = np.float64(up_col[own].sum())
        others_p_plabels = np.float64(p_col[~own].sum())
        others_up_plabels = np.float64(up_col[~own].sum())

        a = p_tp*p_plabels/others_p_plabels
        b = up_tp*up_plabels/others_up_plabels

        self.nk = x_test.shape[0]

        self.local_m = float((b-a)*self.nk/sum_n)

In [47]:
def mean(myarray):
    """Return the mean over all elements of ``myarray``, or 0 when it is undefined.

    Args:
        myarray: any array-like (list, ndarray, pandas object); it is
            flattened conceptually, i.e. the mean is taken over every element.

    Returns:
        The NumPy mean, or ``0`` when the input is empty or the mean is NaN
        (for example because the data contains NaN).
    """
    values = np.asarray(myarray)
    if values.size == 0:
        # An empty selection (e.g. a group with no matching rows) would make
        # np.mean emit a "Mean of empty slice" RuntimeWarning and return NaN;
        # go straight to the 0 that the NaN check below would produce.
        return 0
    mn = np.mean(values)
    return 0 if math.isnan(mn) else mn

def fairness(model, x_test, y_test, group_test):
    """Evaluate ``model`` on the test split and summarise accuracy and group gaps.

    Args:
        model: object exposing ``predict`` and ``evaluate``; entries 1 and 2 of
            the ``evaluate`` result are reported as accuracy and recall.
        x_test: test features.
        y_test: test labels (compared against 1).
        group_test: group code per test row; codes 1 and 2 are compared.

    Returns:
        dict with keys ``EOD``, ``SPD``, ``ACC`` and ``REC``, each a float
        rounded to four decimals.
    """
    def four_decimals(value):
        return float(format(value, ".4f"))

    predicted = np.round(model.predict(x_test))
    scores = model.evaluate(x_test, y_test)
    accuracy = four_decimals(scores[1])
    recall = four_decimals(scores[2])

    # Equal opportunity difference: gap between the two groups' mean
    # prediction on rows whose true label is 1.
    tpr_group_1 = mean(predicted[(y_test == 1) & (group_test == 1)])
    tpr_group_2 = mean(predicted[(y_test == 1) & (group_test == 2)])
    equal_opportunity_gap = four_decimals(abs(tpr_group_1 - tpr_group_2))

    # Statistical parity difference: gap between the two groups' mean
    # prediction over all of their rows.
    rate_group_1 = mean(predicted[(group_test == 1)])
    rate_group_2 = mean(predicted[(group_test == 2)])
    parity_gap = four_decimals(abs(rate_group_1 - rate_group_2))

    return {
        "EOD": equal_opportunity_gap,
        "SPD": parity_gap,
        "ACC": accuracy,
        "REC": recall,
    }

In [62]:
# Teacher id 1 selects states[1]; the constructor fetches and splits that
# state's data (fair=True, so no resampling is applied).
T = Teacher(1)
# Fits the model and fills T.metrics with the values returned by fairness().
T.train_model()

[1m2368/2368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 381us/step
[1m2368/2368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 374us/step - accuracy: 0.8206 - loss: 0.3889 - recall: 0.8478


In [63]:
# Persist the trained teacher under ../checkpoint/<STATE>/. A teacher whose
# EOD is below 0.1 is stored as the "fair" one, anything else as "unfair".
# `path` and `fair` stay notebook-level names because later cells read them.
state_code = states[T.tchr_id]
path = "../checkpoint/" + state_code + "/"
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(path, exist_ok=True)

fair = T.metrics["EOD"] < 0.1
if fair:
    model_file, pickle_file, verdict = "fair_model.keras", state_code + "_fair.pkl", "Fair"
else:
    model_file, pickle_file, verdict = "unfair_model.keras", state_code + ".pkl", "Unfair"

# save the model
T.model.save(path + model_file)
# The pickle contains the whole Teacher object (dataset, split and model).
with open(path + pickle_file, "wb") as f:
    pickle.dump(T, f, pickle.HIGHEST_PROTOCOL)
print(f"{verdict} model for {state_code} saved")

Fair model for CA saved


In [65]:
# Display the metrics dict of the current teacher next to the fair/unfair
# verdict set by the checkpoint cell.
T.metrics, fair

({'EOD': 0.0127, 'SPD': 0.0359, 'ACC': 0.8196, 'REC': 0.8465}, True)

In [68]:
# Dirichlet concentration read (as a global) by Teacher.get_dataset when
# fair=False; change it here before re-running to get a different group mix.
alpha = [100,100]
if fair: # a fair model has already been saved
    # Same state (id 1), but built on the resampled dataset.
    T = Teacher(1, False)
    T.train_model()
    # Keep the resampled teacher only if its EOD exceeds 0.1.
    if T.metrics["EOD"] > 0.1:
        T.model.save(path + "unfair_model.keras")
        with open(path+states[T.tchr_id]+".pkl", "wb") as f:
            pickle.dump(T, f, pickle.HIGHEST_PROTOCOL)
        print(f"Unfair model for {states[T.tchr_id]} saved")
    else:
        print("Not yet :(")
else:
    # TODO: the first model was already unfair; try a different model and
    # distribution here (not implemented).
    pass

KeyboardInterrupt: 

In [22]:
def define_model(input_shape, index=0):
    """Build and compile the dense binary classifier used in this notebook.

    Args:
        input_shape: shape of one input sample, passed to ``tf.keras.Input``.
        index: value given to ``tf.keras.utils.set_random_seed`` before the
            layers are created.

    Returns:
        A compiled Sequential model: seven ReLU layers of widths
        16-32-64-128-64-32-16 followed by one sigmoid unit, trained with Adam
        (learning rate 1e-3) on binary cross-entropy and reporting the metrics
        ``accuracy`` and ``recall`` in that order.
    """
    tf.keras.utils.set_random_seed(index)

    hidden_widths = (16, 32, 64, 128, 64, 32, 16)
    stack = [tf.keras.Input(input_shape)]
    stack.extend(
        tf.keras.layers.Dense(width, activation="relu") for width in hidden_widths
    )
    stack.append(tf.keras.layers.Dense(1, activation='sigmoid'))
    model = tf.keras.models.Sequential(stack)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss = tf.keras.losses.BinaryCrossentropy()
    tracked = [
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Recall(name="recall"),
    ]
    model.compile(optimizer=optimizer, loss=loss, metrics=tracked)

    return model


In [25]:
# Stand-alone baseline on the split created in the data-loading cell; the
# state index doubles as the random seed.
model = define_model(x_train.shape[1:], index=st_ind)
history = model.fit(x_train, y_train, epochs=60, verbose=False)
# Accuracy, recall and the two group gaps on the held-out 20%.
metrics = fairness(model, x_test, y_test, s_test)
print(metrics)

[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419us/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410us/step - accuracy: 0.8101 - loss: 0.4566 - recall: 0.9381
{'EOD': 0.0215, 'SPD': 0.109, 'ACC': 0.809, 'REC': 0.932}
