In [1]:
import pandas as pd
import numpy as np
import warnings
from functions import metrics
import csv
from sklearn.exceptions import ConvergenceWarning
from functions.formatting import get_subgroup_str
from itertools import product, combinations
import matplotlib.pyplot as plt
import ast

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
# Run-configuration flags.
# NOTE(review): the calc_metrics calls visible in this notebook pass literal
# True/False values instead of reading these flags — confirm whether they are
# still meant to drive the analysis.
omit_demographics = True
use_gerryfair = False

In [88]:
# Load the preprocessed table and discard the identifier / stray index columns.
df = pd.read_csv('./data/preprocessed.csv').drop(
    columns=['umich_user_id', 'Unnamed: 0']
)

In [89]:
# Separate the prediction target from the feature matrix.
y = df['completed']
X = df.drop(columns='completed')

#### Calculate false positive rates for the given subgroup

This function takes the subgroup to evaluate as a list of tuples. It uses logistic regression with 10-fold cross-validation to calculate the average false positive rate (FPR) and AUC for that subgroup.

Returns a tuple containing the FPR average, FPR standard deviation, AUC average, AUC standard deviation, and the number of students in the given subgroup.

#### Compute all subgroups

This function generates all possible subgroups from the lists of demographic columns in `demo_groups`, taking at most one value per category.

In [None]:
# One inner list per demographic category; compute_combos picks at most one
# value from each category when building subgroups.
demo_groups = [
    ['US', 'intl'],
    ['bachelor_obtained', 'no_bachelor_obtained', 'education_na'],
    ['white', 'black', 'asian', 'latinx', 'race_others', 'race_na'],
    ['male', 'female', 'gender_na', 'gender_other'],
]

In [None]:
def compute_combos(groups=None):
    """Enumerate every subgroup built by picking one value from one or more groups.

    For each way of choosing ``i`` groups (``i`` from 1 up to the number of
    groups), the Cartesian product of the chosen groups is appended, so every
    returned tuple holds at most one value per group.

    Args:
        groups: Sequence of value lists, one list per category. Defaults to the
            notebook-level ``demo_groups`` when omitted.

    Returns:
        list[tuple]: Subgroup tuples, ordered by increasing number of
        categories and, within a size, in the order ``combinations`` and
        ``product`` yield them. Empty when ``groups`` is empty.
    """
    if groups is None:
        groups = demo_groups
    return [
        combo
        for size in range(1, len(groups) + 1)
        for chosen in combinations(groups, size)
        for combo in product(*chosen)
    ]

In [None]:
# Build the full list of subgroup tuples from demo_groups.
combos = compute_combos()

In [None]:
# Order subgroups by size and put the empty tuple first (presumably the
# unfiltered 'Overall' group — confirm against metrics.calc_metrics).
# The list is rebuilt, dropping any empty tuple already present, instead of
# being mutated in place, so re-running this cell never inserts a second ().
combos = [()] + sorted((c for c in combos if c), key=len)

In [None]:
combos

In [None]:
# Every demographic indicator column in the dataset.
demographics = [
    'US', 'intl',
    'bachelor_obtained', 'no_bachelor_obtained', 'education_na',
    'white', 'black', 'asian', 'latinx', 'race_others', 'race_na',
    'male', 'female', 'gender_na', 'gender_other',
]

# Subset of the demographic columns passed to calc_metrics as protected attributes.
protected = [
    'US', 'intl',
    'bachelor_obtained',
    'white', 'black', 'asian', 'latinx',
    'male', 'female',
]

In [None]:
# Baseline run over every subgroup in combos with is_gerryfair=False.
res = metrics.calc_metrics(
    X,
    y,
    combos,
    omit_demographics=True,
    demographics=demographics,
    protected=protected,
    is_gerryfair=False,
    iters=3,
)

In [None]:
# res

Generate a list of all subgroup data and write it to a CSV file.

In [None]:
# Destination for the per-subgroup metrics held in res.
csv_file = "./data/MTC508_subgroup_data_without_demographics.csv"

# Columns written for every subgroup row, in output order.
fieldnames = [
    'subgroup', 'n', 'auc_avg', 'auc_std', 'fpr_avg', 'fpr_std', 'rmse_avg', 'rmse_std']

with open(csv_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # Each entry of res is written as one row keyed by fieldnames.
    writer.writerows(res)

In [None]:
# data = {}

# data['overall'] = len(X)

# for c in combos[1:]:
#     masks = [X[name] == value for name, value in c]
#     final_mask = pd.concat(masks, axis=1).all(axis=1)
#     filter_X = X[final_mask]
#     data[get_subgroup_str(c)] = len(filter_X)




In [None]:
# df = pd.DataFrame(list(data.items()), columns=['Key', 'Count'])

# # Specify the CSV file path
# csv_file_path = 'data/MTC508_data_counts.csv'

# # Write the DataFrame to a CSV file
# df.to_csv(csv_file_path, index=False)

In [None]:
# Reload the saved subgroup metrics and map each subgroup label to its mean FPR.
subgroup_data = pd.read_csv('./data/MTC508_subgroup_data_without_demographics.csv')
fpr_data = dict(zip(subgroup_data['subgroup'], subgroup_data['fpr_avg']))



In [None]:
# Reload the saved subgroup metrics and map each subgroup label to its mean AUC.
subgroup_data = pd.read_csv('./data/MTC508_subgroup_data_without_demographics.csv')
auc_data = dict(zip(subgroup_data['subgroup'], subgroup_data['auc_avg']))

In [None]:
def get_data_iters(start, stop, step, attrs, protected):
    """Sweep the GerryFair iteration count and collect AUC/FPR series per subgroup.

    ``metrics.calc_metrics`` is run once for every ``iters`` value in
    ``range(start + 1, stop + 1, step)`` using the notebook-level ``X``, ``y``,
    ``combos`` and ``demographics``. For each label in ``attrs`` the first
    matching row of every run contributes its ``auc_avg`` and ``fpr_avg``.

    Args:
        start, stop, step: Define the x-axis values ``range(start, stop, step)``;
            each run receives an ``iters`` one larger than its x-axis value.
        attrs: Subgroup labels to extract. Each must be a key of the
            notebook-level ``auc_data`` and ``fpr_data`` baseline dicts.
        protected: Forwarded to ``calc_metrics`` as ``protected``.

    Returns:
        tuple[list, list]: ``(auc_series, fpr_series)``. Each list holds one
        ``(iterations, values, baseline)`` triple per label, where ``baseline``
        is the stored baseline value repeated ``iterations.size`` times.
    """
    runs = [
        metrics.calc_metrics(X, y, combos, omit_demographics=True, demographics=demographics, protected=protected, is_gerryfair=True, iters=n_iters)
        for n_iters in range(start + 1, stop + 1, step)
    ]

    auc_series = []
    fpr_series = []
    for label in attrs:
        label_aucs = []
        label_fprs = []
        for run in runs:
            # Only the first row whose label matches is used; runs without a
            # matching row contribute nothing.
            match = next((row for row in run if row['subgroup'] == label), None)
            if match is not None:
                label_aucs.append(float(match['auc_avg']))
                label_fprs.append(float(match['fpr_avg']))

        iterations = np.array(list(range(start, stop, step)))

        # Flat reference lines: the baseline value repeated once per x-axis point.
        baseline_auc = np.linspace(auc_data[label], auc_data[label], iterations.size)
        baseline_fpr = np.linspace(fpr_data[label], fpr_data[label], iterations.size)

        auc_series.append((iterations, label_aucs, baseline_auc))
        fpr_series.append((iterations, label_fprs, baseline_fpr))

    return auc_series, fpr_series
        


In [None]:
# Subgroup labels whose metrics are tracked across the iteration sweep.
attrs = [
    'Overall',
    'white',
    'black',
    'female',
    'black, female',
]

In [74]:
# Iteration sweep with only the race columns passed as protected attributes.
val_auc, val_fpr = get_data_iters(
    start=0,
    stop=301,
    step=20,
    attrs=attrs,
    protected=['white', 'black', 'asian', 'latinx'],
)

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 1
most accurate classifier accuracy: 0.19889502762430938, most acc-class unfairness: 0.021589460263493412, most acc-class size 0.34143646408839784
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 1
most accurat

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 1
most accurate classifier accuracy: 0.19889502762430938, most acc-class unfairness: 0.021589460263493412, most acc-class size

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2121546961325967, most acc-class unfairness: 0.025839538794138844, most acc-class size 0.33259668508287293
iteration: 2
iteration: 3
barrier
iteration: 4
barrier
iteration: 5
barrier
iteration: 6
barrier
iteration: 7
barrier
iteration: 8
barrier
iteration: 9
barrier
iteration: 10
barrier
iteration: 11
barrier
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
barrier
iteration: 30
barrier
iteration: 31
barrier
iteration: 32
barrier
iteration: 33
barrier
iteration: 34
barrier
iteration: 35
barrier
iteration: 36
barrier
iteration: 37
barrier
iteration: 38
barrier
iteration: 39
barrier
iteration: 40
barrier
iteration: 41
barrier
iteration: 42
barrier
iteration: 43
barrier
iteration: 44
barrier
iteration: 45
barrier
iteration: 46
barrier
itera

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [75]:
val_auc, val_fpr

([(array([  0,  20,  40,  60,  80, 100, 120, 140, 160, 180, 200, 220, 240,
          260, 280, 300]),
   [0.785,
    0.753,
    0.746,
    0.746,
    0.746,
    0.746,
    0.738,
    0.738,
    0.739,
    0.739,
    0.739,
    0.739,
    0.739,
    0.735,
    0.735,
    0.735],
   array([0.776, 0.776, 0.776, 0.776, 0.776, 0.776, 0.776, 0.776, 0.776,
          0.776, 0.776, 0.776, 0.776, 0.776, 0.776, 0.776])),
  (array([  0,  20,  40,  60,  80, 100, 120, 140, 160, 180, 200, 220, 240,
          260, 280, 300]),
   [0.75,
    0.773,
    0.77,
    0.77,
    0.77,
    0.77,
    0.763,
    0.763,
    0.763,
    0.763,
    0.763,
    0.763,
    0.763,
    0.759,
    0.759,
    0.759],
   array([0.754, 0.754, 0.754, 0.754, 0.754, 0.754, 0.754, 0.754, 0.754,
          0.754, 0.754, 0.754, 0.754, 0.754, 0.754, 0.754])),
  (array([  0,  20,  40,  60,  80, 100, 120, 140, 160, 180, 200, 220, 240,
          260, 280, 300]),
   [0.685,
    0.657,
    0.656,
    0.656,
    0.656,
    0.656,
    0.656

In [67]:
def save_results(data, name):
    """Flatten per-attribute results and write them to ``results/<name>`` as CSV.

    ``data`` is converted to an array, each entry along its first axis is
    flattened into a single row, and the rows are saved with columns named
    ``Value_0 .. Value_{k-1}`` and no index column.

    Args:
        data: Array-like whose first axis indexes the rows to write.
        name: File name appended to the ``results/`` prefix.
    """
    stacked = np.array(data)
    n_rows = stacked.shape[0]
    rows = stacked.reshape(n_rows, -1)

    # One generically named column per flattened element.
    header = [f'Value_{col}' for col in range(rows.shape[1])]
    pd.DataFrame(rows, columns=header).to_csv('results/' + name, index=False)


In [68]:
def read_results(name):
    """Load ``results/<name>`` and split every CSV row into three equal parts.

    Args:
        name: File name appended to the ``results/`` prefix.

    Returns:
        numpy.ndarray: Array of shape ``(n_rows, 3, n_columns / 3)``; the
        column count must be divisible by 3 for the reshape to succeed.
    """
    table = pd.read_csv('results/' + name)
    n_rows = table.shape[0]
    return table.values.reshape(n_rows, 3, -1)

In [79]:
# Persist the FPR series returned by get_data_iters to results/race_protected_fpr.csv.
save_results(val_fpr, 'race_protected_fpr.csv')

In [71]:
# NOTE(review): no visible cell writes 'g_protected_auc.csv'; this presumably
# reads a file saved in an earlier session — confirm it exists before running.
data = read_results('g_protected_auc.csv')

In [80]:
import matplotlib.pyplot as plt

def build_combined_graph(val, attributes, ylabel='AUC',
                         title='Comparison of Algorithms - AUC vs Iterations (Race Protected)'):
    """Plot GerryFair and baseline curves for several subgroups on one figure.

    Args:
        val: Sequence of ``(x_values, gerryfair_values, baseline_values)``
            triples, one per entry of ``attributes`` and in the same order.
        attributes: Display label for each triple.
        ylabel: Y-axis label. Defaults to ``'AUC'``; pass e.g. ``'FPR'`` when
            plotting false-positive-rate series.
        title: Figure title. Defaults to the original AUC / race-protected title.

    Each subgroup gets one colour: the GerryFair curve is drawn opaque and the
    baseline curve in the same colour at 50% alpha. The y-axis is fixed to [0, 1].
    """
    # Base palette; cycled below so more than five subgroups do not raise IndexError.
    colors = ['b', 'g', 'r', 'c', 'm']

    plt.figure(figsize=(12, 6))

    for i, attr in enumerate(attributes):
        x_axis, y_axis, baseline = val[i]
        color = colors[i % len(colors)]
        # Upper-case only the first character; str.capitalize() would also
        # lower-case the rest ("Black + Female" -> "Black + female").
        label = attr[:1].upper() + attr[1:]
        plt.plot(x_axis, y_axis, linestyle='-', label=f'GerryFair - {label}', color=color)
        plt.plot(x_axis, baseline, linestyle='-', label=f'LR Without Gerryfair - {label}', color=color, alpha=0.5)

    plt.ylim(0, 1)

    plt.xlabel('Iterations')
    plt.ylabel(ylabel)
    plt.title(title)

    # Legend sits outside the axes so it does not cover the curves.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()

    plt.show()

# Display labels, listed in the same order as the attrs list given to get_data_iters.
attributes = ["Overall", "White", "Black", "Female", "Black + Female"]

build_combined_graph(val_auc, attributes)


In [None]:
def build_graph_gamma(iters, gammas, attrs):
    """Plot GerryFair FPR against gamma, one figure per subgroup in ``attrs``.

    ``metrics.calc_metrics`` is run once per gamma value with
    ``iters=iters + 1``; for each subgroup label the first matching row of
    every run supplies the plotted ``fpr_avg``. A flat reference line is drawn
    from ``default_fprs``.

    Args:
        iters: Iteration setting; ``iters + 1`` is forwarded to ``calc_metrics``.
        gammas: Sequence of gamma values to sweep (x-axis).
        attrs: Subgroup labels to plot; each must be a key of ``default_fprs``.

    NOTE(review): ``groups``, ``include_demographics`` and ``default_fprs`` are
    not defined in any visible cell — they must already exist in the kernel.
    """
    # Converted once up front; the x-axis is the same for every subgroup.
    gamma_values = np.array(gammas)

    results = [
        metrics.calc_metrics(X, y, groups, omit_demographics=not include_demographics, demographics=['country_cd_US', 'bachelor_obtained', 'white', 'black', 'asian', 'latinx', 'male', 'female'], is_gerryfair=True, iters=iters+1, gamma=g)
        for g in gammas
    ]

    for a in attrs:
        fprs = []
        for res in results:
            for subgroup in res:
                if subgroup['subgroup'] == a:
                    # Read fpr_avg (previously auc_avg) so the curve matches the
                    # FPR baseline, axis label and title used below.
                    fprs.append(float(subgroup['fpr_avg']))
                    break

        fprs = np.array(fprs)

        # Flat baseline: the stored default FPR repeated once per gamma value.
        other_algorithm_fpr = np.linspace(default_fprs[a], default_fprs[a], gamma_values.size)

        plt.plot(gamma_values, fprs, marker='o', linestyle='-', label='GerryFair')

        plt.plot(gamma_values, other_algorithm_fpr, marker='s', linestyle='-', label='Other Algorithm')

        plt.xlabel('Gamma')
        plt.ylabel('False Positive Rate (FPR)')
        plt.title('Comparison of Algorithms - FPR vs Gamma For ' + a.capitalize())

        plt.legend()

        plt.show()

In [None]:
# Gamma sweep for four subgroups; calc_metrics receives iters=6 (the function adds 1).
build_graph_gamma(5, [.002, .005, .01, .02, .05, .1], ['Overall', 'black', 'female', 'black, female'])

In [None]:
len(data)

In [None]:
def gen_data_attrs(iters, attrs, sets, gamma=None):
    """Plot GerryFair FPR against the number of demographic attributes supplied.

    Run ``k`` (``k = 1 .. len(sets)``) calls ``metrics.calc_metrics`` with the
    first ``k`` entries of the notebook-level ``demographics`` list. For each
    subgroup label the first matching row of every run supplies ``fpr_avg``,
    plotted against ``k`` next to a flat baseline taken from ``default_fprs``.

    Args:
        iters: Iteration setting; ``iters + 1`` is forwarded to ``calc_metrics``.
        attrs: Subgroup labels to plot; each must be a key of ``default_fprs``.
        sets: Only its length is used — it fixes how many runs are performed.
        gamma: Optional gamma forwarded to ``calc_metrics``. When ``None`` the
            argument is omitted so ``calc_metrics`` applies its own default.
            (The original body passed an undefined name ``g`` here.)

    NOTE(review): ``groups``, ``include_demographics`` and ``default_fprs`` are
    not defined in any visible cell — they must already exist in the kernel.
    """
    gamma_kwargs = {} if gamma is None else {'gamma': gamma}

    results = [
        metrics.calc_metrics(X, y, groups, omit_demographics=not include_demographics, demographics=demographics[:count], is_gerryfair=True, iters=iters+1, **gamma_kwargs)
        for count in range(1, len(sets) + 1)
    ]

    # One x-axis point per run. Sizing this by len(demographics), as before,
    # made plt.plot fail whenever len(sets) != len(demographics).
    num_attrs = np.arange(1, len(results) + 1)

    for a in attrs:
        fprs = []
        for res in results:
            for subgroup in res:
                if subgroup['subgroup'] == a:
                    fprs.append(float(subgroup['fpr_avg']))
                    break

        fprs = np.array(fprs)

        # Flat baseline: the stored default FPR repeated once per x-axis point.
        other_algorithm_fpr = np.linspace(default_fprs[a], default_fprs[a], num_attrs.size)

        plt.plot(num_attrs, fprs, marker='o', linestyle='-', label='GerryFair')

        plt.plot(num_attrs, other_algorithm_fpr, marker='s', linestyle='-', label='Other Algorithm')

        plt.xlabel('Number of Attributes')
        plt.ylabel('False Positive Rate (FPR)')
        plt.title('Comparison of Algorithms - FPR vs Number of Attributes For ' + a.capitalize())

        plt.legend()

        plt.show()

In [None]:
# Named sets of protected columns, one GerryFair run per entry
# (consumed by gen_data_categories). The slices drop trailing entries of the
# corresponding demo_groups list.
gerryfair_groups = {
    'all': protected,
    'location': demo_groups[0], 
    'education': demo_groups[1][:-1],  # drops 'education_na'
    'race': demo_groups[2][:-2],  # drops 'race_others' and 'race_na'
    'gender': demo_groups[3][:-2],  # drops 'gender_na' and 'gender_other'
    'race+gender': demo_groups[2][:-2] + demo_groups[3][:-2]
    }

In [None]:
def gen_data_categories(iters, attrs):
    """Run GerryFair once per protected-attribute set and save each result as CSV.

    Args:
        iters: Forwarded to ``metrics.calc_metrics`` as ``iters`` and embedded
            in each output file name.
        attrs: Mapping from a category name to the list of protected columns
            used for that run.

    Each run is written to ``./data/graph_attrs_<iters>_<name>.csv`` with a
    header row followed by one row per entry returned by ``calc_metrics``.
    """
    columns = [
        'subgroup', 'n', 'auc_avg', 'auc_std', 'fpr_avg', 'fpr_std', 'rmse_avg', 'rmse_std']

    for g in attrs:
        protected_cols = attrs[g]
        rows = metrics.calc_metrics(X, y, combos, omit_demographics=False, demographics=demographics, protected=protected_cols, is_gerryfair=True, iters=iters)

        out_path = f"./data/graph_attrs_{str(iters)}_{g}.csv"
        with open(out_path, 'w', newline='') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=columns)
            writer.writeheader()
            writer.writerows(rows)

In [None]:
# One iters=200 run per entry of gerryfair_groups; writes one CSV per entry under ./data/.
gen_data_categories(200, gerryfair_groups)

In [None]:
demographics

In [None]:
demo_groups[2][-2]

In [None]:
protected

In [96]:
# Share of completed == 1 among rows flagged both black and female.
part = df.loc[df['black'].eq(1) & df['female'].eq(1)]
part['completed'].sum() / len(part)

0.47058823529411764

In [12]:
df.columns

Index(['answer_count', 'average_answer_length', 'total_votes_given_answers',
       'total_votes_received_answers', 'question_count',
       'average_question_length', 'total_votes_given_questions',
       'total_votes_received_questions', 'total_questions_following', 'US',
       'intl', 'bachelor_obtained', 'no_bachelor_obtained', 'education_na',
       'male', 'female', 'gender_other', 'gender_na', 'white', 'latinx',
       'black', 'asian', 'race_others', 'race_na', 'completed'],
      dtype='object')

In [26]:
# NOTE: this rebinds df from the preprocessed student table to the per-subgroup
# metrics table; the cells below index it by 'subgroup' and 'fpr_avg'.
df = pd.read_csv('./data/MTC508_subgroup_data_without_demographics.csv')

In [86]:
# FPR gap between the 'Overall' row and the 'no_bachelor_obtained, female' subgroup.
# .iloc[0] takes the first matching row by position; a plain [0] is a label
# lookup and only works while the 'Overall' row carries index label 0.
df.loc[df['subgroup'] == 'Overall', 'fpr_avg'].iloc[0] - df.loc[df['subgroup'] == 'no_bachelor_obtained, female', 'fpr_avg']

65    0.107
Name: fpr_avg, dtype: float64

In [68]:
df[df['subgroup'] == 'white']['fpr_avg']

6    0.277
Name: fpr_avg, dtype: float64