In [1]:
import csv
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, train_test_split

from functions import metrics
from functions.formatting import get_indices, get_subgroup_str

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
# Negated and forwarded as `omit_demographics` to every `metrics.calc_metrics`
# call in this notebook.
include_demographics = False 
# NOTE(review): not read by any visible cell -- the `calc_metrics` calls pass
# `is_gerryfair=True` literally. Confirm whether this flag should drive them.
use_gerryfair = False 

In [3]:
# Load the preprocessed dataset and discard the user-id and stray index columns.
df = pd.read_csv('./data/preprocessed.csv').drop(columns=['umich_user_id', 'Unnamed: 0'])

In [4]:
# `completed` is the prediction target; every remaining column is a feature.
y = df['completed']
X = df.drop(columns='completed')

#### Calculate false positive rates for the given subgroup

This function takes the subgroup to test as a list of `(column, value)` tuples. We use logistic regression and 10-fold cross-validation to calculate the average FPR and AUC for the given subgroup.

It returns a tuple containing the FPR average, FPR standard deviation, AUC average, AUC standard deviation, and the number of students in the given subgroup. (The function itself is not defined in this notebook; the cells below call `metrics.calc_metrics` from the `functions` package.)

#### Compute all subgroups

This function generates all possible subgroups from the list of protected columns together with the race and gender indicator columns.

In [5]:
def compute_combo(cols, races, genders):
    """Enumerate every subgroup definition built from the given columns.

    A subgroup is a list of ``(column, value)`` tuples. Each column in ``cols``
    may be absent, fixed to 0, or fixed to 1 (columns are taken in increasing
    index order). On top of every such assignment, the subgroup is extended
    with one race ``(race, 1)``, one gender ``(gender, 1)``, or one
    race/gender pair, in that order.

    Args:
        cols: Column names that can be constrained to either 0 or 1.
        races: Race indicator column names; only the value 1 is generated.
        genders: Gender indicator column names; only the value 1 is generated.

    Returns:
        A list of subgroups in generation order, starting with the empty
        subgroup ``[]``. Race/gender extensions that were already generated
        are not added a second time.
    """
    retval = [[]]
    # Hashable mirror of `retval`, so the duplicate check is O(1) instead of a
    # linear scan over the growing result list.
    seen = {()}

    def add_if_new(option):
        key = tuple(option)
        if key not in seen:
            seen.add(key)
            retval.append(option)

    def recurse(i, curr):
        # Race-only, gender-only, then race x gender extensions of `curr`.
        for r in races:
            add_if_new(curr + [(r, 1)])
        for g in genders:
            add_if_new(curr + [(g, 1)])
        for r in races:
            for g in genders:
                add_if_new(curr + [(r, 1), (g, 1)])

        for j in range(i, len(cols)):
            excluded = curr + [(cols[j], 0)]
            included = curr + [(cols[j], 1)]

            # Column assignments are always appended (no duplicate check), but
            # they are still recorded so later extensions are compared against
            # them as well.
            for option in (excluded, included):
                seen.add(tuple(option))
                retval.append(option)

            # `curr` is never mutated inside `recurse`, so the lists can be
            # passed on without copying.
            recurse(j + 1, excluded)
            recurse(j + 1, included)

    recurse(0, [])

    return retval

In [6]:
# Every subgroup definition over the country/degree flags and the race and
# gender indicator columns.
combos = compute_combo(
    cols=['country_cd_US', 'bachelor_obtained'],
    races=['white', 'black', 'asian', 'latinx', 'race_others', 'race_na'],
    genders=['male', 'female', 'gender_na', 'gender_other'],
)

In [7]:
# Order subgroups from fewest to most conditions; the sort is stable, so
# generation order is kept among subgroups of the same size.
combos = sorted(combos, key=len)
combos

[[],
 [('white', 1)],
 [('black', 1)],
 [('asian', 1)],
 [('latinx', 1)],
 [('race_others', 1)],
 [('race_na', 1)],
 [('male', 1)],
 [('female', 1)],
 [('gender_na', 1)],
 [('gender_other', 1)],
 [('country_cd_US', 0)],
 [('country_cd_US', 1)],
 [('bachelor_obtained', 0)],
 [('bachelor_obtained', 1)],
 [('white', 1), ('male', 1)],
 [('white', 1), ('female', 1)],
 [('white', 1), ('gender_na', 1)],
 [('white', 1), ('gender_other', 1)],
 [('black', 1), ('male', 1)],
 [('black', 1), ('female', 1)],
 [('black', 1), ('gender_na', 1)],
 [('black', 1), ('gender_other', 1)],
 [('asian', 1), ('male', 1)],
 [('asian', 1), ('female', 1)],
 [('asian', 1), ('gender_na', 1)],
 [('asian', 1), ('gender_other', 1)],
 [('latinx', 1), ('male', 1)],
 [('latinx', 1), ('female', 1)],
 [('latinx', 1), ('gender_na', 1)],
 [('latinx', 1), ('gender_other', 1)],
 [('race_others', 1), ('male', 1)],
 [('race_others', 1), ('female', 1)],
 [('race_others', 1), ('gender_na', 1)],
 [('race_others', 1), ('gender_other',

In [8]:
# Bucket the subgroup definitions by the tuple of column names they constrain,
# so definitions that differ only in the values share one key.
groups = {}
for combo in combos:
    column_names = tuple(condition[0] for condition in combo)
    groups.setdefault(column_names, []).append(combo)

# sort dict by length of key
groups = dict(sorted(groups.items(), key=lambda entry: len(entry[0])))

print(groups)


{(): [[]], ('white',): [[('white', 1)]], ('black',): [[('black', 1)]], ('asian',): [[('asian', 1)]], ('latinx',): [[('latinx', 1)]], ('race_others',): [[('race_others', 1)]], ('race_na',): [[('race_na', 1)]], ('male',): [[('male', 1)]], ('female',): [[('female', 1)]], ('gender_na',): [[('gender_na', 1)]], ('gender_other',): [[('gender_other', 1)]], ('country_cd_US',): [[('country_cd_US', 0)], [('country_cd_US', 1)]], ('bachelor_obtained',): [[('bachelor_obtained', 0)], [('bachelor_obtained', 1)]], ('white', 'male'): [[('white', 1), ('male', 1)]], ('white', 'female'): [[('white', 1), ('female', 1)]], ('white', 'gender_na'): [[('white', 1), ('gender_na', 1)]], ('white', 'gender_other'): [[('white', 1), ('gender_other', 1)]], ('black', 'male'): [[('black', 1), ('male', 1)]], ('black', 'female'): [[('black', 1), ('female', 1)]], ('black', 'gender_na'): [[('black', 1), ('gender_na', 1)]], ('black', 'gender_other'): [[('black', 1), ('gender_other', 1)]], ('asian', 'male'): [[('asian', 1), ('

In [12]:
# Per-subgroup metrics for a single GerryFair run with iters=3.
res = metrics.calc_metrics(
    X,
    y,
    groups,
    omit_demographics=not include_demographics,
    demographics=['country_cd_US', 'bachelor_obtained', 'white', 'black', 'asian', 'latinx', 'male', 'female'],
    is_gerryfair=True,
    iters=3,
)

iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteration: 2


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Write the per-subgroup metrics computed above to a CSV file.

In [None]:
# Destination of the per-subgroup metrics table.
csv_file = "./data/MTC508_subgroup_data_gerryfair_with_all.csv"

# Write the data to a CSV file: header first, then one row per entry of `res`.
with open(csv_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=['subgroup', 'n', 'auc_avg', 'auc_std', 'fpr_avg', 'fpr_std', 'rmse_avg', 'rmse_std'],
    )
    writer.writeheader()
    writer.writerows(res)

In [None]:
# Demographic indicator columns, one inner list per category.
# NOTE(review): not referenced by any other visible cell.
demo_groups = [
    ['country_cd_US'],
    ['bachelor_obtained'],
    ['white', 'black', 'asian', 'latinx', 'race_others', 'race_na'],
    ['male', 'female', 'gender_other', 'gender_na'],
]

In [9]:
# Number of rows in the full dataset, then in every subgroup of `combos`.
data = {'overall': len(X)}

# The first entry of `combos` is skipped; each remaining entry is a list of
# (column, value) conditions that must all hold for a row to be counted.
for combo in combos[1:]:
    condition_hits = pd.concat([X[name] == value for name, value in combo], axis=1)
    in_subgroup = condition_hits.all(axis=1)
    data[get_subgroup_str(combo)] = int(in_subgroup.sum())




In [None]:
data

{'overall': 1132,
 'white': 382,
 'black': 93,
 'asian': 57,
 'latinx': 48,
 'race_others': 19,
 'race_na': 470,
 'male': 182,
 'female': 759,
 'gender_na': 189,
 'gender_other': 5,
 'International': 125,
 'US': 1007,
 'No Bachelor': 613,
 'Bachelor or higher': 519,
 'white, male': 64,
 'white, female': 291,
 'white, gender_na': 25,
 'white, gender_other': 4,
 'black, male': 21,
 'black, female': 51,
 'black, gender_na': 20,
 'black, gender_other': 0,
 'asian, male': 10,
 'asian, female': 38,
 'asian, gender_na': 9,
 'asian, gender_other': 1,
 'latinx, male': 6,
 'latinx, female': 38,
 'latinx, gender_na': 5,
 'latinx, gender_other': 1,
 'race_others, male': 1,
 'race_others, female': 14,
 'race_others, gender_na': 4,
 'race_others, gender_other': 0,
 'race_na, male': 79,
 'race_na, female': 285,
 'race_na, gender_na': 106,
 'race_na, gender_other': 0,
 'International, white': 24,
 'International, black': 17,
 'International, asian': 14,
 'International, latinx': 2,
 'International, ra

In [None]:
# Tabulate the subgroup sizes collected in `data` (label -> row count).
# Bound to `counts_df` rather than `df` so the dataset loaded at the top of the
# notebook is not overwritten; re-running the X / y cell would otherwise fail
# because the counts table has no `completed` column.
counts_df = pd.DataFrame(list(data.items()), columns=['Key', 'Count'])

# Specify the CSV file path
csv_file_path = 'data/MTC508_data_counts.csv'

# Write the DataFrame to a CSV file
counts_df.to_csv(csv_file_path, index=False)

In [13]:
# One GerryFair evaluation per `iters` setting from 2 through 10 (nine runs),
# stored in run order.
data = [
    metrics.calc_metrics(
        X,
        y,
        groups,
        omit_demographics=not include_demographics,
        demographics=['country_cd_US', 'bachelor_obtained', 'white', 'black', 'asian', 'latinx', 'male', 'female'],
        is_gerryfair=True,
        iters=n_iters,
    )
    for n_iters in range(2, 11)
]

iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteration: 2


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteration: 2
iteration: 3


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteration: 2
iteration: 3
iteration: 4


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteratio

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 1
most accurate classifier accuracy: 0.2152317880

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
barrier
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
barrier
iteration: 9
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [15]:
data

[[{'subgroup': 'Overall',
   'n': '226.4',
   'auc_avg': '0.769',
   'auc_std': '0.029',
   'fpr_avg': '0.181',
   'fpr_std': '0.022',
   'rmse_avg': '0.478',
   'rmse_std': '0.029'},
  {'subgroup': 'white',
   'n': '76.4',
   'auc_avg': '0.745',
   'auc_std': '0.064',
   'fpr_avg': '0.339',
   'fpr_std': '0.095',
   'rmse_avg': '0.473',
   'rmse_std': '0.067'},
  {'subgroup': 'black',
   'n': '18.6',
   'auc_avg': '0.659',
   'auc_std': '0.092',
   'fpr_avg': '0.365',
   'fpr_std': '0.093',
   'rmse_avg': '0.584',
   'rmse_std': '0.087'},
  {'subgroup': 'asian',
   'n': '11.4',
   'auc_avg': '0.863',
   'auc_std': '0.089',
   'fpr_avg': '0.124',
   'fpr_std': '0.110',
   'rmse_avg': '0.364',
   'rmse_std': '0.213'},
  {'subgroup': 'latinx',
   'n': '9.6',
   'auc_avg': '0.646',
   'auc_std': '0.212',
   'fpr_avg': '0.238',
   'fpr_std': '0.217',
   'rmse_avg': '0.515',
   'rmse_std': '0.269'},
  {'subgroup': 'race_others',
   'n': '3.8',
   'auc_avg': '0.706',
   'auc_std': '0.034',
 

In [30]:
# Collect the 'Overall' FPR of every run in `data`, in run order.
# The loop variable is deliberately not named `res`: that name holds the
# metrics written out by the CSV cell, and rebinding it here would make a
# later re-run of that cell export the last run of `data` instead.
fprs = []
for run_results in data:
    for row in run_results:
        if row['subgroup'] == 'Overall':
            fprs.append(float(row['fpr_avg']))
            break  # only the first matching row of each run is used

In [31]:
# Convert to an array for plotting and show how many runs were collected.
fprs = np.asarray(fprs)
fprs.size

9

In [33]:
# One x position per collected run, so x and y always have the same length
# even if the number of runs changes.
iterations = np.arange(1, len(fprs) + 1)

# Flat reference line at FPR = 0.5 for the comparison series.
other_algorithm_fpr = np.full(iterations.size, 0.5)

fig, ax = plt.subplots()

ax.plot(iterations, fprs, marker='o', linestyle='-', label='GerryFair')
ax.plot(iterations, other_algorithm_fpr, marker='s', linestyle='-', label='Other Algorithm')

ax.set_xlabel('Iterations')
ax.set_ylabel('False Positive Rate (FPR)')
ax.set_title('Comparison of Algorithms - FPR vs Iterations')

ax.legend()

plt.show()

In [29]:
fprs

array(['0.181', '0.181', '0.785', '0.496', '0.558', '0.244', '0.297',
       '0.222', '0.259'], dtype='<U5')

In [34]:
# Same column list that the `calc_metrics` calls pass as `demographics`.
# NOTE(review): the plotting functions take their own `attrs` parameter, which
# shadows this module-level name; no visible cell reads it.
attrs = [
    'country_cd_US',
    'bachelor_obtained',
    'white',
    'black',
    'asian',
    'latinx',
    'male',
    'female',
]

In [36]:
# Reference FPR per subgroup label; the plotting functions draw it as the flat
# 'Other Algorithm' line.
# NOTE(review): the source of these values is not shown in this notebook.
default_fprs = {
    "Overall": 0.157,
    "black": 0.360,
    "female": 0.177,
    "black, female": 0.537,
}


In [46]:
def build_graph_iters(start, stop, step, attrs, demographics=None):
    """Plot GerryFair FPR against iteration count, one figure per subgroup.

    ``metrics.calc_metrics`` is run once for every ``iters`` value in
    ``range(start + 1, stop + 1, step)``; the results are plotted against
    ``range(start, stop, step)`` on the x-axis, i.e. each point is labelled
    with ``iters - 1``.
    NOTE(review): the recorded output shows ``iters=2`` logging a single
    "iteration: 1" line per fold, so the offset presumably reflects completed
    iterations -- confirm against ``calc_metrics``.

    Reads the notebook globals ``X``, ``y``, ``groups``,
    ``include_demographics`` and ``default_fprs``.

    Args:
        start: First x-axis value (integer).
        stop: Exclusive upper bound of the x-axis values (integer).
        step: Spacing between x-axis values (integer).
        attrs: Subgroup labels to plot, matched against the ``'subgroup'``
            field of each result row; each must be a key of ``default_fprs``.
        demographics: Column names forwarded to ``calc_metrics`` as
            ``demographics``. Defaults to the list used throughout this
            notebook.

    Raises:
        KeyError: If a label in ``attrs`` has no entry in ``default_fprs``.
    """
    if demographics is None:
        demographics = ['country_cd_US', 'bachelor_obtained', 'white', 'black', 'asian', 'latinx', 'male', 'female']

    runs = [
        metrics.calc_metrics(
            X,
            y,
            groups,
            omit_demographics=not include_demographics,
            # Fresh list per call, as with the original inline literal.
            demographics=list(demographics),
            is_gerryfair=True,
            iters=n_iters,
        )
        for n_iters in range(start + 1, stop + 1, step)
    ]

    iterations = np.arange(start, stop, step)

    for name in attrs:
        # First matching row per run; NaN keeps x and y aligned (and leaves a
        # visible gap) when a run has no row for this subgroup.
        fprs = [
            next((float(row['fpr_avg']) for row in run if row['subgroup'] == name), float('nan'))
            for run in runs
        ]

        # Flat reference line at the subgroup's stored FPR.
        other_algorithm_fpr = np.full(iterations.size, default_fprs[name], dtype=float)

        fig, ax = plt.subplots()

        ax.plot(iterations, fprs, marker='o', linestyle='-', label='GerryFair')
        ax.plot(iterations, other_algorithm_fpr, marker='s', linestyle='-', label='Other Algorithm')

        ax.set_xlabel('Iterations')
        ax.set_ylabel('False Positive Rate (FPR)')
        ax.set_title('Comparison of Algorithms - FPR vs Iterations For ' + name.capitalize())

        ax.legend()

        plt.show()

In [47]:
# FPR-vs-iterations figures for the four subgroups that have a reference FPR.
build_graph_iters(
    start=1,
    stop=10,
    step=1,
    attrs=['Overall', 'black', 'female', 'black, female'],
)

iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteration: 2


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteration: 2
iteration: 3


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteration: 2
iteration: 3
iteration: 4


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756978, most acc-class size 0.3443708609271523
iteratio

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 1
most accurate classifier accuracy: 0.2152317880794702, most acc-class unfairness: 0.024711660093756

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 1
most accurate classifier accuracy: 0.2152317880

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
barrier
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


iteration: 1
most accurate classifier accuracy: 0.2276243093922652, most acc-class unfairness: 0.026538680077855824, most acc-class size 0.15138121546961328
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 1
most accurate classifier accuracy: 0.20552486187845304, most acc-class unfairness: 0.023315642491869876, most acc-class size 0.35138121546961326
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 1
most accurate classifier accuracy: 0.2185430463576159, most acc-class unfairness: 0.030836092715231793, most acc-class size 0.35209713024282563
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
barrier
iteration: 9
iteration: 1
most accurate classifier accuracy: 0.2251655629139073, most acc-class unfairness: 0.025553406100999265, most acc-class size 0.3620309050772627
iteration: 2
iteration: 3
iteration: 4
iteration: 5


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


[1 2 3 4 5 6 7 8 9]
[1 2 3 4 5 6 7 8 9]
[1 2 3 4 5 6 7 8 9]
[1 2 3 4 5 6 7 8 9]


In [None]:
def build_graph_gamma(iters, gammas, attrs, demographics=None):
    """Plot GerryFair FPR against the gamma setting, one figure per subgroup.

    ``metrics.calc_metrics`` is run once per entry of ``gammas`` (passed as
    ``gamma``) with a fixed ``iters``. For every label in ``attrs`` the
    subgroup's ``fpr_avg`` from each run is plotted against the gamma values,
    alongside a flat reference line at ``default_fprs[label]``.

    Reads the notebook globals ``X``, ``y``, ``groups``,
    ``include_demographics`` and ``default_fprs``.

    Args:
        iters: Value forwarded to ``calc_metrics`` as ``iters`` for every run.
        gammas: Iterable of gamma values; one run and one x position each.
        attrs: Subgroup labels to plot, matched against the ``'subgroup'``
            field of each result row; each must be a key of ``default_fprs``.
        demographics: Column names forwarded to ``calc_metrics`` as
            ``demographics``. Defaults to the list used throughout this
            notebook.

    Raises:
        KeyError: If a label in ``attrs`` has no entry in ``default_fprs``.
    """
    if demographics is None:
        demographics = ['country_cd_US', 'bachelor_obtained', 'white', 'black', 'asian', 'latinx', 'male', 'female']

    # Materialise once so the values can drive both the runs and the x-axis.
    gammas = list(gammas)

    runs = [
        metrics.calc_metrics(
            X,
            y,
            groups,
            omit_demographics=not include_demographics,
            # Fresh list per call, as with the original inline literal.
            demographics=list(demographics),
            is_gerryfair=True,
            iters=iters,
            gamma=g,
        )
        for g in gammas
    ]

    gamma_values = np.asarray(gammas)

    for name in attrs:
        # First matching row per run; NaN keeps x and y aligned (and leaves a
        # visible gap) when a run has no row for this subgroup.
        fprs = [
            next((float(row['fpr_avg']) for row in run if row['subgroup'] == name), float('nan'))
            for run in runs
        ]

        # Flat reference line at the subgroup's stored FPR, one point per gamma.
        other_algorithm_fpr = np.full(gamma_values.size, default_fprs[name], dtype=float)

        fig, ax = plt.subplots()

        # x is the gamma sweep and y the measured FPRs. The previous version
        # plotted the notebook-global `iterations` against the gammas instead.
        ax.plot(gamma_values, fprs, marker='o', linestyle='-', label='GerryFair')
        ax.plot(gamma_values, other_algorithm_fpr, marker='s', linestyle='-', label='Other Algorithm')

        ax.set_xlabel('Gamma')
        ax.set_ylabel('False Positive Rate (FPR)')
        ax.set_title('Comparison of Algorithms - FPR vs Gamma For ' + name.capitalize())

        ax.legend()

        plt.show()