In [None]:
import pandas as pd
import numpy as np
import warnings
from functions import metrics
import csv
from sklearn.exceptions import ConvergenceWarning
from functions.formatting import get_subgroup_str
from itertools import product, combinations
import matplotlib.pyplot as plt
import ast

warnings.filterwarnings("ignore", category=ConvergenceWarning)

### Load Preprocessed File

First, we load the preprocessed data file and split it into the feature dataframe `X` and the target series `y` (the `completed` column).

The variable `preprocessed_filename` should hold the path to the preprocessed data.

In [None]:
preprocessed_filename = './data/preprocessed.csv'

# Load the preprocessed table and discard the user-id column together with the
# 'Unnamed: 0' column (presumably a saved index - TODO confirm).
df = pd.read_csv(preprocessed_filename).drop(columns=['umich_user_id', 'Unnamed: 0'])

# Target is the `completed` column; every other remaining column is a feature.
y = df['completed']
X = df.drop(columns='completed')

### Generating Combos

Now we will generate all possible intersectional combinations of the protected attributes, which are country, education, race, and gender. This lets us inspect FPR and AUC for every combinatorially possible intersectional group.

In [None]:
def compute_combos(demo_groups):
    """
    Enumerate every intersectional subgroup that can be formed from the given attributes.

    For each non-empty selection of attribute lists, one value is picked from each
    selected list (cartesian product). The empty tuple is placed first in the result.

    Args:
        demo_groups (list[list]): One inner list of values per demographic attribute.

    Returns:
        list[tuple]: The empty tuple followed by all value combinations, ordered by
        the number of attributes involved (ascending).
    """
    # Start with the empty tuple; sizes are visited in ascending order, so the
    # result is already ordered by tuple length without an explicit sort.
    all_combos = [()]
    for size in range(1, len(demo_groups) + 1):
        for chosen_groups in combinations(demo_groups, size):
            all_combos.extend(product(*chosen_groups))

    return all_combos

In [None]:
# One inner list per protected attribute, in the order: country, education, race, gender.
demo_groups = [
    ['US', 'intl'],
    ['bachelor_obtained', 'no_bachelor_obtained', 'education_na'],
    ['white', 'black', 'asian', 'latinx', 'race_others', 'race_na'],
    ['male', 'female', 'gender_na', 'gender_other'],
]

# Every intersectional subgroup (plus the empty tuple) derived from the groups above.
combos = compute_combos(demo_groups)

### Setting demographic and protected attributes

Here we set the demographic and protected attributes for gerryfair. Update the `protected` variable to choose whether you want all variables protected, race + gender protected, or black + female protected.


In [None]:
# Every demographic indicator name, grouped by attribute.
demographics = [
    'US', 'intl',
    'bachelor_obtained', 'no_bachelor_obtained', 'education_na',
    'white', 'black', 'asian', 'latinx', 'race_others', 'race_na',
    'male', 'female', 'gender_na', 'gender_other',
]

# Candidate protected-attribute sets, from broadest to narrowest.
all_protected = [
    'US', 'intl',
    'bachelor_obtained', 'no_bachelor_obtained',
    'white', 'black', 'asian', 'latinx',
    'male', 'female',
]
race_gender_protected = [
    'white', 'black', 'asian', 'latinx',
    'male', 'female',
]
black_female_protected = ['black', 'female']

In [None]:
# Active protected-attribute set; assign race_gender_protected or
# black_female_protected here instead to change which attributes are protected.
protected = all_protected

In [None]:
# Baseline run with GerryFair disabled; its rows are written to CSV further below.
lin_reg_res = metrics.calc_metrics(
    X,
    y,
    combos,
    omit_demographics=True,
    demographics=demographics,
    protected=protected,
    is_gerryfair=False,
    iters=300,
)

In [None]:
# gerryfair_res = metrics.calc_metrics(X, y, combos, omit_demographics=True, demographics=demographics, protected=all_protected, is_gerryfair=True, iters=20)

In [None]:
output_file = "./data/MTC508_lin_reg_subgroup_data_without_demographics.csv"

# Column order of the CSV; each row of lin_reg_res is a dict keyed by these names.
fieldnames = [
    'subgroup', 'n', 'auc_avg', 'auc_std', 'fpr_avg', 'fpr_std', 'rmse_avg', 'rmse_std']

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(lin_reg_res)

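### Read Baseline Results from File

Load the saved per-subgroup metrics and build lookup tables from subgroup name to average FPR and AUC.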

In [None]:
# Read back the per-subgroup metrics this notebook wrote to `output_file`.
# The previous hard-coded path ('./data/MTC508_subgroup_data_without_demographics.csv')
# named a file that is never produced here, so a fresh run could not find it.
# NOTE(review): if that other file is a deliberate external input, restore the path.
subgroup_data = pd.read_csv(output_file)

# Lookup: subgroup label -> average false positive rate (last row wins on duplicates).
fpr_data = dict(zip(subgroup_data['subgroup'], subgroup_data['fpr_avg']))

In [None]:
# Reuse the `subgroup_data` frame loaded in the previous cell instead of reading the
# same CSV a second time, so the FPR and AUC lookups always come from identical data.
# Lookup: subgroup label -> average AUC (last row wins on duplicates).
auc_data = dict(zip(subgroup_data['subgroup'], subgroup_data['auc_avg']))

In [None]:
def get_data_iters(start, stop, step, attrs, protected):
    """
    Evaluate GerryFair over a range of iteration budgets and collect per-subgroup curves.

    `metrics.calc_metrics` is run once for every value in
    `range(start + 1, stop + 1, step)`. For each subgroup label in `attrs`, the AUC
    and FPR averages from every run are gathered next to a flat baseline taken from
    the module-level `auc_data` / `fpr_data` lookups.

    Args:
        start (int): First x-axis value (the actual budget used is `start + 1`).
        stop (int): Exclusive upper bound of the x-axis range.
        step (int): Spacing between successive budgets.
        attrs (list[str]): Subgroup labels, matched against each row's 'subgroup' field.
        protected (list[str]): Protected attribute names forwarded to `calc_metrics`.

    Returns:
        tuple[list, list]: `(auc_curves, fpr_curves)`; each list holds one
        `(iterations, values, baseline)` triple per entry of `attrs`.
    """
    # One evaluation per budget. Budgets are shifted by +1 relative to the x-axis
    # values produced below, but both ranges always have the same length.
    runs = [
        metrics.calc_metrics(X, y, combos, omit_demographics=True, demographics=demographics,
                             protected=protected, is_gerryfair=True, iters=n_iters)
        for n_iters in range(start + 1, stop + 1, step)
    ]

    auc_curves = []
    fpr_curves = []
    for attr in attrs:
        # First matching row of each run; a run without this subgroup contributes nothing.
        first_matches = (
            next((row for row in run if row['subgroup'] == attr), None)
            for run in runs
        )
        rows = [row for row in first_matches if row is not None]
        aucs = [float(row['auc_avg']) for row in rows]
        fprs = [float(row['fpr_avg']) for row in rows]

        iterations = np.array(list(range(start, stop, step)))
        n_points = iterations.size

        # Constant baseline repeated once per x-axis point.
        baseline_auc = np.linspace(auc_data[attr], auc_data[attr], n_points)
        baseline_fpr = np.linspace(fpr_data[attr], fpr_data[attr], n_points)

        auc_curves.append((iterations, aucs, baseline_auc))
        fpr_curves.append((iterations, fprs, baseline_fpr))

    return auc_curves, fpr_curves

In [None]:
# Subgroup labels to track. They are compared against the 'subgroup' field of each
# metrics row and used as keys into the fpr_data / auc_data lookups, so the spelling
# (including the ', ' separator) must match exactly.
attrs = ['Overall', 'black', 'white', 'female', 'male', 'black, female']

In [None]:
# Sweep GerryFair over x-axis values 0, 20, ..., 300 for the tracked subgroups.
# NOTE(review): this passes `all_protected` directly rather than the `protected`
# selection used for the baseline run - confirm that is intended.
val_auc, val_fpr = get_data_iters(0, 301, 20, attrs, all_protected)

In [None]:
def save_results(data, name):
    """
    Flatten per-subgroup curve data and write it to `results/<name>` as CSV.

    `data` is converted to a NumPy array and every top-level entry is flattened
    into a single row, with columns named `Value_0`, `Value_1`, ...

    Args:
        data: Sequence of equally shaped entries (e.g. the triples produced by
            `get_data_iters`); it must convert to a regular, non-empty array.
        name (str): File name, relative to the `results/` directory.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    arr = np.array(data)
    # Collapse everything after the first axis so each entry becomes one CSV row.
    flat_data = arr.reshape(arr.shape[0], -1)

    column_names = [f'Value_{i}' for i in range(flat_data.shape[1])]
    frame = pd.DataFrame(flat_data, columns=column_names)

    # Create the target directory on first use; to_csv does not create it and
    # would otherwise fail when `results/` is missing.
    path = 'results/' + name
    os.makedirs(os.path.dirname(path), exist_ok=True)
    frame.to_csv(path, index=False)

In [None]:
def read_results(name):
    """
    Load a CSV from `results/<name>` and restore its 3D layout.

    Each CSV row is split into three equal-length parts, giving an array of shape
    `(rows, 3, columns // 3)`.

    Args:
        name (str): File name, relative to the `results/` directory.

    Returns:
        numpy.ndarray: The reshaped 3D array.
    """
    table = pd.read_csv('results/' + name)
    n_rows = table.shape[0]

    # Undo the row-wise flattening: three parts per row, length inferred.
    return table.to_numpy().reshape(n_rows, 3, -1)

In [None]:
# File names (resolved inside results/ by save_results / read_results) for the
# cached iteration-sweep curves.
fpr_file = 'fpr_results.csv'
auc_file = 'auc_results.csv'

# Persist the sweep output so the plots can be rebuilt without re-running the sweep.
save_results(val_fpr, fpr_file)
save_results(val_auc, auc_file)

In [None]:
# NOTE(review): this rebinds `fpr_data` / `auc_data`. Until this cell they are dicts
# mapping subgroup label -> baseline value; from here on they are the 3D arrays
# returned by read_results. Code that indexes them by subgroup name
# (e.g. get_data_iters) must run before this cell.
fpr_data = read_results(fpr_file)
auc_data = read_results(auc_file)

In [None]:
def build_combined_graph(val, attributes, metric='FPR', protected_label='Race + Gender Protected'):
    """
    Plot GerryFair and baseline curves for several subgroups on one figure.

    Args:
        val: Indexable collection with one `(x_axis, y_axis, baseline)` triple per
            entry of `attributes`, in the same order.
        attributes (list[str]): Display label for each curve pair.
        metric (str): Metric name shown on the y-axis and in the title. Defaults to
            'FPR', which reproduces the previous hard-coded labels.
        protected_label (str): Description of the protected set shown in the title.
            Defaults to the previous hard-coded text.
    """
    # Base palette; it is cycled so more than six attributes do not raise IndexError.
    colors = ['b', 'g', 'r', 'c', 'm', 'y']

    plt.figure(figsize=(12, 6))

    for i, attr in enumerate(attributes):
        x_axis, y_axis, baseline = val[i]
        color = colors[i % len(colors)]
        plt.plot(x_axis, y_axis, linestyle='-', label=f'GerryFair - {attr.capitalize()}', color=color)
        # Same colour at half opacity marks the matching baseline curve.
        plt.plot(x_axis, baseline, linestyle='-', label=f'LR Without Gerryfair - {attr.capitalize()}', color=color, alpha=0.5)

    # Fixed 0..1 range on the y-axis.
    plt.ylim(0, 1)

    plt.xlabel('Iterations')
    plt.ylabel(metric)
    plt.title(f'Comparison of Algorithms - {metric} vs Iterations ({protected_label})')

    # Place the legend outside the axes so it does not cover the curves.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()

    plt.show()

In [None]:
# Display labels for the plots; they are matched by position to the curves, so the
# order must follow the order of `attrs` used when the curves were generated.
attributes = ["Overall", "Black", "White", "Female", "Male", "Black + Female"]

In [None]:
# FPR curves: GerryFair vs. baseline across iterations, using the arrays reloaded from results/.
build_combined_graph(fpr_data, attributes)

In [None]:
# AUC curves: GerryFair vs. baseline across iterations.
# NOTE(review): as called here, the figure's y-label and title read "FPR" even
# though the data plotted is AUC.
build_combined_graph(auc_data, attributes)

In [None]:
def _subgroup_metric(rows, subgroup, key):
    """Return float(row[key]) for the first row whose 'subgroup' equals `subgroup`."""
    for row in rows:
        if row['subgroup'] == subgroup:
            return float(row[key])
    raise ValueError(f"subgroup {subgroup!r} not found in metric results")


def build_graph_gamma(iters, gammas, attrs, protected, is_fpr=True):
    """
    Plot a metric against GerryFair's gamma, one figure per subgroup.

    `metrics.calc_metrics` is run once per gamma value with GerryFair enabled. For
    each subgroup in `attrs`, the resulting averages are plotted next to a flat
    baseline taken from the non-GerryFair run stored in the module-level
    `lin_reg_res`.

    Args:
        iters (int): Nominal iteration count; `iters + 1` is passed to `calc_metrics`.
        gammas (sequence[float]): Gamma values to evaluate, used as the x-axis.
        attrs (list[str]): Subgroup labels, matched against each row's 'subgroup' field.
        protected (list[str]): Protected attribute names forwarded to `calc_metrics`.
        is_fpr (bool): Plot the FPR average when True, the AUC average otherwise.

    Raises:
        ValueError: If a requested subgroup is missing from a run or from the baseline.
    """
    metric = 'fpr' if is_fpr else 'auc'
    key = f'{metric}_avg'
    # Previously the label always said "False Positive Rate", even for AUC.
    y_label = 'False Positive Rate (FPR)' if is_fpr else 'AUC'

    results = [
        metrics.calc_metrics(X, y, combos, omit_demographics=True, demographics=demographics,
                             protected=protected, is_gerryfair=True, iters=iters + 1, gamma=g)
        for g in gammas
    ]

    # Converted once, outside the loop, without rebinding the `gammas` parameter.
    gamma_values = np.asarray(gammas)

    for a in attrs:
        values = np.array([_subgroup_metric(res, a, key) for res in results])

        # The baseline comes from the in-memory non-GerryFair results. The
        # `fpr_data` / `auc_data` names are rebound to 3D arrays earlier in the
        # notebook, so indexing them by subgroup label no longer works here.
        baseline = np.full(gamma_values.size, _subgroup_metric(lin_reg_res, a, key))

        # Plot the per-gamma values (the old code passed the raw result lists).
        plt.plot(gamma_values, values, marker='o', linestyle='-', label='GerryFair')

        plt.plot(gamma_values, baseline, marker='s', linestyle='-', label='Other Algorithm')

        plt.xlabel('Gamma')
        plt.ylabel(y_label)
        plt.title(f'Comparison of Algorithms - {metric.upper()} vs Gamma For ' + a.capitalize())

        plt.legend()

        plt.show()

In [None]:
# `protected` is a required positional argument of build_graph_gamma; the call
# previously omitted it and raised TypeError. The notebook-wide `protected`
# selection is passed so the sweep follows the configured protected set.
build_graph_gamma(5, [.002, .005, .01, .02, .05, .1], ['Overall', 'black', 'female', 'black, female'], protected)