In [70]:
import pandas as pd
import numpy as np
import warnings
import csv
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [71]:
# Read the preprocessed table and discard the two columns that are not used below.
df = pd.read_csv('./preprocessed.csv').drop(columns=['umich_user_id', 'Unnamed: 0'])

In [72]:
# Target is the 'completed' column; every remaining column is a model feature.
y = df['completed']
X = df.drop(columns=['completed'])

#### Calculate false positive rates for the given subgroup

This function takes in the subgroup to test for as a list of tuples. We use logistic regression and 10-fold cross validation to calculate the average FPR and AUC for the given subgroup. 

Returns a tuple containing the AUC average, AUC standard deviation, FPR average, FPR standard deviation, and the number of students in the given subgroup.

In [73]:
def calculate_fp_rates(X, y, subgroup):
    """Cross-validate a logistic regression and report AUC and FPR for a subgroup.

    A fresh fit is done on the training part of each of 10 shuffled folds
    (fixed seed 42). Both metrics are then measured on the rows of the test
    part that match every ``(column, value)`` condition in ``subgroup``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Must expose ``.columns`` so the subgroup column names
        can be resolved to positions.
    y : array-like
        Target labels, aligned with the rows of ``X``.
        NOTE(review): the FPR computation compares against the literals 0 and 1,
        so this assumes a binary 0/1 target - confirm against the data.
    subgroup : list of (str, value) tuples
        Conditions that must all hold for a row to belong to the subgroup.
        An empty list (or ``None``) selects every row.

    Returns
    -------
    tuple
        ``(auc_avg, auc_std, fpr_avg, fpr_std, subgroup_size)``. The averages
        and standard deviations are taken over the folds in which the subgroup
        contained both classes; folds where it did not are skipped for both
        metrics. ``subgroup_size`` counts the matching rows over all folds,
        including skipped ones. If no fold was usable the four statistics are
        NaN.
    """
    conditions = list(subgroup) if subgroup else []

    # Resolve column names to positional indices while X is still a DataFrame.
    col_subgroup = [(X.columns.get_loc(name), val) for name, val in conditions]

    model = LogisticRegression(verbose=0)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    X = np.array(X)
    y = np.array(y)

    aucs = []
    fprs = []
    subgroup_size = 0

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # ROC AUC must be computed from continuous scores; thresholded labels
        # collapse the curve to a single operating point.
        y_score = model.predict_proba(X_test)[:, 1]

        # Boolean mask of the test rows in the subgroup (all rows when no
        # conditions were given).
        mask = np.ones(X_test.shape[0], dtype=bool)
        for col_idx, val in col_subgroup:
            mask &= X_test[:, col_idx] == val

        subgroup_size += int(mask.sum())

        y_true_sub = y_test[mask]
        # AUC is undefined when the subgroup is empty or single-class in this
        # fold. Skip the fold explicitly rather than relying on an exception.
        if np.unique(y_true_sub).size < 2:
            continue

        aucs.append(roc_auc_score(y_true_sub, y_score[mask]))

        y_pred_sub = y_pred[mask]
        negatives = y_true_sub == 0
        FP = np.sum(negatives & (y_pred_sub == 1))
        TN = np.sum(negatives & (y_pred_sub == 0))
        fprs.append(FP / (FP + TN))

    if aucs:
        auc_avg, auc_std = np.average(aucs), np.std(aucs)
        fpr_avg, fpr_std = np.average(fprs), np.std(fprs)
    else:
        # No fold contained a usable subgroup sample.
        auc_avg = auc_std = fpr_avg = fpr_std = np.nan

    return (auc_avg, auc_std, fpr_avg, fpr_std, subgroup_size)

In [74]:
# calculate_fp_rates(X, y, [])

#### Compute all subgroups

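This function generates every possible subgroup from the list of protected columns: each non-empty subset of the columns with every column in it fixed to either 0 or 1, plus the empty subgroup, which stands for the overall population.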

In [75]:
def compute_combo(cols):
    """Enumerate every subgroup definition over the given binary columns.

    A subgroup is a list of ``(column, value)`` tuples with ``value`` in
    ``{0, 1}``. Columns always appear in the order they have in ``cols``, and
    each column appears at most once per subgroup. The first entry of the
    result is the empty list (no conditions).

    Parameters
    ----------
    cols : list
        Column names to combine.

    Returns
    -------
    list of list of tuple
        All subgroup definitions, the empty one first.
    """
    combos = [[]]
    n_cols = len(cols)

    def extend(start, prefix):
        # Extend `prefix` with each later column, once per binary value.
        for idx in range(start, n_cols):
            branches = [prefix + [(cols[idx], flag)] for flag in (0, 1)]
            combos.extend(branches)
            # Past the last column the range is empty, so recursion stops.
            for branch in branches:
                extend(idx + 1, branch)

    extend(0, [])
    return combos

In [76]:
# Protected attributes whose 0/1 combinations define the subgroups.
protected_columns = ['country_cd_US', 'is_male', 'bachelor_obtained']
combos = compute_combo(protected_columns)

In [77]:
# Fewest conditions first; the sort is stable, so ties keep generation order.
combos = sorted(combos, key=len)

In [78]:
# for c in combos: 
#     subgroup = []
#     for col, val in c:
#         if col == 'country_cd_US':
#             subgroup.append("US" if val == 1 else "International")
#         elif col == 'is_male':
#             subgroup.append("Male" if val == 1 else "Female/Other")
#         elif col == 'bachelor_obtained':
#             subgroup.append("Bachelor or higher" if val == 1 else "No Bachelor")
#     print(", ".join(subgroup))
#     auc_avg, auc_std, fpr_avg, fpr_std, n = calculate_fp_rates(X, y, c)
#     print(f"AUC: {auc_avg} +/- {auc_std}")
#     print(f"FPR: {fpr_avg} +/- {fpr_std}") 
#     print(f"Members of subgroup: {n}")
#     print()
    
    

Generate a list of all subgroup data and write it to a csv

In [79]:
# Display names per protected column: (name when the flag is 1, name otherwise).
subgroup_labels = {
    'country_cd_US': ("US", "International"),
    'is_male': ("Male", "Female/Other"),
    'bachelor_obtained': ("Bachelor or higher", "No Bachelor"),
}

all_subgroup_data = []

for combo in combos:
    if combo:
        # Columns without a display name contribute nothing to the label.
        names = [
            subgroup_labels[col][0] if val == 1 else subgroup_labels[col][1]
            for col, val in combo
            if col in subgroup_labels
        ]
    else:
        # The empty condition list stands for the whole population.
        names = ["Overall"]

    auc_avg, auc_std, fpr_avg, fpr_std, n = calculate_fp_rates(X, y, combo)

    # Flag small subgroups: "*" for fewer than 10 members, "**" for 10 to 29.
    if n < 10:
        marker = "*"
    elif n < 30:
        marker = "**"
    else:
        marker = ""

    # One record per subgroup, with every value already formatted as text.
    all_subgroup_data.append({
        'subgroup': ", ".join(names) + marker,
        'auc_avg': format(auc_avg, ".3f"),
        'auc_std': format(auc_std, ".3f"),
        'fpr_avg': format(fpr_avg, ".3f"),
        'fpr_std': format(fpr_std, ".3f"),
        'number_of_members': str(n),
    })

In [80]:
csv_file = 'subgroup_data_with_demographics.csv'
fieldnames = ['subgroup', 'auc_avg', 'auc_std', 'fpr_avg', 'fpr_std', 'number_of_members']

# Write a header row followed by one row per subgroup record.
with open(csv_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_subgroup_data)