In [98]:
import pandas as pd
import numpy as np
import warnings
import csv
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [99]:
include_demographics = True 

In [100]:
df = pd.read_csv('./preprocessed.csv')
df.drop(['umich_user_id', 'Unnamed: 0'], axis=1, inplace=True)

In [101]:
X = df.drop('completed', axis=1)
y = df['completed']

#### Calculate false positive rates for the given subgroup

This function takes in the subgroup to test for as a list of tuples. We use logistic regression and 10-fold cross validation to calculate the average FPR and AUC for the given subgroup. 

Returns a tuple containing FPR average, FPR standard deviation, AUC average, AUC standard deviation, and number of students in a given subgroup

In [102]:
def calculate_fp_rates(X, y, subgroup, omit_demographics=False):
    col_subgroup = subgroup.copy()
    model = LogisticRegression(verbose=0)

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    # Gets index of given columns
    for i,d in enumerate(col_subgroup):
        col_subgroup[i] = (X.columns.get_loc(d[0]), d[1])

    to_drop = []
    for d in ['country_cd_US', 'is_female', 'bachelor_obtained']:
        to_drop.append(X.columns.get_loc(d))

    aucs = []
    fprs = []
    rmses = []

    y = np.array(y)
    X = np.array(X)

    subgroup_size = 0
    count = 0
    # Iterate through the each fold
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        train = X_train
        if omit_demographics:
            train = np.delete(train, to_drop, axis=1)                
        

        model.fit(train, y_train)
        test = X_test
        if omit_demographics:
            test = np.delete(test, to_drop, axis=1)

        y_pred = model.predict(test)

        # Check if a subgroup is provided or not
        if subgroup:
            # Filter out the data
            conditions = np.array([(X_test[:, name] == val) for name, val in col_subgroup]).all(axis=0)
            X_test_filtered = X_test[conditions]
            y_test_filtered = y_test[conditions]
            y_pred_filtered = None
            auc = None
            rmse = None

            # get total number of members in this subgroup
            subgroup_size += X_test_filtered.shape[0]
            count+=1
            test = X_test_filtered
            if omit_demographics:
                test = np.delete(test, to_drop, axis=1)
                    

            # This exception is thrown when there is not enough data for the given subgroup
            try:
                y_pred_filtered = model.predict(test)
                auc = roc_auc_score(y_test_filtered, y_pred_filtered)
                rmse = np.sqrt(mean_squared_error(y_test_filtered, y_pred_filtered))

            except Exception:
                continue

            aucs.append(auc)
            rmses.append(rmse)

            # TP = np.sum((y_test[conditions] == 1) & (y_pred[conditions] == 1))
            # FN = np.sum((y_test[conditions] == 1) & (y_pred[conditions] == 0))
            TN = np.sum((y_test[conditions] == 0) & (y_pred[conditions] == 0))
            FP = np.sum((y_test[conditions] == 0) & (y_pred[conditions] == 1))      
        else:
            subgroup_size += X_test.shape[0]
            count+=1
            auc = roc_auc_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            aucs.append(auc) 
            rmses.append(rmse)

            # TP = np.sum((y_test == 1) & (y_pred == 1))
            # FN = np.sum((y_test == 1) & (y_pred == 0))
            TN = np.sum((y_test == 0) & (y_pred == 0))
            FP = np.sum((y_test == 0) & (y_pred == 1))

        # Calculate FPR
        FPR = FP / (FP + TN)
        fprs.append(FPR)

    auc_avg = np.average(aucs)
    auc_std = np.std(aucs)
    fpr_avg = np.average(fprs)
    fpr_std = np.std(fprs)
    rmse_avg = np.average(rmses)
    rmse_std = np.std(rmses)
    
    # print(f"AUC: {auc_avg} +/- {auc_std}")
    # print(f"FPR: {fpr_avg} +/- {fpr_std}")
    return (auc_avg, auc_std, fpr_avg, fpr_std, rmse_avg, rmse_std, subgroup_size/count)

In [103]:
# calculate_fp_rates(X, y, [])

#### Compute all subgroups

This function generates all possible subgroups given the list of protected columns

In [104]:
def compute_combo(cols):
    retval = [[]]
    def recurse(i, curr):
        for j in range(i, len(cols)):
            opt1 = curr.copy()
            opt1.append((cols[j], 0))

            opt2 = curr.copy()
            opt2.append((cols[j], 1))
            
            retval.append(opt1)
            retval.append(opt2)
            if j < len(cols) - 1:
                recurse(j+1, opt1.copy())
            if j < len(cols) - 1:
                recurse(j+1, opt2.copy())


    recurse(0, [])

    return retval

In [105]:
combos = compute_combo(['country_cd_US', 'is_female', 'bachelor_obtained', 'white'])

In [106]:
combos.sort(key=len)

In [107]:
# for c in combos: 
#     subgroup = []
#     for col, val in c:
#         if col == 'country_cd_US':
#             subgroup.append("US" if val == 1 else "International")
#         elif col == 'is_male':
#             subgroup.append("Male" if val == 1 else "Female/Other")
#         elif col == 'bachelor_obtained':
#             subgroup.append("Bachelor or higher" if val == 1 else "No Bachelor")
#     print(", ".join(subgroup))
#     fpr_avg, fpr_std, auc_avg, auc_std, n = calculate_fp_rates(X, y, c)
#     print(f"AUC: {auc_avg} +/- {auc_std}")
#     print(f"FPR: {fpr_avg} +/- {fpr_std}") 
#     print(f"Members of subgroup: {n}")
#     print()
    
    

Generate a list of all subgroup data and write it to a csv

In [108]:
all_subgroup_data = []

for c in combos:
    subgroup = []
    if c:
        for col, val in c:
            if col == 'country_cd_US':
                subgroup.append("US" if val == 1 else "International")
            elif col == 'is_female':
                subgroup.append("Female" if val == 1 else "Male/Other")
            elif col == 'bachelor_obtained':
                subgroup.append("Bachelor or higher" if val == 1 else "No Bachelor")
            elif col == 'white':
                subgroup.append("White" if val == 1 else "Non-white")
    else:
        subgroup.append("Overall")

    auc_avg, auc_std, fpr_avg, fpr_std, rmse_avg, rmse_std, n = calculate_fp_rates(X, y, c, omit_demographics=not include_demographics)

    subgroup_str = ", ".join(subgroup)
    if n < 10:
        subgroup_str += "*"
    elif n >= 10 and n < 30:
        subgroup_str += "**"
    # Create a dictionary for each subgroup
    subgroup_data = {
        'subgroup': subgroup_str,
        'n': f"{n}",
        'auc_avg': f"{auc_avg:.3f}",
        'auc_std': f"{auc_std:.3f}",
        'fpr_avg': f"{fpr_avg:.3f}",
        'fpr_std': f"{fpr_std:.3f}",
        'rmse_avg': f"{rmse_avg:.3f}",
        'rmse_std': f"{rmse_std:.3f}"
    }

    all_subgroup_data.append(subgroup_data)

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [109]:
csv_file = f"MTC508_subgroup_data_with{'' if include_demographics else 'out'}_demographics.csv"
data = sorted(all_subgroup_data, key=lambda x: -float(x['n']))

# Write the data to a CSV file
with open(csv_file, 'w', newline='') as csvfile:
    fieldnames = [
        'subgroup', 'n', 'auc_avg', 'auc_std', 'fpr_avg', 'fpr_std', 'rmse_avg', 'rmse_std']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()  # Write the header row
    for row in data:
        writer.writerow(row)

In [110]:
np.delete(X, 10, axis=1).shape

(1098, 12)

In [111]:
all_subgroup_data[0]['n']

'109.8'