In [7]:
import pandas as pd
import numpy as np
import warnings
from functions import metrics
import csv
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import mean_squared_error
from functions.formatting import get_indices, get_subgroup_str

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
include_demographics = True 
use_gerryfair = False 

In [12]:
df = pd.read_csv('./data/preprocessed.csv')
df.drop(['umich_user_id', 'Unnamed: 0'], axis=1, inplace=True)

In [13]:
X = df.drop('completed', axis=1)
y = df['completed']

#### Calculate false positive rates for the given subgroup

This function takes in the subgroup to test for as a list of tuples. We use logistic regression and 10-fold cross validation to calculate the average FPR and AUC for the given subgroup. 

Returns a tuple containing FPR average, FPR standard deviation, AUC average, AUC standard deviation, and number of students in a given subgroup

In [5]:
def calculate_fp_rates(X, y, subgroup, omit_demographics=False):
    col_subgroup = subgroup.copy()
    model = LogisticRegression(verbose=0)

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    # Gets index of given columns
    for i,d in enumerate(col_subgroup):
        col_subgroup[i] = (X.columns.get_loc(d[0]), d[1])

    to_drop = []
    for d in ['country_cd_US', 'is_female', 'bachelor_obtained']:
        to_drop.append(X.columns.get_loc(d))

    aucs = []
    fprs = []
    rmses = []

    y = np.array(y)
    X = np.array(X)

    subgroup_size = 0
    count = 0
    # Iterate through the each fold
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        train = X_train
        if omit_demographics:
            train = np.delete(train, to_drop, axis=1)                
        

        model.fit(train, y_train)
        test = X_test
        if omit_demographics:
            test = np.delete(test, to_drop, axis=1)

        y_pred = model.predict(test)

        # Check if a subgroup is provided or not
        if subgroup:
            # Filter out the data
            conditions = np.array([(X_test[:, name] == val) for name, val in col_subgroup]).all(axis=0)
            X_test_filtered = X_test[conditions]
            y_test_filtered = y_test[conditions]
            y_pred_filtered = None
            auc = None
            rmse = None

            # get total number of members in this subgroup
            subgroup_size += X_test_filtered.shape[0]
            count+=1
            test = X_test_filtered
            if omit_demographics:
                test = np.delete(test, to_drop, axis=1)
                    

            # This exception is thrown when there is not enough data for the given subgroup
            try:
                y_pred_filtered = model.predict(test)
                print(y_pred_filtered)
                auc = roc_auc_score(y_test_filtered, y_pred_filtered)
                rmse = np.sqrt(mean_squared_error(y_test_filtered, y_pred_filtered))

            except Exception:
                continue

            aucs.append(auc)
            rmses.append(rmse)

            # TP = np.sum((y_test[conditions] == 1) & (y_pred[conditions] == 1))
            # FN = np.sum((y_test[conditions] == 1) & (y_pred[conditions] == 0))
            TN = np.sum((y_test[conditions] == 0) & (y_pred[conditions] == 0))
            FP = np.sum((y_test[conditions] == 0) & (y_pred[conditions] == 1))      
        else:
            subgroup_size += X_test.shape[0]
            count+=1
            auc = roc_auc_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            aucs.append(auc) 
            rmses.append(rmse)

            # TP = np.sum((y_test == 1) & (y_pred == 1))
            # FN = np.sum((y_test == 1) & (y_pred == 0))
            TN = np.sum((y_test == 0) & (y_pred == 0))
            FP = np.sum((y_test == 0) & (y_pred == 1))

        # Calculate FPR
        FPR = FP / (FP + TN)
        fprs.append(FPR)

    auc_avg = np.average(aucs)
    auc_std = np.std(aucs)
    fpr_avg = np.average(fprs)
    fpr_std = np.std(fprs)
    rmse_avg = np.average(rmses)
    rmse_std = np.std(rmses)
    
    # print(f"AUC: {auc_avg} +/- {auc_std}")
    # print(f"FPR: {fpr_avg} +/- {fpr_std}")
    return (auc_avg, auc_std, fpr_avg, fpr_std, rmse_avg, rmse_std, subgroup_size/count)

In [6]:
# calculate_fp_rates(X, y, [])

#### Compute all subgroups

This function generates all possible subgroups given the list of protected columns

In [4]:
def compute_combo(cols, races, genders):
    retval = [[]]
    def recurse(i, curr):
        for r in races:
            opt3 = curr.copy()
            opt3.append((r, 1))
            if opt3 not in retval:
                retval.append(opt3)
        for g in genders:
            opt4 = curr.copy()
            opt4.append((g, 1))
            if opt4 not in retval:
                retval.append(opt4)
        for r in races:
            for g in genders:
                opt5 = curr.copy()
                opt5.append((r, 1))
                opt5.append((g, 1))
                if opt5 not in retval:
                    retval.append(opt5)
        for j in range(i, len(cols)):
            opt1 = curr.copy()
            opt1.append((cols[j], 0))

            opt2 = curr.copy()
            opt2.append((cols[j], 1))
            
            retval.append(opt1)
            retval.append(opt2)
            
            if j < len(cols):
                recurse(j+1, opt1.copy())
            if j < len(cols):
                recurse(j+1, opt2.copy())
        
    recurse(0, [])

    return retval

In [5]:
combos = compute_combo(['country_cd_US', 'bachelor_obtained'], ['white', 'black', 'asian', 'latinx', 'race_others', 'race_na'], ['male', 'female', 'gender_na']) 

In [6]:
combos.sort(key=len)
combos

[[],
 [('white', 1)],
 [('black', 1)],
 [('asian', 1)],
 [('latinx', 1)],
 [('race_others', 1)],
 [('race_na', 1)],
 [('male', 1)],
 [('female', 1)],
 [('gender_na', 1)],
 [('country_cd_US', 0)],
 [('country_cd_US', 1)],
 [('bachelor_obtained', 0)],
 [('bachelor_obtained', 1)],
 [('white', 1), ('male', 1)],
 [('white', 1), ('female', 1)],
 [('white', 1), ('gender_na', 1)],
 [('black', 1), ('male', 1)],
 [('black', 1), ('female', 1)],
 [('black', 1), ('gender_na', 1)],
 [('asian', 1), ('male', 1)],
 [('asian', 1), ('female', 1)],
 [('asian', 1), ('gender_na', 1)],
 [('latinx', 1), ('male', 1)],
 [('latinx', 1), ('female', 1)],
 [('latinx', 1), ('gender_na', 1)],
 [('race_others', 1), ('male', 1)],
 [('race_others', 1), ('female', 1)],
 [('race_others', 1), ('gender_na', 1)],
 [('race_na', 1), ('male', 1)],
 [('race_na', 1), ('female', 1)],
 [('race_na', 1), ('gender_na', 1)],
 [('country_cd_US', 0), ('white', 1)],
 [('country_cd_US', 0), ('black', 1)],
 [('country_cd_US', 0), ('asian', 

In [10]:
groups = {}
for combo in combos:
    group_key = []
    for c in combo:
        group_key.append(c[0])
    if tuple(group_key) not in groups:
        groups[tuple(group_key)] = []
    groups[tuple(group_key)].append(combo)

# sort dict by length of key
groups = dict(sorted(groups.items(), key=lambda item: len(item[0])))

print(groups)


{(): [[]], ('white',): [[('white', 1)]], ('black',): [[('black', 1)]], ('asian',): [[('asian', 1)]], ('latinx',): [[('latinx', 1)]], ('race_others',): [[('race_others', 1)]], ('race_na',): [[('race_na', 1)]], ('male',): [[('male', 1)]], ('female',): [[('female', 1)]], ('gender_na',): [[('gender_na', 1)]], ('country_cd_US',): [[('country_cd_US', 0)], [('country_cd_US', 1)]], ('bachelor_obtained',): [[('bachelor_obtained', 0)], [('bachelor_obtained', 1)]], ('white', 'male'): [[('white', 1), ('male', 1)]], ('white', 'female'): [[('white', 1), ('female', 1)]], ('white', 'gender_na'): [[('white', 1), ('gender_na', 1)]], ('black', 'male'): [[('black', 1), ('male', 1)]], ('black', 'female'): [[('black', 1), ('female', 1)]], ('black', 'gender_na'): [[('black', 1), ('gender_na', 1)]], ('asian', 'male'): [[('asian', 1), ('male', 1)]], ('asian', 'female'): [[('asian', 1), ('female', 1)]], ('asian', 'gender_na'): [[('asian', 1), ('gender_na', 1)]], ('latinx', 'male'): [[('latinx', 1), ('male', 1)]

In [11]:
# for c in combos: 
#     subgroup = []
#     for col, val in c:
#         if col == 'country_cd_US':
#             subgroup.append("US" if val == 1 else "International")
#         elif col == 'is_male':
#             subgroup.append("Male" if val == 1 else "Female/Other")
#         elif col == 'bachelor_obtained':
#             subgroup.append("Bachelor or higher" if val == 1 else "No Bachelor")
#     print(", ".join(subgroup))
#     fpr_avg, fpr_std, auc_avg, auc_std, n = calculate_fp_rates(X, y, c)
#     print(f"AUC: {auc_avg} +/- {auc_std}")
#     print(f"FPR: {fpr_avg} +/- {fpr_std}") 
#     print(f"Members of subgroup: {n}")
#     print()
    
    

Generate a list of all subgroup data and write it to a csv

In [12]:
all_subgroup_data = []

for k, v in groups.items():
    for c in v:
        subgroup = []
        if c:
            for col, val in c:
                if col == 'country_cd_US':
                    subgroup.append("US" if val == 1 else "International")
                # elif col == 'is_female':
                #     subgroup.append("Female" if val == 1 else "Male/Other")
                elif col == 'bachelor_obtained':
                    subgroup.append("Bachelor or higher" if val == 1 else "No Bachelor")
                else:
                    subgroup.append(col)
        else:
            subgroup.append("Overall")

        print(X)    
        # auc_avg, auc_std, fpr_avg, fpr_std, rmse_avg, rmse_std, n = calculate_fp_rates(X, y, c, omit_demographics=not include_demographics)
        auc_avg, auc_std, fpr_avg, fpr_std, rmse_avg, rmse_std, n = metrics.calc_metrics(X, y, c, omit_demographics=not include_demographics, demographics=['country_cd_US', 'bachelor_obtained'], is_gerryfair=use_gerryfair)
        # auc_avg, auc_std, fpr_avg, fpr_std, rmse_avg, rmse_std, n = 0, 0, 0, 0, 0, 0, 0
        subgroup_str = ", ".join(subgroup)
        if n < 10:
            subgroup_str += "*"
        elif n >= 10 and n < 30:
            subgroup_str += "**"
        # Create a dictionary for each subgroup
        subgroup_data = {
            'subgroup': subgroup_str,
            'n': f"{n}",
            'auc_avg': f"{auc_avg:.3f}",
            'auc_std': f"{auc_std:.3f}",
            'fpr_avg': f"{fpr_avg:.3f}",
            'fpr_std': f"{fpr_std:.3f}",
            'rmse_avg': f"{rmse_avg:.3f}",
            'rmse_std': f"{rmse_std:.3f}"
        }

        all_subgroup_data.append(subgroup_data)

      answer_count  average_answer_length  total_votes_given_answers  \
0              0.0               0.000000                        0.0   
1              0.0               0.000000                        0.0   
2             10.0             797.200000                        0.0   
3              5.0             589.000000                        1.0   
4             11.0             300.818182                        3.0   
...            ...                    ...                        ...   
1093          14.0             532.071429                        0.0   
1094           0.0               0.000000                        0.0   
1095           9.0             894.111111                        0.0   
1096           0.0               0.000000                        0.0   
1097           2.0             534.000000                        0.0   

      total_votes_received_answers  question_count  average_question_length  \
0                              0.0             0.0      

AttributeError: 'list' object has no attribute 'items'

In [18]:
csv_file = f"./data/MTC508_subgroup_data_with{'' if include_demographics else 'out'}_demographics.csv"
# csv_file = "test.csv"

# Write the data to a CSV file
with open(csv_file, 'w', newline='') as csvfile:
    fieldnames = [
        'subgroup', 'n', 'auc_avg', 'auc_std', 'fpr_avg', 'fpr_std', 'rmse_avg', 'rmse_std']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()  # Write the header row
    for row in res:
        writer.writerow(row)

In [None]:
# np.delete(X, 10, axis=1).shape

In [None]:
# all_subgroup_data[0]['n']

In [17]:
res = metrics.calc_metrics(X, y, groups, omit_demographics=not include_demographics, demographics=['country_cd_US','female','bachelor_obtained','white'], is_gerryfair=False)

[1 1 0 1 1 1 1 0 0 1 0 1 0 1 1 0 1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 1 0 0 0 1 1 0 0 1 0 1 1 1 0 0 1 0 0 1 1 1 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0
 0 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1
 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 0 1
 0 0 1 0 0 0 0 1 1 1 1 0 1 1 0 1 1 1 0 1 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0 0 1
 1 1 1 0 0 0 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0] [1 1 0 1 1 1 1 0 0 1 0 1 0 1 1 0 1 1 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1 1 0 1
 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0
 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0 0 0 1 1 1
 0 0 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 1 1 0 0 1 0 1 1 1 0 1 1 0 1 1 0 1 0 1 0 1
 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0] 0.17117117117117117
[1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0 0 1 0
 1 1 0 0 1 

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [48]:
from sklearn.metrics import roc_auc_score, mean_squared_error
import numpy as np
import pandas as pd
import gerryfair
from functions.build_models import gerryfair_model, logr_model
from sklearn.model_selection import KFold
from functions.formatting import get_indices, get_subgroup_str

def calc_metrics(X, y, subgroups_dict, demographics, omit_demographics=False, is_gerryfair = False):
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    subgroups, names = get_indices(subgroups_dict, X)
    # subgroups = subgroups[:5]
    # names = names[:5]

    to_drop = []

    for d in demographics:
        to_drop.append(X.columns.get_loc(d))

    aucs = [[] for _ in subgroups]
    fprs = [[] for _ in subgroups]
    rmses = [[] for _ in subgroups]

    # dataset, attributes = generate_preprocessed.create_attributes(X, y, demographics)
    # X, X_prime, y = gerryfair.clean.clean_dataset(dataset, attributes, False)
    X_prime = X.loc[:, demographics]

    X_cols, X_prime_cols = X.columns, X_prime.columns
    X, X_prime, y = np.array(X), np.array(X_prime), np.array(y) 

    subgroup_sizes = np.zeros(len(subgroups))
    counts = np.zeros(len(subgroups))
    # Iterate through the each fold
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_prime_train, X_prime_test = X_prime[train_index], X_prime[test_index]

        model = None

        if is_gerryfair:
            X_train_df = pd.DataFrame(X_train, columns=X_cols)
            X_prime_train_df = pd.DataFrame(X_prime_train, columns=X_prime_cols)
            y_train_df = pd.Series(y_train)

            model = gerryfair_model(X_train_df, X_prime_train_df, y_train_df)
        else:
            model = logr_model(X_train, y_train)
        
        for i, subgroup in enumerate(subgroups):
            X_test_filtered = X_test
            y_test_filtered = y_test
            if subgroup:
                conditions = np.array([(X_test_filtered[:, name] == val) for name, val in subgroup]).all(axis=0)
                X_test_filtered = X_test_filtered[conditions]
                y_test_filtered = y_test_filtered[conditions]

            subgroup_sizes[i] += X_test_filtered.shape[0]
            counts[i]+=1

            # if omit_demographics and not is_gerryfair:
            #     X_train = np.delete(X_train, to_drop, axis=1)    
            #     X_test_filtered = np.delete(X_test_filtered, to_drop, axis=1)        

            res = None
            if is_gerryfair:
                X_test_df = pd.DataFrame(X_test_filtered, columns=X_cols)
                res = calc_metric(model, X_test_df, y_test_filtered, True)
            else:
                res = calc_metric(model, X_test_filtered, y_test_filtered, False)
        
        
            if res == None:
                continue

            auc, FPR, rmse = res
            aucs[i].append(auc)
            fprs[i].append(FPR)
            rmses[i].append(rmse)

    ret = []

    for i, subgroup in enumerate(names):
        subgroup_data = {
            'subgroup': get_subgroup_str(subgroup),
            'n': f"{subgroup_sizes[i]/counts[i]}",
            'auc_avg': f"{np.average(aucs[i]):.3f}",
            'auc_std': f"{np.std(aucs[i]):.3f}",
            'fpr_avg': f"{np.average(fprs[i]):.3f}",
            'fpr_std': f"{np.std(fprs[i]):.3f}",
            'rmse_avg': f"{np.average(rmses[i]):.3f}",
            'rmse_std': f"{np.std(rmses[i]):.3f}"
        }

        ret.append(subgroup_data)
    
    # print(f"AUC: {auc_avg} +/- {auc_std}")
    # print(f"FPR: {fpr_avg} +/- {fpr_std}")
    return ret

def calc_metric(model, X_test, y_test, is_gerryfair):
    auc = None
    rmse = None
    y_pred = None
    try:
        y_pred = np.array(model.predict(X_test))
        if is_gerryfair:
            y_pred = y_pred.ravel()
        # if is_gerryfair:
        #     print(y_pred)
        #     y_pred = (y_pred.values >= 0.5).astype(int)
        auc = roc_auc_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    except Exception:
        print('error')
        return None
    y_test = np.array(y_test)
    TN = np.sum((y_test == 0) & (y_pred == 0))
    FP = np.sum((y_test == 0) & (y_pred == 1))      
    FPR = FP / (FP + TN)

    print(y_test, y_pred, FPR)

    return auc, FPR, rmse


In [31]:
X.head()

Unnamed: 0,answer_count,average_answer_length,total_votes_given_answers,total_votes_received_answers,question_count,average_question_length,total_votes_given_questions,total_votes_received_questions,total_questions_following,country_cd_US,female,male,gender_na,bachelor_obtained,white,latinx,black,asian,race_others,race_na
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,0,1,0,0,0,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,0,1,0,0,0,0,0,1
2,10.0,797.2,0.0,0.0,0.0,797.2,0.0,0.0,0.0,1,1,0,0,0,1,0,0,0,0,0
3,5.0,589.0,1.0,1.0,1.0,589.0,0.0,4.0,1.0,1,1,0,0,1,1,0,0,0,1,0
4,11.0,300.818182,3.0,3.0,0.0,300.818182,0.0,3.0,0.0,1,0,0,1,0,1,0,0,0,0,0


In [28]:
res

[{'subgroup': 'Overall',
  'n': '109.8',
  'auc_avg': '0.777',
  'auc_std': '0.029',
  'fpr_avg': '0.433',
  'fpr_std': '0.044',
  'rmse_avg': '0.471',
  'rmse_std': '0.029'},
 {'subgroup': 'white',
  'n': '36.7',
  'auc_avg': '0.761',
  'auc_std': '0.100',
  'fpr_avg': '0.653',
  'fpr_std': '0.053',
  'rmse_avg': '0.461',
  'rmse_std': '0.106'},
 {'subgroup': 'black',
  'n': '8.5',
  'auc_avg': '0.690',
  'auc_std': '0.231',
  'fpr_avg': '0.535',
  'fpr_std': '0.165',
  'rmse_avg': '0.509',
  'rmse_std': '0.223'},
 {'subgroup': 'asian',
  'n': '5.5',
  'auc_avg': '0.817',
  'auc_std': '0.190',
  'fpr_avg': '0.488',
  'fpr_std': '0.185',
  'rmse_avg': '0.310',
  'rmse_std': '0.266'},
 {'subgroup': 'latinx',
  'n': '4.7',
  'auc_avg': '0.670',
  'auc_std': '0.200',
  'fpr_avg': '0.308',
  'fpr_std': '0.251',
  'rmse_avg': '0.516',
  'rmse_std': '0.294'},
 {'subgroup': 'race_others',
  'n': '1.9',
  'auc_avg': '0.667',
  'auc_std': '0.264',
  'fpr_avg': '0.688',
  'fpr_std': '0.207',
  '

In [26]:
res

[{'subgroup': 'Overall',
  'n': '109.8',
  'auc_avg': '0.780',
  'auc_std': '0.033',
  'fpr_avg': '0.156',
  'fpr_std': '0.053',
  'rmse_avg': '0.467',
  'rmse_std': '0.035'},
 {'subgroup': 'white',
  'n': '36.7',
  'auc_avg': '0.762',
  'auc_std': '0.097',
  'fpr_avg': '0.313',
  'fpr_std': '0.132',
  'rmse_avg': '0.454',
  'rmse_std': '0.108'},
 {'subgroup': 'black',
  'n': '8.5',
  'auc_avg': '0.704',
  'auc_std': '0.202',
  'fpr_avg': '0.360',
  'fpr_std': '0.301',
  'rmse_avg': '0.502',
  'rmse_std': '0.213'},
 {'subgroup': 'asian',
  'n': '5.5',
  'auc_avg': '0.817',
  'auc_std': '0.181',
  'fpr_avg': '0.145',
  'fpr_std': '0.198',
  'rmse_avg': '0.307',
  'rmse_std': '0.259'},
 {'subgroup': 'latinx',
  'n': '4.7',
  'auc_avg': '0.680',
  'auc_std': '0.178',
  'fpr_avg': '0.252',
  'fpr_std': '0.309',
  'rmse_avg': '0.539',
  'rmse_std': '0.217'},
 {'subgroup': 'race_others',
  'n': '1.9',
  'auc_avg': '0.458',
  'auc_std': '0.072',
  'fpr_avg': '1.000',
  'fpr_std': '0.000',
  '

In [18]:
csv_file = f"./data/new_gerryfair_metrics.csv"
# csv_file = "test.csv"

# Write the data to a CSV file
with open(csv_file, 'w', newline='') as csvfile:
    fieldnames = [
        'subgroup', 'n', 'auc_avg', 'auc_std', 'fpr_avg', 'fpr_std', 'rmse_avg', 'rmse_std']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()  # Write the header row
    for row in res:
        writer.writerow(row)

In [12]:
res

[{'subgroup': 'Overall',
  'n': '109.8',
  'auc_avg': '0.546',
  'auc_std': '0.051',
  'fpr_avg': '0.128',
  'fpr_std': '0.077',
  'rmse_avg': '0.536',
  'rmse_std': '0.024'},
 {'subgroup': 'white',
  'n': '36.7',
  'auc_avg': '0.787',
  'auc_std': '0.099',
  'fpr_avg': '0.017',
  'fpr_std': '0.050',
  'rmse_avg': '0.530',
  'rmse_std': '0.047'},
 {'subgroup': 'black',
  'n': '8.5',
  'auc_avg': '0.721',
  'auc_std': '0.209',
  'fpr_avg': 'nan',
  'fpr_std': 'nan',
  'rmse_avg': '0.530',
  'rmse_std': '0.135'},
 {'subgroup': 'asian',
  'n': '5.5',
  'auc_avg': '0.835',
  'auc_std': '0.227',
  'fpr_avg': 'nan',
  'fpr_std': 'nan',
  'rmse_avg': '0.347',
  'rmse_std': '0.151'},
 {'subgroup': 'latinx',
  'n': '4.7',
  'auc_avg': '0.675',
  'auc_std': '0.196',
  'fpr_avg': '0.000',
  'fpr_std': '0.000',
  'rmse_avg': '0.521',
  'rmse_std': '0.213'},
 {'subgroup': 'race_others',
  'n': '1.9',
  'auc_avg': '0.812',
  'auc_std': '0.123',
  'fpr_avg': 'nan',
  'fpr_std': 'nan',
  'rmse_avg': '

In [19]:
demo_groups = [['country_cd_US'], ['bachelor_obtained'], ['white', 'black', 'asian', 'latinx', 'race_others', 'race_na'], ['male', 'female', 'gender_na']]

In [27]:
data = {}

data['overall'] = len(X)

for c in combos[1:]:
    masks = [X[name] == value for name, value in c]
    print(masks)
    final_mask = pd.concat(masks, axis=1).all(axis=1)
    filter_X = X[final_mask]
    data[get_subgroup_str(c)] = len(filter_X)




[0       False
1       False
2        True
3        True
4        True
        ...  
1093    False
1094     True
1095    False
1096    False
1097    False
Name: white, Length: 1098, dtype: bool]
[0       False
1       False
2       False
3       False
4       False
        ...  
1093    False
1094    False
1095    False
1096     True
1097    False
Name: black, Length: 1098, dtype: bool]
[0       False
1       False
2       False
3       False
4       False
        ...  
1093    False
1094    False
1095    False
1096    False
1097    False
Name: asian, Length: 1098, dtype: bool]
[0       False
1       False
2       False
3       False
4       False
        ...  
1093    False
1094    False
1095    False
1096    False
1097    False
Name: latinx, Length: 1098, dtype: bool]
[0       False
1       False
2       False
3        True
4       False
        ...  
1093    False
1094    False
1095    False
1096    False
1097    False
Name: race_others, Length: 1098, dtype: bool]
[0        True
1  

In [28]:
data

{'overall': 1098,
 'white': 367,
 'black': 85,
 'asian': 55,
 'latinx': 47,
 'race_others': 19,
 'race_na': 470,
 'male': 173,
 'female': 691,
 'gender_na': 234,
 'International': 121,
 'US': 977,
 'No Bachelor': 601,
 'Bachelor or higher': 497,
 'white, male': 57,
 'white, female': 257,
 'white, gender_na': 53,
 'black, male': 21,
 'black, female': 34,
 'black, gender_na': 30,
 'asian, male': 9,
 'asian, female': 32,
 'asian, gender_na': 14,
 'latinx, male': 5,
 'latinx, female': 34,
 'latinx, gender_na': 8,
 'race_others, male': 1,
 'race_others, female': 12,
 'race_others, gender_na': 6,
 'race_na, male': 79,
 'race_na, female': 285,
 'race_na, gender_na': 106,
 'International, white': 22,
 'International, black': 15,
 'International, asian': 14,
 'International, latinx': 2,
 'International, race_others': 3,
 'International, race_na': 61,
 'International, male': 23,
 'International, female': 61,
 'International, gender_na': 37,
 'International, No Bachelor': 60,
 'International, Bac

In [29]:
df = pd.DataFrame(list(data.items()), columns=['Key', 'Count'])

# Specify the CSV file path
csv_file_path = 'data/MTC508_data_counts.csv'

# Write the DataFrame to a CSV file
df.to_csv(csv_file_path, index=False)

In [21]:
combos

[[],
 [('white', 1)],
 [('black', 1)],
 [('asian', 1)],
 [('latinx', 1)],
 [('race_others', 1)],
 [('race_na', 1)],
 [('male', 1)],
 [('female', 1)],
 [('gender_na', 1)],
 [('country_cd_US', 0)],
 [('country_cd_US', 1)],
 [('bachelor_obtained', 0)],
 [('bachelor_obtained', 1)],
 [('white', 1), ('male', 1)],
 [('white', 1), ('female', 1)],
 [('white', 1), ('gender_na', 1)],
 [('black', 1), ('male', 1)],
 [('black', 1), ('female', 1)],
 [('black', 1), ('gender_na', 1)],
 [('asian', 1), ('male', 1)],
 [('asian', 1), ('female', 1)],
 [('asian', 1), ('gender_na', 1)],
 [('latinx', 1), ('male', 1)],
 [('latinx', 1), ('female', 1)],
 [('latinx', 1), ('gender_na', 1)],
 [('race_others', 1), ('male', 1)],
 [('race_others', 1), ('female', 1)],
 [('race_others', 1), ('gender_na', 1)],
 [('race_na', 1), ('male', 1)],
 [('race_na', 1), ('female', 1)],
 [('race_na', 1), ('gender_na', 1)],
 [('country_cd_US', 0), ('white', 1)],
 [('country_cd_US', 0), ('black', 1)],
 [('country_cd_US', 0), ('asian', 

In [9]:
combos

[[],
 [('white', 1)],
 [('black', 1)],
 [('asian', 1)],
 [('latinx', 1)],
 [('race_others', 1)],
 [('race_na', 1)],
 [('male', 1)],
 [('female', 1)],
 [('gender_na', 1)],
 [('country_cd_US', 0)],
 [('country_cd_US', 1)],
 [('bachelor_obtained', 0)],
 [('bachelor_obtained', 1)],
 [('white', 1), ('male', 1)],
 [('white', 1), ('female', 1)],
 [('white', 1), ('gender_na', 1)],
 [('black', 1), ('male', 1)],
 [('black', 1), ('female', 1)],
 [('black', 1), ('gender_na', 1)],
 [('asian', 1), ('male', 1)],
 [('asian', 1), ('female', 1)],
 [('asian', 1), ('gender_na', 1)],
 [('latinx', 1), ('male', 1)],
 [('latinx', 1), ('female', 1)],
 [('latinx', 1), ('gender_na', 1)],
 [('race_others', 1), ('male', 1)],
 [('race_others', 1), ('female', 1)],
 [('race_others', 1), ('gender_na', 1)],
 [('race_na', 1), ('male', 1)],
 [('race_na', 1), ('female', 1)],
 [('race_na', 1), ('gender_na', 1)],
 [('country_cd_US', 0), ('white', 1)],
 [('country_cd_US', 0), ('black', 1)],
 [('country_cd_US', 0), ('asian', 

In [26]:
data

{'white': 367,
 'black': 85,
 'asian': 55,
 'latinx': 47,
 'race_others': 19,
 'race_na': 470,
 'male': 173,
 'female': 691,
 'gender_na': 234,
 'International': 121,
 'US': 977,
 'No Bachelor': 601,
 'Bachelor or higher': 497,
 'white, male': 57,
 'white, female': 257,
 'white, gender_na': 53,
 'black, male': 21,
 'black, female': 34,
 'black, gender_na': 30,
 'asian, male': 9,
 'asian, female': 32,
 'asian, gender_na': 14,
 'latinx, male': 5,
 'latinx, female': 34,
 'latinx, gender_na': 8,
 'race_others, male': 1,
 'race_others, female': 12,
 'race_others, gender_na': 6,
 'race_na, male': 79,
 'race_na, female': 285,
 'race_na, gender_na': 106,
 'International, white': 22,
 'International, black': 15,
 'International, asian': 14,
 'International, latinx': 2,
 'International, race_others': 3,
 'International, race_na': 61,
 'International, male': 23,
 'International, female': 61,
 'International, gender_na': 37,
 'International, No Bachelor': 60,
 'International, Bachelor or higher': 