In [2]:
import os
import pandas as pd
import glob
import yaml
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Root directory; the other locations in this cell are built from it.
data_path = 'data'
# Searched recursively by later cells for `config.csv` files.
config_path = os.path.join(data_path, 'config', 'grid', 'final_classifier')
# Searched recursively by later cells for `*_eval.csv` / `*_by_group.csv` files.
performance_path = os.path.join(data_path, 'performance', 'final_classifier_grid')

In [4]:
# Figure directory under data_path; created if missing, and an already
# existing directory is not an error (exist_ok).
figure_path = os.path.join(data_path, 'figures')
os.makedirs(figure_path, exist_ok=True)

In [5]:
def path_split(x):
    """Split a file path into its non-empty components.

    Args:
        x: Path string, e.g. one returned by ``glob.glob``.

    Returns:
        list of str: The components of ``x`` in order. Empty components
        (from leading, trailing or repeated separators) are removed.
    """
    # Treat the platform's alternative separator (if any) like the primary one.
    if os.altsep:
        x = x.replace(os.altsep, os.sep)
    # Split on the separator itself, not on whitespace, so that directory or
    # file names containing spaces stay in one piece.
    return [part for part in x.split(os.sep) if part]

In [38]:
# Every `*_eval.csv` below performance_path, at any depth.
files_eval = glob.glob(os.path.join(performance_path, '**', '*_eval.csv'), recursive=True)

# One frame per file, keyed by the file path plus the three directory names
# directly above the file; those become the outcome, sensitive_variable and
# experiment_id columns below.
df_eval = (
    pd.concat({
        (file, *path_split(file)[-4:-1]): pd.read_csv(file)
        for file in files_eval
    })
    .rename_axis(index=['file_path', 'outcome', 'sensitive_variable', 'experiment_id', 'index'])
    .reset_index([0, 1, 2, 3])
)
# experiment_id comes from a directory name (a string); cast it to int.
df_eval['experiment_id'] = df_eval['experiment_id'].astype(int)

In [39]:
# Preview the first rows of the combined evaluation frame.
df_eval.head()

Unnamed: 0_level_0,file_path,outcome,sensitive_variable,experiment_id,metric,phase,epoch,performance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,data/performance/final_classifier_grid/los/age...,los,age,0,auc,val,0,0.840372
1,data/performance/final_classifier_grid/los/age...,los,age,0,auprc,val,0,0.587191
2,data/performance/final_classifier_grid/los/age...,los,age,0,brier,val,0,0.122572
3,data/performance/final_classifier_grid/los/age...,los,age,0,classification,val,0,0.397314
4,data/performance/final_classifier_grid/los/age...,los,age,0,classification_cf,val,0,0.264394


In [22]:
# Every `config.csv` below config_path, at any depth.
files_config = glob.glob(os.path.join(config_path, '**', 'config.csv'), recursive=True)

In [31]:
# One frame per config file, keyed by the file path plus the two directory
# names directly above it. The `id` column is read as int.
df_config = pd.concat({
    (file, *path_split(file)[-3:-1]): pd.read_csv(file, dtype={'id': int})
    for file in files_config
})

In [32]:
# Name the index levels created by the concat keys, rename `id` to
# `experiment_id`, then move the three key levels into ordinary columns.
# NOTE: this overwrites df_config, so the cell expects the freshly loaded
# frame (four index levels) and should be run once after the load cell.
df_config = (
    df_config
    .rename_axis(index=['file_path', 'outcome', 'sensitive_variable', 'index'])
    .rename(columns={'id': 'experiment_id'})
    .reset_index([0, 1, 2])
)

In [33]:
# Preview the first rows of the combined configuration frame.
df_config.head()

Unnamed: 0_level_0,file_path,outcome,sensitive_variable,experiment_id,cf_gradients,lambda_clp,lambda_clp_entropy,lambda_final_classifier_cf,lr_final_classifier,num_epochs,num_samples_eval,weighted
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,data/config/grid/final_classifier/los/age/conf...,los,age,0,False,0.0,0.0,10.0,0.001,20,1,False
1,data/config/grid/final_classifier/los/age/conf...,los,age,1,True,0.1,0.0,0.1,0.01,20,1,False
2,data/config/grid/final_classifier/los/age/conf...,los,age,2,True,0.1,0.0,1.0,0.01,20,1,False
3,data/config/grid/final_classifier/los/age/conf...,los,age,3,True,0.0,0.0,1.0,0.0001,20,1,False
4,data/config/grid/final_classifier/los/age/conf...,los,age,4,False,0.1,0.0,1.0,0.001,20,1,False


In [34]:
# Preview the first rows of df_eval (same call as in an earlier cell).
df_eval.head()

Unnamed: 0_level_0,file_path,outcome,sensitive_variable,experiment_id,metric,phase,epoch,performance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,data/performance/final_classifier_grid/los/age...,los,age,0,auc,val,0,0.840372
1,data/performance/final_classifier_grid/los/age...,los,age,0,auprc,val,0,0.587191
2,data/performance/final_classifier_grid/los/age...,los,age,0,brier,val,0,0.122572
3,data/performance/final_classifier_grid/los/age...,los,age,0,classification,val,0,0.397314
4,data/performance/final_classifier_grid/los/age...,los,age,0,classification_cf,val,0,0.264394


In [40]:
# Print the Python type of the first value in every column of both frames,
# e.g. to confirm that the columns later used as merge keys
# (experiment_id, outcome, sensitive_variable) have matching types.
# df_config is inspected here; the filtered df_config_temp is only defined
# further down the notebook and does not exist yet on a fresh kernel.
print(df_eval.apply(lambda x: type(x.array[0])))
print(df_config.apply(lambda x: type(x.array[0])))

file_path               <class 'str'>
outcome                 <class 'str'>
sensitive_variable      <class 'str'>
experiment_id           <class 'int'>
metric                  <class 'str'>
phase                   <class 'str'>
epoch                   <class 'int'>
performance           <class 'float'>
dtype: object
file_path                       <class 'str'>
outcome                         <class 'str'>
sensitive_variable              <class 'str'>
experiment_id                   <class 'int'>
cf_gradients                   <class 'bool'>
lambda_clp                    <class 'float'>
lambda_clp_entropy            <class 'float'>
lambda_final_classifier_cf    <class 'float'>
lr_final_classifier           <class 'float'>
num_epochs                      <class 'int'>
num_samples_eval                <class 'int'>
weighted                       <class 'bool'>
dtype: object


In [108]:
# Configurations to report: cf_gradients off, lambda_final_classifier_cf = 1.0
# and lr_final_classifier = 1e-3.
df_config_temp = df_config.loc[
    (df_config.cf_gradients == False) &
    (df_config.lambda_final_classifier_cf == 1.0) &
    (df_config.lr_final_classifier == 1e-3)
]

# Attach the evaluation rows of those experiments and keep test-phase rows only.
df_summary = pd.merge(df_eval, df_config_temp, how='inner',
                      on=['experiment_id', 'outcome', 'sensitive_variable'])
df_summary = df_summary.loc[df_summary.phase == 'test']

# Identity aggregation: passes the performance value of each
# (outcome, sensitive_variable, metric, lambda_clp) group through unchanged.
# Presumably each group holds exactly one row -- TODO confirm.
temp = (
    df_summary
    .groupby(['outcome', 'sensitive_variable', 'metric', 'lambda_clp'])[['performance']]
    .agg(lambda x: x)
    .reset_index()
)

# One metric x lambda_clp table per (outcome, sensitive_variable).
# pivot() arguments are passed by keyword: they are keyword-only in
# pandas >= 2.0, where a positional call raises TypeError.
temp = (
    temp
    .groupby(['outcome', 'sensitive_variable'])
    .apply(lambda x: x.pivot(index='metric', columns='lambda_clp', values='performance'))
)

# Keep only the metrics that go into the table.
temp = temp[temp.index.get_level_values('metric').isin(['auc', 'auprc', 'brier', 'clp'])]

display(temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,lambda_clp,0.0,0.01,0.1,1.0,10.0
outcome,sensitive_variable,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
los,age,auc,0.839281,0.83944,0.837327,0.838033,0.836906
los,age,auprc,0.564808,0.567654,0.564719,0.571301,0.569843
los,age,brier,0.120617,0.120257,0.120731,0.119884,0.119177
los,age,clp,0.225489,0.133754,0.046074,0.003846,7.8e-05
los,gender,auc,0.843952,0.843763,0.842846,0.839208,0.838614
los,gender,auprc,0.574549,0.571456,0.571536,0.568054,0.57109
los,gender,brier,0.118158,0.118805,0.118338,0.118249,0.117402
los,gender,clp,0.187826,0.09741,0.023434,0.001006,5.4e-05
los,race_eth,auc,0.843057,0.842709,0.843434,0.844095,0.84436
los,race_eth,auprc,0.560884,0.562984,0.565923,0.564842,0.562695


### baseline

In [143]:
# Baseline results live under <data_path>/performance/baseline_default.
performance_path_baseline = os.path.join(data_path, 'performance', 'baseline_default')
files_baseline = glob.glob(os.path.join(performance_path_baseline, '**', '*_eval.csv'), recursive=True)

df_baseline = (
    # Key each file by its path and the two directory names directly above it.
    pd.concat({
        (file, *path_split(file)[-3:-1]): pd.read_csv(file)
        for file in files_baseline
    })
    .rename_axis(index=['file_path', 'outcome', 'experiment_id', 'index'])
    # Discard every index level except `outcome`, which stays as the index.
    .reset_index([0, 2, 3], drop=True)
    # Test-phase rows only; epoch/phase are not needed afterwards.
    .loc[lambda frame: frame.phase == 'test']
    .drop(columns=['epoch', 'phase'])
    .rename(columns={'performance': 'baseline'})
)
display(df_baseline)

Unnamed: 0_level_0,metric,baseline
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
los,auc,0.850835
los,auprc,0.581952
los,brier,0.115485
los,loss,0.365094
mortality,auc,0.892904
mortality,auprc,0.267086
mortality,brier,0.020567
mortality,loss,0.09496


In [162]:
# Display the current contents of temp.
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,lambda_clp,0.0,0.01,0.1,1.0,10.0
outcome,sensitive_variable,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
los,age,auc,0.839281,0.83944,0.837327,0.838033,0.836906
los,age,auprc,0.564808,0.567654,0.564719,0.571301,0.569843
los,age,brier,0.120617,0.120257,0.120731,0.119884,0.119177
los,age,clp,0.225489,0.133754,0.046074,0.003846,7.8e-05
los,gender,auc,0.843952,0.843763,0.842846,0.839208,0.838614
los,gender,auprc,0.574549,0.571456,0.571536,0.568054,0.57109
los,gender,brier,0.118158,0.118805,0.118338,0.118249,0.117402
los,gender,clp,0.187826,0.09741,0.023434,0.001006,5.4e-05
los,race_eth,auc,0.843057,0.842709,0.843434,0.844095,0.84436
los,race_eth,auprc,0.560884,0.562984,0.565923,0.564842,0.562695


In [173]:
# Left-join the baseline onto the pivoted summary. No `on` is given, so pandas
# joins on the column names the two frames share; summary rows without a
# matching baseline row keep NaN in `baseline`.
df = (
    temp.reset_index([0, 1, 2])
    .merge(df_baseline.reset_index(), how='left')
    .set_index(['outcome', 'sensitive_variable', 'metric'])
    # Baseline first, then one column per lambda_clp value.
    .loc[:, ['baseline', 0.0, 0.01, 0.1, 1.0, 10.0]]
)
display(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,baseline,0.0,0.01,0.1,1.0,10.0
outcome,sensitive_variable,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
los,age,auc,0.850835,0.839281,0.83944,0.837327,0.838033,0.836906
los,age,auprc,0.581952,0.564808,0.567654,0.564719,0.571301,0.569843
los,age,brier,0.115485,0.120617,0.120257,0.120731,0.119884,0.119177
los,age,clp,,0.225489,0.133754,0.046074,0.003846,7.8e-05
los,gender,auc,0.850835,0.843952,0.843763,0.842846,0.839208,0.838614
los,gender,auprc,0.581952,0.574549,0.571456,0.571536,0.568054,0.57109
los,gender,brier,0.115485,0.118158,0.118805,0.118338,0.118249,0.117402
los,gender,clp,,0.187826,0.09741,0.023434,0.001006,5.4e-05
los,race_eth,auc,0.850835,0.843057,0.842709,0.843434,0.844095,0.84436
los,race_eth,auprc,0.581952,0.560884,0.562984,0.565923,0.564842,0.562695


In [175]:
# Directory for the exported LaTeX tables; created if missing.
table_path = './latex_tables'
os.makedirs(table_path, exist_ok=True)

# Write the summary table as LaTeX, formatting floats with '%.3g'.
with open(os.path.join(table_path, 'group_performance.txt'), 'w') as fp:
    df.to_latex(fp, float_format='%.3g')

### by group

In [259]:
# Every `*_by_group.csv` below performance_path, at any depth.
files_by_group = glob.glob(os.path.join(performance_path, '**', '*_by_group.csv'), recursive=True)

df_by_group = (
    # Key each file by its path and the three directory names directly above it.
    pd.concat({
        (file, *path_split(file)[-4:-1]): pd.read_csv(file)
        for file in files_by_group
    })
    # Drop the CSV's own sensitive_variable column: an index level with that
    # name is turned into a column below and the two would collide.
    .drop(columns='sensitive_variable')
    .rename_axis(index=['file_path', 'outcome', 'sensitive_variable', 'experiment_id', 'index'])
    .reset_index([0, 1, 2, 3])
)
# experiment_id comes from a directory name (a string); cast it to int.
df_by_group['experiment_id'] = df_by_group['experiment_id'].astype(int)

In [260]:
# Preview the first rows of the combined per-group evaluation frame.
df_by_group.head()

Unnamed: 0_level_0,file_path,outcome,sensitive_variable,experiment_id,group,metric,phase,epoch,performance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,data/performance/final_classifier_grid/los/age...,los,age,0,0,auc,val,0,0.887418
1,data/performance/final_classifier_grid/los/age...,los,age,0,0,auprc,val,0,0.64211
2,data/performance/final_classifier_grid/los/age...,los,age,0,0,brier,val,0,0.09824
3,data/performance/final_classifier_grid/los/age...,los,age,0,0,classification,val,0,0.31494
4,data/performance/final_classifier_grid/los/age...,los,age,0,0,loss,val,0,0.31494


In [261]:
# Configurations to report: cf_gradients on, lambda_final_classifier_cf = 1.0
# and lr_final_classifier = 1e-3.
# NOTE(review): the overall-performance cell earlier in the notebook selects
# cf_gradients == False with otherwise identical filters -- confirm that the
# difference is intended.
df_config_temp = df_config.loc[
    (df_config.cf_gradients == True) &
    (df_config.lambda_final_classifier_cf == 1.0) &
    (df_config.lr_final_classifier == 1e-3)
]

# Attach the per-group evaluation rows of those experiments and keep
# test-phase rows only.
df_summary = pd.merge(df_by_group, df_config_temp,
                      how='inner',
                      on=['experiment_id', 'outcome', 'sensitive_variable'])
df_summary = df_summary.loc[df_summary.phase == 'test']

# Identity aggregation: passes the performance value of each
# (outcome, sensitive_variable, metric, group, lambda_clp) group through
# unchanged. Presumably each group holds exactly one row -- TODO confirm.
temp = (
    df_summary
    .groupby(['outcome', 'sensitive_variable', 'metric', 'group', 'lambda_clp'])[['performance']]
    .agg(lambda x: x)
    .reset_index()
)

# One metric x lambda_clp table per (outcome, sensitive_variable, group).
# pivot() arguments are passed by keyword: they are keyword-only in
# pandas >= 2.0, where a positional call raises TypeError.
temp = (
    temp
    .groupby(['outcome', 'sensitive_variable', 'group'])
    .apply(lambda x: x.pivot(index='metric', columns='lambda_clp', values='performance'))
)

# Keep only the metrics that go into the table.
temp = temp[temp.index.get_level_values('metric').isin(['auc', 'auprc', 'brier', 'clp'])]

display(temp)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lambda_clp,0.0,0.01,0.1,1.0,10.0
outcome,sensitive_variable,group,metric,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
los,age,0,auc,0.879818,0.869902,0.862532,0.872007,0.873057
los,age,0,auprc,0.622410,0.614965,0.588737,0.617355,0.607218
los,age,0,brier,0.098488,0.102041,0.104650,0.100918,0.103416
los,age,1,auc,0.869006,0.867848,0.866198,0.873195,0.871014
los,age,1,auprc,0.549133,0.557163,0.545393,0.560247,0.557220
los,age,1,brier,0.090981,0.090550,0.091069,0.087456,0.089452
los,age,2,auc,0.838456,0.838895,0.839321,0.836774,0.838227
los,age,2,auprc,0.587777,0.584016,0.579474,0.577226,0.591799
los,age,2,brier,0.129876,0.130763,0.128887,0.129432,0.129194
los,age,3,auc,0.801292,0.802314,0.804272,0.805051,0.806124


In [262]:
## Load the baseline data
performance_path_baseline = os.path.join(data_path, 'performance', 'baseline_default')
files_baseline = glob.glob(os.path.join(performance_path_baseline, '**', '*_by_group.csv'), recursive=True)

# NOTE: this replaces the df_baseline built from the `*_eval.csv` files.
df_baseline = (
    # Key each file by its path and the two directory names directly above it.
    pd.concat({
        (file, *path_split(file)[-3:-1]): pd.read_csv(file)
        for file in files_baseline
    })
    .rename_axis(index=['file_path', 'outcome', 'experiment_id', 'index'])
    # Move all four index levels into columns.
    .reset_index([0, 1, 2, 3])
    # Test-phase rows only.
    .loc[lambda frame: frame.phase == 'test']
    # Drop bookkeeping columns that are not needed for the comparison.
    .drop(columns=['epoch', 'phase', 'experiment_id', 'file_path', 'index'])
    .rename(columns={'performance': 'baseline'})
)
display(df_baseline)

Unnamed: 0,outcome,sensitive_variable,group,metric,baseline
4,los,age,0,auc,0.885375
5,los,age,0,auprc,0.607936
6,los,age,0,brier,0.098003
7,los,age,0,loss,0.345627
12,los,age,1,auc,0.881584
13,los,age,1,auprc,0.544652
14,los,age,1,brier,0.086962
15,los,age,1,loss,0.294602
20,los,age,2,auc,0.848664
21,los,age,2,auprc,0.606433


In [263]:
# Join the per-group baseline onto the pivoted table. This is pandas' default
# inner join on the columns both frames share, so rows without a baseline
# match are dropped.
temp = (
    temp.reset_index()
    .merge(df_baseline)
    .set_index(['outcome', 'sensitive_variable', 'group', 'metric'])
)

In [264]:
# Display the current contents of temp.
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0.0,0.01,0.1,1.0,10.0,baseline
outcome,sensitive_variable,group,metric,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
los,age,0,auc,0.879818,0.869902,0.862532,0.872007,0.873057,0.885375
los,age,0,auprc,0.622410,0.614965,0.588737,0.617355,0.607218,0.607936
los,age,0,brier,0.098488,0.102041,0.104650,0.100918,0.103416,0.098003
los,age,1,auc,0.869006,0.867848,0.866198,0.873195,0.871014,0.881584
los,age,1,auprc,0.549133,0.557163,0.545393,0.560247,0.557220,0.544652
los,age,1,brier,0.090981,0.090550,0.091069,0.087456,0.089452,0.086962
los,age,2,auc,0.838456,0.838895,0.839321,0.836774,0.838227,0.848664
los,age,2,auprc,0.587777,0.584016,0.579474,0.577226,0.591799,0.606433
los,age,2,brier,0.129876,0.130763,0.128887,0.129432,0.129194,0.123383
los,age,3,auc,0.801292,0.802314,0.804272,0.805051,0.806124,0.816585


In [265]:
## Load group maps
label_path = os.path.join(data_path, 'labels')

# Read `<key>_map.csv` for each sensitive variable and stack the frames. The
# concat key becomes the `sensitive_variable` index; the per-file row index is
# discarded. `category_id` / `categories` are renamed to `group` /
# `group_name`.
group_map_df = (
    pd.concat({
        key: pd.read_csv(os.path.join(label_path, '{}_map.csv'.format(key)))
        for key in ['age', 'gender', 'race_eth']
    })
    .reset_index(1, drop=True)
    .rename_axis(index='sensitive_variable')
    .rename(columns={'category_id': 'group', 'categories': 'group_name'})
)

In [266]:
# Display the combined group-id to group-name mapping.
group_map_df

Unnamed: 0_level_0,group,group_name
sensitive_variable,Unnamed: 1_level_1,Unnamed: 2_level_1
age,0,"[18, 30)"
age,1,"[30, 45)"
age,2,"[45, 65)"
age,3,"[65, 89)"
gender,0,Female
gender,1,Male
gender,2,Other
race_eth,0,Asian
race_eth,1,Black
race_eth,2,Hispanic


In [267]:
# Attach group names (left join on the columns both frames share), index by
# group_name instead of the numeric group id, and put the baseline column first.
by_group_df = (
    temp.reset_index()
    .merge(group_map_df.reset_index(), how='left')
    .set_index(['outcome', 'sensitive_variable', 'group_name', 'metric'])
    .drop(columns='group')
    .loc[:, ['baseline', 0.0, 0.01, 0.1, 1.0, 10.0]]
)

# Full table, all outcomes.
with open(os.path.join(table_path, 'by_group_performance.txt'), 'w') as fp:
    by_group_df.to_latex(fp, float_format='%.3g')

# One table per outcome, with the (then constant) outcome level removed.
# `temp` is reassigned on each pass and ends up holding the mortality subset.
for outcome_name, file_name in (
    ('los', 'by_group_performance_los.txt'),
    ('mortality', 'by_group_performance_mortality.txt'),
):
    is_outcome = by_group_df.index.get_level_values('outcome') == outcome_name
    temp = by_group_df.loc[is_outcome].reset_index(0, drop=True)
    with open(os.path.join(table_path, file_name), 'w') as fp:
        temp.to_latex(fp, float_format='%.3g')