In [1]:
import os
import pandas as pd
import glob
import yaml
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Base directory; every other path in this notebook is joined onto it.
data_path = 'data'
# Searched recursively further down for '*_eval.csv' and '*_cf_df.csv' files.
performance_path = os.path.join(data_path, *('performance', 'final_classifier_grid'))
# Searched recursively further down for 'config.csv' files.
config_path = os.path.join(data_path, *('config', 'grid', 'final_classifier'))

In [3]:
# Folder that receives the saved figures; exist_ok keeps this cell safe to re-run.
figure_path = os.path.join(data_path, 'figures')
os.makedirs(name=figure_path, exist_ok=True)

In [4]:
def path_split(x):
    """Split a file path into its non-empty components.

    Args:
        x: Path string, using '/' or the platform separator (``os.sep``).

    Returns:
        List of path components in order. Empty components produced by
        leading, trailing or repeated separators are dropped.

    Components are split on separators only, so a directory or file name that
    contains whitespace stays in one piece and negative indexing from the end
    of the result keeps pointing at the intended directory level.
    """
    # Normalise the platform separator so paths returned by glob on any OS split the same way.
    normalized = x.replace(os.sep, '/')
    return [part for part in normalized.split('/') if part]

In [5]:
# Every '*_eval.csv' beneath the performance directory, at any depth.
files_eval = glob.glob(os.path.join(performance_path, '**', '*_eval.csv'), recursive=True)

# Each frame is keyed by (path, outcome, sensitive_variable, experiment_id); the
# last three come from the three directory names directly above the file.
# concat turns the tuple keys into index levels, which are then named and moved
# into ordinary columns.
df_eval = (
    pd.concat({
        (csv_path, parts[-4], parts[-3], parts[-2]): pd.read_csv(csv_path)
        for csv_path, parts in zip(files_eval, map(path_split, files_eval))
    })
    .rename_axis(index=['file_path', 'outcome', 'sensitive_variable', 'experiment_id', 'index'])
    .reset_index(level=[0, 1, 2, 3])
)

In [6]:
# Preview the first five rows of the evaluation table.
df_eval.head(n=5)

Unnamed: 0_level_0,file_path,outcome,sensitive_variable,experiment_id,metric,phase,epoch,performance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,data/performance/final_classifier_grid/los/age...,los,age,0,auc,val,0,0.840372
1,data/performance/final_classifier_grid/los/age...,los,age,0,auprc,val,0,0.587191
2,data/performance/final_classifier_grid/los/age...,los,age,0,brier,val,0,0.122572
3,data/performance/final_classifier_grid/los/age...,los,age,0,classification,val,0,0.397314
4,data/performance/final_classifier_grid/los/age...,los,age,0,classification_cf,val,0,0.264394


In [7]:
# Every 'config.csv' beneath the config directory, at any depth.
config_pattern = os.path.join(config_path, '**', 'config.csv')
files_config = glob.glob(config_pattern, recursive=True)

In [8]:
# Display the config files matched by the glob above.
files_config

['data/config/grid/final_classifier/los/age/config.csv',
 'data/config/grid/final_classifier/los/race_eth/config.csv',
 'data/config/grid/final_classifier/los/gender/config.csv',
 'data/config/grid/final_classifier/mortality/age/config.csv',
 'data/config/grid/final_classifier/mortality/race_eth/config.csv',
 'data/config/grid/final_classifier/mortality/gender/config.csv']

In [9]:
# Stack the config tables, keyed by (path, grandparent directory, parent directory)
# of each file; the keys become the outer index levels of the result.
df_config = pd.concat({
    (csv_path, parts[-3], parts[-2]): pd.read_csv(csv_path)
    for csv_path, parts in zip(files_config, map(path_split, files_config))
})

In [10]:
# Name the index levels created by concat, rename 'id' to 'experiment_id',
# then move the three outer index levels into regular columns.
df_config = (
    df_config
    .rename_axis(index=['file_path', 'outcome', 'sensitive_variable', 'index'])
    .rename(columns={'id': 'experiment_id'})
    .reset_index(level=[0, 1, 2])
)

In [11]:
# Preview the first five rows of the config table.
df_config.head(n=5)

Unnamed: 0_level_0,file_path,outcome,sensitive_variable,experiment_id,cf_gradients,lambda_clp,lambda_clp_entropy,lambda_final_classifier_cf,lr_final_classifier,num_epochs,num_samples_eval,weighted
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,data/config/grid/final_classifier/los/age/conf...,los,age,0,False,0.0,0.0,10.0,0.001,20,1,False
1,data/config/grid/final_classifier/los/age/conf...,los,age,1,True,0.1,0.0,0.1,0.01,20,1,False
2,data/config/grid/final_classifier/los/age/conf...,los,age,2,True,0.1,0.0,1.0,0.01,20,1,False
3,data/config/grid/final_classifier/los/age/conf...,los,age,3,True,0.0,0.0,1.0,0.0001,20,1,False
4,data/config/grid/final_classifier/los/age/conf...,los,age,4,False,0.1,0.0,1.0,0.001,20,1,False


In [12]:
# Every '*_cf_df.csv' beneath the performance directory, at any depth.
files_preds = glob.glob(os.path.join(performance_path, '**', '*_cf_df.csv'), recursive=True)

# Key each frame by its path plus the three directory names directly above the
# file; concat turns those tuple keys into the outer index levels.
df_preds = pd.concat({
    (csv_path, parts[-4], parts[-3], parts[-2]): pd.read_csv(csv_path)
    for csv_path, parts in zip(files_preds, map(path_split, files_preds))
})

In [13]:
# Preview the first five rows of the concatenated prediction table.
df_preds.head(n=5)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,phase,y,y_cf,group,group_cf,pred_prob_factual,pred_prob_cf,output_factual,output_cf,pred_diff
data/performance/final_classifier_grid/los/age/0/1552943500.6546397_cf_df.csv,los,age,0,0,test,0,0,3,0,0.09701,0.056446,-2.022694,-2.221776,-0.040564
data/performance/final_classifier_grid/los/age/0/1552943500.6546397_cf_df.csv,los,age,0,1,test,0,0,3,0,0.015053,0.011752,-2.604437,-2.693174,-0.0033
data/performance/final_classifier_grid/los/age/0/1552943500.6546397_cf_df.csv,los,age,0,2,test,0,0,2,0,0.012044,0.008748,-2.586944,-2.722506,-0.003296
data/performance/final_classifier_grid/los/age/0/1552943500.6546397_cf_df.csv,los,age,0,3,test,0,0,2,0,0.201651,0.148533,-1.633199,-1.772977,-0.053118
data/performance/final_classifier_grid/los/age/0/1552943500.6546397_cf_df.csv,los,age,0,4,test,1,1,2,0,0.902797,0.904435,-0.011167,0.013584,0.001638


In [14]:
# Independent (deep) copy of the prediction table, taken before df_preds is reshaped.
df_preds_copy = df_preds.copy()

In [15]:
# Name the index levels created by concat, then move the four outer levels into
# regular columns so they can be filtered and merged on.
df_preds = (
    df_preds
    .rename_axis(index=['file_path', 'outcome', 'sensitive_variable', 'experiment_id', 'index'])
    .reset_index(level=[0, 1, 2, 3])
)

In [16]:
# experiment_id was taken from a directory name, so it is text at this point;
# cast it to integer before it is used as a merge key against the config table.
df_preds = df_preds.astype({'experiment_id': 'int'})

In [17]:
# Preview the first five rows of the reshaped prediction table.
df_preds.head(n=5)

Unnamed: 0_level_0,file_path,outcome,sensitive_variable,experiment_id,phase,y,y_cf,group,group_cf,pred_prob_factual,pred_prob_cf,output_factual,output_cf,pred_diff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,data/performance/final_classifier_grid/los/age...,los,age,0,test,0,0,3,0,0.09701,0.056446,-2.022694,-2.221776,-0.040564
1,data/performance/final_classifier_grid/los/age...,los,age,0,test,0,0,3,0,0.015053,0.011752,-2.604437,-2.693174,-0.0033
2,data/performance/final_classifier_grid/los/age...,los,age,0,test,0,0,2,0,0.012044,0.008748,-2.586944,-2.722506,-0.003296
3,data/performance/final_classifier_grid/los/age...,los,age,0,test,0,0,2,0,0.201651,0.148533,-1.633199,-1.772977,-0.053118
4,data/performance/final_classifier_grid/los/age...,los,age,0,test,1,1,2,0,0.902797,0.904435,-0.011167,0.013584,0.001638


In [18]:
# Keep only test-phase predictions for the 'los' outcome with 'age' as the sensitive variable.
df_preds_temp = df_preds.loc[
    df_preds['sensitive_variable'].eq('age')
    & df_preds['outcome'].eq('los')
    & df_preds['phase'].eq('test')
]

In [19]:
# Pin every hyper-parameter except lambda_clp to a single setting, so the
# selected experiments differ only in the columns left unconstrained here.
df_config_temp = df_config.loc[
    df_config['cf_gradients'].eq(True)
    & df_config['lambda_final_classifier_cf'].eq(1.0)
    & df_config['lr_final_classifier'].eq(1e-3)
    & df_config['weighted'].eq(False)
]

In [20]:
# Attach the hyper-parameters to each prediction row; the inner join drops
# predictions whose experiment is not in the selected config subset.
temp = df_preds_temp.merge(
    df_config_temp,
    how='inner',
    on=['experiment_id', 'outcome', 'sensitive_variable'],
)

In [21]:
# Preview the first five rows of the merged predictions/config table.
temp.head(n=5)

Unnamed: 0,file_path_x,outcome,sensitive_variable,experiment_id,phase,y,y_cf,group,group_cf,pred_prob_factual,...,pred_diff,file_path_y,cf_gradients,lambda_clp,lambda_clp_entropy,lambda_final_classifier_cf,lr_final_classifier,num_epochs,num_samples_eval,weighted
0,data/performance/final_classifier_grid/los/age...,los,age,100,test,0,0,3,0,0.041154,...,-0.01183,data/config/grid/final_classifier/los/age/conf...,True,0.01,0.0,1.0,0.001,20,1,False
1,data/performance/final_classifier_grid/los/age...,los,age,100,test,0,0,3,0,0.017358,...,-0.001145,data/config/grid/final_classifier/los/age/conf...,True,0.01,0.0,1.0,0.001,20,1,False
2,data/performance/final_classifier_grid/los/age...,los,age,100,test,0,0,2,0,0.008716,...,-0.001161,data/config/grid/final_classifier/los/age/conf...,True,0.01,0.0,1.0,0.001,20,1,False
3,data/performance/final_classifier_grid/los/age...,los,age,100,test,0,0,2,0,0.164062,...,-0.053136,data/config/grid/final_classifier/los/age/conf...,True,0.01,0.0,1.0,0.001,20,1,False
4,data/performance/final_classifier_grid/los/age...,los,age,100,test,1,1,2,0,0.91732,...,-0.012163,data/config/grid/final_classifier/los/age/conf...,True,0.01,0.0,1.0,0.001,20,1,False


In [22]:
# Restrict to rows where the factual and counterfactual labels agree and are both 0.
temp = temp.loc[temp['y'].eq(temp['y_cf']) & temp['y'].eq(0)]

In [23]:
def draw_line(data = None, color = None, label = None):
    """Draw a dashed vertical reference line at x = 0 on the current axes.

    All three parameters are accepted and ignored, which lets the function be
    passed to ``FacetGrid.map`` as a per-facet callback.
    """
    current_axes = plt.gca()
    current_axes.axvline(x = 0, linestyle = 'dashed')

In [None]:
with sns.plotting_context('notebook', font_scale=2):
    # One panel per (group, group_cf) pair, one density curve per lambda_clp value.
    g = sns.FacetGrid(temp,
                      hue = 'lambda_clp',
                      row = "group",
                      col = "group_cf",
                      sharey = 'row',
                      ylim = (0, 20),
                      height = 4,
                      aspect = 2
                     )
    # Pass kdeplot itself: FacetGrid.map calls the plotting function with color=
    # and label= keywords, which a one-argument lambda cannot accept, and
    # hist_kws is not a kdeplot option. This matches the grid loop further down.
    g.map(sns.kdeplot, 'pred_diff').map(draw_line).add_legend()
    # Leave headroom at the top of the figure.
    plt.subplots_adjust(top = 0.9)
    g.set_xlabels('Relative difference in predicted p(Y)')
    g.set_ylabels('Density')
    g.savefig(os.path.join(figure_path, 'dist_plot_{}.png'.format('test')), dpi = 90)

In [25]:
# One group of predictions per (outcome, sensitive_variable, y, y_cf) combination.
df_group = df_preds.groupby(by=['outcome', 'sensitive_variable', 'y', 'y_cf'])

In [26]:
# One group of experiments per (cf_gradients, lambda_final_classifier_cf, lr_final_classifier) setting.
df_config_group = df_config.groupby(
    by=['cf_gradients', 'lambda_final_classifier_cf', 'lr_final_classifier']
)

In [27]:
# Print one identifier per config group: the group key values joined with underscores.
for config_key, _config_rows in df_config_group:
    print('_'.join(map(str, config_key)))

False_0.0_0.0001
False_0.0_0.001
False_0.0_0.01
False_0.1_0.0001
False_0.1_0.001
False_0.1_0.01
False_1.0_0.0001
False_1.0_0.001
False_1.0_0.01
False_10.0_0.0001
False_10.0_0.001
False_10.0_0.01
True_0.0_0.0001
True_0.0_0.001
True_0.0_0.01
True_0.1_0.0001
True_0.1_0.001
True_0.1_0.01
True_1.0_0.0001
True_1.0_0.001
True_1.0_0.01
True_10.0_0.0001
True_10.0_0.001
True_10.0_0.01


In [None]:
with sns.plotting_context('notebook', font_scale=2):
    # group_id is the groupby key of df_group: (outcome, sensitive_variable, y, y_cf).
    for group_id, small_df in df_group:
        for config_key, config_df in df_config_group:
            # Folder-safe identifier for this hyper-parameter combination.
            config_id = '_'.join(str(x) for x in config_key)
            print(group_id, config_id)
            temp_df = pd.merge(small_df, config_df, how = 'inner', on = ['experiment_id', 'outcome', 'sensitive_variable'])
            if temp_df.empty:
                # Nothing matched this combination, so there is nothing to facet or plot.
                continue
            # One panel per (group, group_cf) pair, one density curve per lambda_clp value.
            g = sns.FacetGrid(temp_df,
                              hue = 'lambda_clp',
                              row = "group",
                              col = "group_cf",
                              sharey = 'row',
                              ylim = (0, 20),
                              height = 4,
                              aspect = 2
                             )
            g.map(sns.kdeplot, 'pred_diff').map(draw_line).add_legend()
            plt.subplots_adjust(top = 0.9)
            g.set_xlabels('Relative difference in predicted p(Y)')
            g.set_ylabels('Density')
            # Figures are filed as <figure_path>/<outcome>/<sensitive_variable>/<config_id>/<y>_<y_cf>.png
            temp_fig_path = os.path.join(figure_path, group_id[0], group_id[1], config_id)
            os.makedirs(temp_fig_path, exist_ok = True)
            g.savefig(os.path.join(temp_fig_path, '{}_{}.png'.format(group_id[2], group_id[3])), dpi = 90)
            # Close this grid's own figure so figures do not pile up in memory across iterations.
            plt.close(g.fig)

('los', 'age', 0, 0) False_0.0_0.0001
('los', 'age', 0, 0) False_0.0_0.001
('los', 'age', 0, 0) False_0.0_0.01
('los', 'age', 0, 0) False_0.1_0.0001
('los', 'age', 0, 0) False_0.1_0.001
('los', 'age', 0, 0) False_0.1_0.01
('los', 'age', 0, 0) False_1.0_0.0001
('los', 'age', 0, 0) False_1.0_0.001
('los', 'age', 0, 0) False_1.0_0.01
('los', 'age', 0, 0) False_10.0_0.0001
('los', 'age', 0, 0) False_10.0_0.001
('los', 'age', 0, 0) False_10.0_0.01
('los', 'age', 0, 0) True_0.0_0.0001
('los', 'age', 0, 0) True_0.0_0.001
('los', 'age', 0, 0) True_0.0_0.01
('los', 'age', 0, 0) True_0.1_0.0001
('los', 'age', 0, 0) True_0.1_0.001
('los', 'age', 0, 0) True_0.1_0.01
('los', 'age', 0, 0) True_1.0_0.0001
('los', 'age', 0, 0) True_1.0_0.001
('los', 'age', 0, 0) True_1.0_0.01
('los', 'age', 0, 0) True_10.0_0.0001
('los', 'age', 0, 0) True_10.0_0.001
('los', 'age', 0, 0) True_10.0_0.01
('los', 'age', 0, 1) False_0.0_0.0001
('los', 'age', 0, 1) False_0.0_0.001
('los', 'age', 0, 1) False_0.0_0.01
('los',

('los', 'race_eth', 0, 0) True_10.0_0.0001
('los', 'race_eth', 0, 0) True_10.0_0.001
('los', 'race_eth', 0, 0) True_10.0_0.01
('los', 'race_eth', 0, 1) False_0.0_0.0001
('los', 'race_eth', 0, 1) False_0.0_0.001
('los', 'race_eth', 0, 1) False_0.0_0.01
('los', 'race_eth', 0, 1) False_0.1_0.0001
('los', 'race_eth', 0, 1) False_0.1_0.001
('los', 'race_eth', 0, 1) False_0.1_0.01
('los', 'race_eth', 0, 1) False_1.0_0.0001
('los', 'race_eth', 0, 1) False_1.0_0.001
('los', 'race_eth', 0, 1) False_1.0_0.01
('los', 'race_eth', 0, 1) False_10.0_0.0001
('los', 'race_eth', 0, 1) False_10.0_0.001
('los', 'race_eth', 0, 1) False_10.0_0.01
('los', 'race_eth', 0, 1) True_0.0_0.0001
('los', 'race_eth', 0, 1) True_0.0_0.001
('los', 'race_eth', 0, 1) True_0.0_0.01
('los', 'race_eth', 0, 1) True_0.1_0.0001
('los', 'race_eth', 0, 1) True_0.1_0.001
('los', 'race_eth', 0, 1) True_0.1_0.01
('los', 'race_eth', 0, 1) True_1.0_0.0001
('los', 'race_eth', 0, 1) True_1.0_0.001
('los', 'race_eth', 0, 1) True_1.0_0.0