In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import glob
import os
import matplotlib.pyplot as plt
import shutil
import sys
sys.path.append('../')
from prediction_utils.util import df_dict_concat, yaml_read, yaml_write
from pathlib import Path
from tqdm import tqdm
import json
import hashlib
import pickle

# Notebook display limits: never truncate columns, cap printed rows at 100.
for option_name, option_value in (('display.max_columns', None), ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

In [None]:
# Results root; it is searched recursively for per-run result files further down.
project_dir = Path("/scratch/hdd001/home/haoran") / "stanford_robustness" / "results"

In [None]:
# Build (or reload from the local pickle cache) `df_all`: the concatenated
# evaluation results of every run found under `project_dir`, with each run's
# rows annotated with its configuration, a hyperparameter hash (`hparams_id`)
# and an experiment label (`exp`).
force_reload = False  # set to True to ignore the cache and rescan project_dir
pkl_path = Path('df_all.pkl')
hparams = ['lr', 'num_hidden', 'drop_prob', 'hidden_dim', 'model_type', 'adjustment_scale', 'lr_lambda']
if pkl_path.exists() and not force_reload:
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    df_all = pd.read_pickle(pkl_path)
else:
    # Columns whose first-row values are hashed into `hparams_id`. `fold_id`
    # and `config_filename` are left out, so runs that differ only in those
    # share one id.
    id_columns = hparams + ['task', 'group_objective_type', 'selection_metric',
                            'balance_groups', 'sensitive_attribute', 'group_objective_metric',
                            'subset_attribute', 'subset_group']
    res = []
    for i in tqdm(project_dir.glob('**/result_df_group_standard_eval.parquet')):
        df_i = pd.read_parquet(i)
        # The run configuration sits next to the result file. The context
        # manager closes the handle instead of leaking one per run.
        with (i.parent / 'args.json').open('r') as args_file:
            args_i = json.load(args_file)

        # Task name = grandparent directory name minus its last three
        # characters, plus the label column.
        # NOTE(review): presumably strips a fixed 3-character suffix — confirm.
        args_i['task'] = i.parent.parent.name[:-3] + '_' + args_i['label_col']
        # Collapse the two eICU length-of-stay variants onto a common prefix.
        if args_i['task'] == 'eICUlos3_los3':
            args_i['task'] = 'eICUlos_los3'
        elif args_i['task'] == 'eICUlos7_los7':
            args_i['task'] = 'eICUlos_los7'

        args_i['config_filename'] = i.parent.relative_to(project_dir)

        # Broadcast scalar configuration values onto every row of this run.
        # List/tuple-valued settings are skipped.
        for j in ['task', 'config_filename', 'group_objective_type', 'selection_metric',
                  'balance_groups', 'sensitive_attribute', 'fold_id', 'group_objective_metric',
                  'subset_attribute', 'subset_group']:
            if not isinstance(args_i[j], (list, tuple)):
                df_i[j] = args_i[j]

        for hparam in hparams:
            if hparam in args_i:
                df_i[hparam] = args_i[hparam]
            elif hparam == 'adjustment_scale':
                # Runs without this argument still need the column for the hash below.
                df_i[hparam] = None

        # Stable identifier of the hyperparameter/experiment setting.
        id_values = df_i[id_columns].iloc[0].values.tolist()
        df_i['hparams_id'] = hashlib.md5(str(id_values).encode('utf-8')).hexdigest()

        # Label the experiment family. Branch order matters: the first match wins.
        if (not args_i['balance_groups'] and args_i['selection_metric'] == 'loss'
                and pd.isnull(args_i['subset_attribute'])
                and args_i['sensitive_attribute'] == 'gender'
                and args_i['group_objective_type'] == 'standard'):
            df_i['exp'] = 'erm_baseline'
        elif not pd.isnull(args_i['subset_attribute']) and args_i['group_objective_type'] == 'standard':
            df_i['exp'] = 'erm_subset'
        elif args_i['balance_groups'] and args_i['group_objective_type'] == 'standard':
            df_i['exp'] = 'erm_group_aware'
        elif args_i['group_objective_type'] == 'dro':
            df_i['exp'] = 'dro'
        # Runs matching none of the above get no `exp` value (NaN after the
        # concat) and are ignored by every `exp == ...` filter below.

        res.append(df_i)

    if not res:
        # pd.concat would raise a less informative ValueError on an empty list.
        raise ValueError(
            'No result_df_group_standard_eval.parquet files found under {}'.format(project_dir)
        )
    df_all = pd.concat(res).reset_index(drop = True)
    df_all.to_pickle(pkl_path)

In [None]:
# One row per run directory, restricted to the configuration columns
# (hyperparameters, experiment settings and identifiers).
config_columns = hparams + [
    'task', 'group_objective_type', 'selection_metric', 'balance_groups',
    'sensitive_attribute', 'group_objective_metric', 'subset_attribute',
    'subset_group', 'exp', 'hparams_id', 'fold_id', 'config_filename',
]
config_df = df_all.drop_duplicates(['config_filename']).loc[:, config_columns]

In [None]:
# Sanity check: summarise, per hyperparameter setting, how many distinct
# (hparams_id, fold_id) rows with a non-null performance exist.
rows_per_fold = df_all.drop_duplicates(subset=['hparams_id', 'fold_id'])
rows_per_fold.groupby('hparams_id')['performance'].count().describe()

#### ERM Baseline

In [None]:
# Keep only the rows of runs labelled as the plain ERM baseline.
df = df_all.loc[df_all['exp'] == 'erm_baseline']

In [None]:
# Mean overall eval BCE loss of every hyperparameter setting within a task,
# averaged over all matching rows of that setting.
mean_performance = (
    df
    .query('metric == "loss_bce" & phase == "eval"')
    .groupby(['task', 'hparams_id'])
    .agg(performance=('performance_overall', 'mean'))
    .reset_index()
)

# Per task, keep the setting with the lowest mean loss.
best_model = (
    mean_performance
    .groupby('task')
    .agg(performance=('performance', 'min'))
    # Restore `task` as a column so the merge joins on (task, performance).
    # With `task` left in the index the join would use `performance` alone and
    # could attach another task's setting that happens to have the same loss.
    .reset_index()
    .merge(mean_performance)
    # On an exact tie keep a single setting per task, as the subset and
    # group-aware selections do.
    .drop_duplicates(subset=['task', 'performance'])
)

In [None]:
# Attach the full run configuration (one row per matching run in config_df)
# to each task's selected ERM setting. The baseline is not tied to any
# sensitive attribute, so that column is blanked out.
selected_config_df_erm = (
    best_model[['hparams_id', 'task']]
    .merge(config_df[config_df.exp == 'erm_baseline'])
    .assign(tag='erm_baseline', sensitive_attribute=None)
)

In [None]:
# Inspect the lowest mean eval loss per task and the setting that achieved it.
best_model

In [None]:
# Inspect the run configurations selected for the ERM baseline.
selected_config_df_erm

#### ERM with group aware model selection

Relevant evaluations
* Best group-aware model selected by worst-group AUC model selection on the eval set over all hyperparameters
* Best group-aware model selected by worst-group loss model selection on the eval set over all hyperparameters
* Balanced groups, early stopping on the average loss across groups, model selection on average loss

In [None]:
# Keep only the rows of runs labelled as group-aware ERM.
df = df_all.loc[df_all['exp'] == 'erm_group_aware']

In [None]:
def query_df(df, query_str=None):
    """Return `df` filtered by `query_str`; return `df` unchanged when it is None.

    Exists so an optional filter can be used inside a `.pipe(...)` chain.
    """
    if query_str is None:
        return df
    return df.query(query_str)


def select_model_mean_min_max(
    df,
    metric_name='auc',
    agg_func_inner='min',
    agg_func_outer='max',
    query_str=None,
    group_vars=None
):
    """Select the best hyperparameter setting per task and attribute.

    Only rows with `phase == "eval"`, `metric == metric_name` and
    `sensitive_attribute == eval_attribute` are considered. Selection steps:

    1. within each run (`config_filename`), reduce `performance` with
       `agg_func_inner` (e.g. 'min' for worst-group AUC);
    2. average that value over the runs sharing an `hparams_id`;
    3. per (group_vars, task), keep the setting whose average equals the
       `agg_func_outer` (e.g. 'max') of those averages.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format results with at least the columns `sensitive_attribute`,
        `eval_attribute`, `metric`, `phase`, `config_filename`, `hparams_id`,
        `task` and `performance`.
    metric_name : str
        Value of the `metric` column to select on.
    agg_func_inner, agg_func_outer : str or callable
        Aggregations accepted by pandas `agg`.
    query_str : str, optional
        Extra `DataFrame.query` filter applied first.
    group_vars : list of str, optional
        Extra columns to stratify the selection by, in addition to
        `sensitive_attribute` and `eval_attribute`.

    Returns
    -------
    pandas.DataFrame
        One row per (group_vars, task) with the columns `performance`,
        the grouping columns, `hparams_id` and `task`. Exact ties keep the
        first setting in `hparams_id` sort order.
    """
    default_group_vars = ['sensitive_attribute', 'eval_attribute']
    group_vars = default_group_vars if group_vars is None else default_group_vars + list(group_vars)
    selection_keys = group_vars + ['task']

    mean_performance_by_hparam = (
        df
        .pipe(query_df, query_str=query_str)
        .query('sensitive_attribute == eval_attribute')
        .query('metric == @metric_name')
        .query('phase == "eval"')
        .groupby(group_vars + ['config_filename', 'hparams_id', 'task'])
        .agg(performance=('performance', agg_func_inner))
        .reset_index()
        .groupby(group_vars + ['hparams_id', 'task'])
        .agg(performance=('performance', 'mean'))
        .reset_index()
    )

    # Get the hparam_id with the best mean performance
    best_by_key = (
        mean_performance_by_hparam
        .groupby(selection_keys)
        .agg(performance=('performance', agg_func_outer))
        # Restore the group keys as columns so the merge joins on
        # (group keys, task, performance). Left in the index, the join would
        # use `performance` alone and could mix settings across tasks/groups.
        .reset_index()
        .merge(mean_performance_by_hparam)
        # `task` belongs in the de-duplication key; otherwise two tasks with
        # the same best score would collapse into one row.
        .drop_duplicates(subset=selection_keys + ['performance'])
    )
    # Keep the column order this function has always returned.
    return best_by_key[['performance'] + group_vars + ['hparams_id', 'task']]

In [None]:
# Group-aware ERM, worst-group AUC selection: take the minimum AUC within each
# run, and keep the setting with the highest mean of those minima.
best_mean_auc_min = (
    select_model_mean_min_max(
        df,
        metric_name='auc',
        agg_func_inner='min',
        agg_func_outer='max',
    )
    .assign(config_selection='auc_min_max', tag='aware_auc_min')
)
display(best_mean_auc_min)

In [None]:
# Group-aware ERM, worst-group loss selection: take the maximum BCE loss
# within each run, and keep the setting with the lowest mean of those maxima.
best_mean_loss_max = (
    select_model_mean_min_max(
        df,
        metric_name='loss_bce',
        agg_func_inner='max',
        agg_func_outer='min',
    )
    .assign(config_selection='loss_max_min', tag='aware_loss_max')
)
display(best_mean_loss_max)

In [None]:
# Group-aware ERM, balanced variant: restrict to runs trained with balanced
# groups and loss-based selection, average the BCE loss within each run, and
# keep the setting with the lowest mean.
best_mean_loss_mean_balanced = (
    select_model_mean_min_max(
        df,
        metric_name='loss_bce',
        agg_func_inner='mean',
        agg_func_outer='min',
        query_str='balance_groups == True & selection_metric == "loss"',
    )
    .assign(config_selection='loss_mean_min_balanced', tag='aware_balanced')
)
display(best_mean_loss_mean_balanced)

In [None]:
# Stack the three group-aware selections and attach the configuration of every
# matching run. The merge joins on the columns shared with config_df.
aware_selections = [best_mean_auc_min, best_mean_loss_max, best_mean_loss_mean_balanced]
selected_models_aware = (
    pd.concat(aware_selections)
    .drop(columns=['performance'])
    .merge(config_df.loc[config_df['exp'] == 'erm_group_aware'])
)
selected_models_aware

### Subset tuning

In [None]:
# Keep only the rows of runs trained on a data subset.
df = df_all.loc[df_all['exp'] == 'erm_subset']

In [None]:
# Mean eval BCE loss per hyperparameter setting, evaluated on the attribute
# the subset was drawn from. The self-comparison on `subset_group` is False
# for NaN, so it presumably drops rows without a subset group — TODO confirm.
subset_keys = ['task', 'subset_attribute', 'subset_group']
mean_performance_subset = (
    df
    .query('subset_attribute == eval_attribute & subset_group == subset_group')
    .query('metric == "loss_bce" & phase == "eval"')
    .groupby(['task', 'hparams_id', 'subset_attribute', 'subset_group'])['performance']
    .mean()
    .reset_index()
)

# Lowest mean loss per (task, subset attribute, subset group), joined back to
# recover the setting that achieved it; exact ties keep one row.
lowest_loss_per_subset = (
    mean_performance_subset
    .groupby(subset_keys)['performance']
    .min()
    .reset_index()
)
best_performance_subset = (
    lowest_loss_per_subset
    .merge(mean_performance_subset)
    .drop_duplicates(subset=subset_keys + ['performance'])
)
display(best_performance_subset)

# Attach the configuration of every run that used a selected setting.
selected_models_subset = (
    best_performance_subset
    .merge(config_df.loc[config_df['exp'] == 'erm_subset'])
    .drop(columns=['performance'])
    .assign(tag='erm_subset')
)
display(selected_models_subset)

### Group DRO

Relevant comparisons

* "Best DRO" - min/max auc model selection
* "Best DRO" - max/min loss model selection
* By objective - min/max auc model selection
* By objective - max/min loss model selection

In [None]:
# Keep only the rows of runs trained with the group DRO objective.
df = df_all.loc[df_all['exp'] == 'dro']

In [None]:
# "Best DRO", worst-group AUC selection: minimum AUC within each run, then
# the setting with the highest mean of those minima.
best_mean_auc_min_dro = (
    select_model_mean_min_max(
        df,
        metric_name='auc',
        agg_func_inner='min',
        agg_func_outer='max',
    )
    .assign(config_selection='auc_min_max', tag='dro_auc_min')
)
display(best_mean_auc_min_dro)

In [None]:
# "Best DRO", worst-group loss selection: maximum BCE loss within each run,
# then the setting with the lowest mean of those maxima.
best_mean_loss_max_dro = (
    select_model_mean_min_max(
        df,
        metric_name='loss_bce',
        agg_func_inner='max',
        agg_func_outer='min',
    )
    .assign(config_selection='loss_max_min', tag='dro_loss_max')
)
display(best_mean_loss_max_dro)

In [None]:
# Worst-group AUC selection, done separately for every DRO objective metric.
# Each row's tag carries the objective metric it was selected for.
best_mean_auc_min_dro_by_objective_metric = (
    select_model_mean_min_max(
        df,
        metric_name='auc',
        agg_func_inner='min',
        agg_func_outer='max',
        group_vars=['group_objective_metric'],
    )
    .assign(
        config_selection='auc_min_max',
        tag=lambda selected: selected.apply(
            lambda row: 'dro_auc_min_objective_{}'.format(row.group_objective_metric),
            axis=1,
        ),
    )
)

display(best_mean_auc_min_dro_by_objective_metric)

In [None]:
# Worst-group loss selection, done separately for every DRO objective metric.
# Each row's tag carries the objective metric it was selected for.
best_mean_loss_max_dro_by_objective_metric = (
    select_model_mean_min_max(
        df,
        metric_name='loss_bce',
        agg_func_inner='max',
        agg_func_outer='min',
        group_vars=['group_objective_metric'],
    )
    .assign(
        config_selection='loss_max_min',
        tag=lambda selected: selected.apply(
            lambda row: 'dro_loss_max_objective_{}'.format(row.group_objective_metric),
            axis=1,
        ),
    )
)

display(best_mean_loss_max_dro_by_objective_metric)

In [None]:
# Stack the four DRO selections. `group_objective_metric` is present only in
# the by-objective frames, so it is dropped here and comes back from config_df
# through the merge.
dro_selections = [
    best_mean_auc_min_dro,
    best_mean_loss_max_dro,
    best_mean_auc_min_dro_by_objective_metric,
    best_mean_loss_max_dro_by_objective_metric,
]
selected_models_dro = (
    pd.concat(dro_selections)
    .drop(columns=['performance', 'group_objective_metric'])
    .merge(config_df.loc[config_df['exp'] == 'dro'])
)
selected_models_dro

#### Put it all together - export configs

In [None]:
# Combine the selections of all experiment families into one table.
# Columns missing from a family are filled with NaN by the concat.
selection_frames = [
    selected_config_df_erm,
    selected_models_aware,
    selected_models_subset,
    selected_models_dro,
]
selected_config_df = pd.concat(selection_frames)
selected_config_df

In [None]:
# Write the selected configurations into the results root. The path is built
# from `project_dir` (same location as before) so the export follows that
# setting instead of a second hard-coded copy of the path.
selected_config_df.to_csv(
    project_dir / 'selected_configs.csv',
    index=False
)

In [None]:
# List the distinct tasks that ended up with at least one selected configuration.
selected_config_df.task.unique()