In [19]:
import os
import pandas as pd
import glob
import yaml

In [2]:
# Root data directory, given relative to the notebook's working directory.
data_path = 'data'
# Directory that is searched recursively for the baseline grid's performance CSVs.
performance_path = os.path.join(data_path, 'performance', 'baseline_grid')

In [3]:
# Metric whose validation-phase value is used to pick the best experiment per outcome.
selection_metric = 'auc'

In [4]:
def path_split(x, num_iters = 1):
    """Return the `num_iters`-th path component of `x`, counted from the end.

    `num_iters = 1` gives the final component (the file name), `2` the
    directory containing it, `3` that directory's parent, and so on.
    Values of `num_iters` below 1 behave like 1.
    """
    levels_to_strip = num_iters - 1
    while levels_to_strip > 0:
        # Drop the trailing component and keep walking up the path.
        x = os.path.dirname(x)
        levels_to_strip -= 1
    return os.path.basename(x)

In [5]:
# Find every evaluation CSV anywhere below the performance directory.
files_eval = glob.glob(os.path.join(performance_path, '**', '*_eval.csv'), recursive=True)

# Stack all files into one frame. Each file is keyed by its full path plus the
# two directory names directly above it (third-from-last and second-from-last
# path components), which become the 'outcome' and 'experiment_id' index levels.
df_eval = (
    pd.concat({
        (csv_path, path_split(csv_path, 3), path_split(csv_path, 2)): pd.read_csv(csv_path)
        for csv_path in files_eval
    })
    .rename_axis(index = ['file_path', 'outcome', 'experiment_id', 'index'])
)

In [6]:
# Move the three key levels out of the index into ordinary columns; the
# per-file row number ('index') stays as the index.
df_eval = df_eval.reset_index(level = ['file_path', 'outcome', 'experiment_id'])

In [7]:
# Select, for every outcome, the experiment with the highest validation value of
# `selection_metric`.
# NOTE: taking the maximum assumes "higher is better", which holds for 'auc' but
# would not hold for a metric such as 'loss' or 'brier'.
val_selection = df_eval.loc[(df_eval.metric == selection_metric) & (df_eval.phase == 'val')]
best_performance = (
    val_selection
    # as_index = False keeps 'phase' and 'outcome' as columns so they can be join keys.
    .groupby(['phase', 'outcome'], as_index = False)[['performance']]
    .max()
    # Join back on the group keys *and* the score, against the filtered validation
    # rows only. Merging on 'performance' alone (what a bare .merge(df_eval) does
    # once the group keys are in the index) would also pick up rows of any other
    # metric, phase or outcome that happen to share the same value.
    .merge(val_selection, on = ['phase', 'outcome', 'performance'])
)
# Keep the column layout: 'performance' first, then the remaining df_eval columns.
best_performance = best_performance[
    ['performance'] + [col for col in df_eval.columns if col != 'performance']
]
best_performance

Unnamed: 0,performance,file_path,outcome,experiment_id,metric,phase,epoch
0,0.850924,data/performance/baseline_grid/los/75/15523354...,los,75,auc,val,0
1,0.905366,data/performance/baseline_grid/mortality/32/15...,mortality,32,auc,val,0


In [8]:
# Show every metric/phase row that belongs to the selected experiment of each outcome.
df_eval.merge(
    best_performance[['experiment_id', 'outcome']],
    how = 'inner',
    on = ['outcome', 'experiment_id'],
)

Unnamed: 0,file_path,outcome,experiment_id,metric,phase,epoch,performance
0,data/performance/baseline_grid/los/75/15523354...,los,75,auc,val,0,0.850924
1,data/performance/baseline_grid/los/75/15523354...,los,75,auprc,val,0,0.602833
2,data/performance/baseline_grid/los/75/15523354...,los,75,brier,val,0,0.117819
3,data/performance/baseline_grid/los/75/15523354...,los,75,loss,val,0,0.381414
4,data/performance/baseline_grid/los/75/15523354...,los,75,auc,test,0,0.854169
5,data/performance/baseline_grid/los/75/15523354...,los,75,auprc,test,0,0.597652
6,data/performance/baseline_grid/los/75/15523354...,los,75,brier,test,0,0.114607
7,data/performance/baseline_grid/los/75/15523354...,los,75,loss,test,0,0.372925
8,data/performance/baseline_grid/mortality/32/15...,mortality,32,auc,val,0,0.905366
9,data/performance/baseline_grid/mortality/32/15...,mortality,32,auprc,val,0,0.23262


In [9]:
# Show which experiment was selected for each outcome.
best_performance.loc[:, ['outcome', 'experiment_id']]

Unnamed: 0,outcome,experiment_id
0,los,75
1,mortality,32


In [10]:
# Turn the outcome -> experiment_id pairs into a one-row frame (one column per
# outcome) and export it as a list of records, i.e. a list holding a single dict.
best_model_ids = (
    best_performance[['outcome', 'experiment_id']]
    .set_index('outcome')
    .T
    .to_dict(orient = 'records')
)

In [17]:
# Build the {outcome: experiment_id} mapping directly from best_performance rather
# than unwrapping the previous value of best_model_ids in place. This cell can
# then be re-run safely; indexing the already-unwrapped dict with [0] a second
# time would raise KeyError.
best_model_ids = best_performance.set_index('outcome')['experiment_id'].to_dict()
print(best_model_ids)

{'los': '75', 'mortality': '32'}


In [23]:
## Create defaults
# For every outcome, copy the grid config of its selected experiment to
# <data_path>/config/defaults/baseline/<outcome>/model_config.yaml.
config_path = os.path.join(data_path, 'config', 'grid', 'baseline')
config_defaults_path = os.path.join(data_path, 'config', 'defaults', 'baseline')
for the_outcome, the_model_id in best_model_ids.items():
    ## Read the best config
    # yaml.safe_load is used because yaml.load(fp) without an explicit Loader is
    # deprecated since PyYAML 5.1 and raises TypeError from PyYAML 6. safe_load
    # accepts standard YAML tags only and never constructs arbitrary Python objects.
    with open(os.path.join(config_path, '{}.yaml'.format(the_model_id)), 'r') as fp:
        best_config = yaml.safe_load(fp)
    temp_path = os.path.join(config_defaults_path, the_outcome)
    # exist_ok lets the cell be re-run without failing on existing directories.
    os.makedirs(temp_path, exist_ok = True)
    with open(os.path.join(temp_path, 'model_config.yaml'), 'w') as fp:
        yaml.dump(best_config, fp)

In [11]:
# Find every per-group performance CSV anywhere below the performance directory.
files_group = glob.glob(os.path.join(performance_path, '**', '*_by_group.csv'), recursive=True)

# Stack all files into one frame keyed by (full path, third-from-last path
# component, second-from-last path component), then expose those three keys as
# regular columns while the per-file row number stays as the index.
df_group = (
    pd.concat({
        (csv_path, path_split(csv_path, 3), path_split(csv_path, 2)): pd.read_csv(csv_path)
        for csv_path in files_group
    })
    .rename_axis(index = ['file_path', 'outcome', 'experiment_id', 'index'])
    .reset_index(level = ['file_path', 'outcome', 'experiment_id'])
)

In [12]:
# Per-group results of the selected experiments, restricted to validation-phase AUC.
df_group_best = (
    df_group
    .merge(
        best_performance[['outcome', 'experiment_id']],
        how = 'inner',
        on = ['outcome', 'experiment_id'],
    )
    .loc[lambda merged: (merged.phase == 'val') & (merged.metric == 'auc')]
)

In [13]:
# Display the validation-phase, per-group AUC rows of the selected experiments.
df_group_best

Unnamed: 0,file_path,outcome,experiment_id,sensitive_variable,group,metric,phase,epoch,performance
0,data/performance/baseline_grid/los/75/15523354...,los,75,age,0,auc,val,0,0.893216
8,data/performance/baseline_grid/los/75/15523354...,los,75,age,1,auc,val,0,0.88337
16,data/performance/baseline_grid/los/75/15523354...,los,75,age,2,auc,val,0,0.83471
24,data/performance/baseline_grid/los/75/15523354...,los,75,age,3,auc,val,0,0.826948
32,data/performance/baseline_grid/los/75/15523354...,los,75,gender,0,auc,val,0,0.865027
40,data/performance/baseline_grid/los/75/15523354...,los,75,gender,1,auc,val,0,0.827259
48,data/performance/baseline_grid/los/75/15523354...,los,75,race_eth,0,auc,val,0,0.86739
56,data/performance/baseline_grid/los/75/15523354...,los,75,race_eth,1,auc,val,0,0.801317
64,data/performance/baseline_grid/los/75/15523354...,los,75,race_eth,2,auc,val,0,0.861681
72,data/performance/baseline_grid/los/75/15523354...,los,75,race_eth,3,auc,val,0,0.840504


In [14]:
# Per-group results of the selected experiments, restricted to test-phase AUC.
df_group_best = (
    df_group
    .merge(
        best_performance[['outcome', 'experiment_id']],
        how = 'inner',
        on = ['outcome', 'experiment_id'],
    )
    .loc[lambda merged: (merged.phase == 'test') & (merged.metric == 'auc')]
)
df_group_best

Unnamed: 0,file_path,outcome,experiment_id,sensitive_variable,group,metric,phase,epoch,performance
4,data/performance/baseline_grid/los/75/15523354...,los,75,age,0,auc,test,0,0.88287
12,data/performance/baseline_grid/los/75/15523354...,los,75,age,1,auc,test,0,0.886358
20,data/performance/baseline_grid/los/75/15523354...,los,75,age,2,auc,test,0,0.856125
28,data/performance/baseline_grid/los/75/15523354...,los,75,age,3,auc,test,0,0.81681
36,data/performance/baseline_grid/los/75/15523354...,los,75,gender,0,auc,test,0,0.866275
44,data/performance/baseline_grid/los/75/15523354...,los,75,gender,1,auc,test,0,0.834033
52,data/performance/baseline_grid/los/75/15523354...,los,75,race_eth,0,auc,test,0,0.862112
60,data/performance/baseline_grid/los/75/15523354...,los,75,race_eth,1,auc,test,0,0.845566
68,data/performance/baseline_grid/los/75/15523354...,los,75,race_eth,2,auc,test,0,0.866527
76,data/performance/baseline_grid/los/75/15523354...,los,75,race_eth,3,auc,test,0,0.837003
