In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import glob
import os
import matplotlib.pyplot as plt
import shutil
from prediction_utils.util import df_dict_concat, yaml_read, yaml_write

In [3]:
# Root directory of the cohort project; every experiment path below is built from it.
# NOTE(review): absolute, cluster-specific path -- the notebook only runs where it exists.
project_dir = "/share/pi/nigam/projects/spfohl/cohorts/admissions/starr_20200523"
# List the entries under <project_dir>/experiments to see what is available.
os.listdir(os.path.join(project_dir, 'experiments'))

['baseline_tuning_fold_1_10']

In [4]:
# Name of the experiment subdirectory (under <project_dir>/experiments) analysed in this notebook.
experiment_name = 'baseline_tuning_fold_1_10'

In [5]:
# Recursively collect every 'result_df_training_eval.parquet' file found at
# any depth below this experiment's directory ('**' + recursive=True).
result_glob_pattern = os.path.join(
    project_dir,
    'experiments',
    experiment_name,
    '**',
    'result_df_training_eval.parquet',
)
baseline_files = glob.glob(result_glob_pattern, recursive=True)

In [6]:
# Load each result file, keyed by the three path components directly above
# the file name (a 3-tuple of directory names).
baseline_df_dict = {
    tuple(result_path.split('/')[-4:-1]): pd.read_parquet(result_path)
    for result_path in baseline_files
}

# Combine the per-file frames into one table. The key tuple is labelled as
# (task, config_filename, fold) -- presumably df_dict_concat turns the key
# parts into columns with these names; confirm against prediction_utils.
baseline_key_names = ['task', 'config_filename', 'fold']
baseline_df = df_dict_concat(baseline_df_dict, baseline_key_names)

In [7]:
# Preview the first rows of the combined results table.
baseline_df.head()

Unnamed: 0,task,config_filename,fold,metric,phase,epoch,performance
0,LOS_7,8.yaml,7,auc,val,0,0.787855
1,LOS_7,8.yaml,7,auprc,val,0,0.466624
2,LOS_7,8.yaml,7,brier,val,0,0.131414
3,LOS_7,8.yaml,7,loss_bce,val,0,0.408703
4,LOS_7,8.yaml,7,loss,val,0,0.408703


In [15]:
# Sanity check: every (task, config_filename) pair must have results for the
# expected number of distinct folds; otherwise fold-averaged comparisons
# between configs would be computed over unequal sets of folds.
expected_num_folds = 10

folds_per_config = (
    baseline_df
    .groupby(['task', 'config_filename'])['fold']
    # dropna=False counts a missing fold label as its own value, which is
    # what len(x.unique()) does.
    .nunique(dropna=False)
)
incomplete_configs = folds_per_config[folds_per_config != expected_num_folds]

# Report the offending pairs instead of failing with a bare AssertionError.
assert incomplete_configs.empty, (
    f"{len(incomplete_configs)} (task, config_filename) pair(s) do not have "
    f"exactly {expected_num_folds} folds:\n{incomplete_configs}"
)

In [8]:
# Mean validation loss for every (config_filename, task) pair: the mean of
# `performance` over all rows with metric == "loss" and phase == "val".
mean_performance = (
    baseline_df
    .query('metric == "loss" & phase == "val"')
    .groupby(['config_filename', 'task'])
    .agg(performance=('performance', 'mean'))
    .reset_index()
)

# Select, for each task, the config with the lowest mean validation loss.
#
# The previous implementation took the per-task minimum and merged it back
# onto `mean_performance`. Because `task` was the index of the grouped frame,
# the only shared column was the float `performance`, so the join key was a
# float value alone: a task's minimum could match a config of a *different*
# task with an equal mean, and ties within a task produced several "best"
# rows. Sorting and keeping the first row per task always yields exactly one
# row per task; ties are broken deterministically by config_filename, and NaN
# means sort last so they are never preferred over a real value.
best_model = (
    mean_performance
    .sort_values(['task', 'performance', 'config_filename'])
    .drop_duplicates(subset='task', keep='first')
    .reset_index(drop=True)
    # Same column order the merge-based version produced.
    [['performance', 'config_filename', 'task']]
)

In [16]:
# Identifying columns (config file + task) of each task's selected model.
best_model_config_df = best_model.loc[:, ['config_filename', 'task']]

# Restrict the full results table to the rows of the selected configs.
# The join keys are spelled out; they are the columns the two frames share.
best_model_performance = baseline_df.merge(
    best_model_config_df,
    how='inner',
    on=['task', 'config_filename'],
)

In [22]:
# Show the distinct (task, config_filename) pairs that remain after the merge.
best_model_performance[['task', 'config_filename']].drop_duplicates()

Unnamed: 0,task,config_filename
0,LOS_7,15.yaml
100,readmission_30,48.yaml
200,hospital_mortality,13.yaml


In [23]:
# Display the selected config file for each task.
best_model_config_df

Unnamed: 0,config_filename,task
0,15.yaml,LOS_7
1,13.yaml,hospital_mortality
2,48.yaml,readmission_30


In [24]:
# Display the combined results table (the rendered output is truncated by pandas).
baseline_df

Unnamed: 0,task,config_filename,fold,metric,phase,epoch,performance
0,LOS_7,8.yaml,7,auc,val,0,0.787855
1,LOS_7,8.yaml,7,auprc,val,0,0.466624
2,LOS_7,8.yaml,7,brier,val,0,0.131414
3,LOS_7,8.yaml,7,loss_bce,val,0,0.408703
4,LOS_7,8.yaml,7,loss,val,0,0.408703
...,...,...,...,...,...,...,...
14957,hospital_mortality,49.yaml,9,auc,test,0,0.858716
14958,hospital_mortality,49.yaml,9,auprc,test,0,0.144664
14959,hospital_mortality,49.yaml,9,brier,test,0,0.019936
14960,hospital_mortality,49.yaml,9,loss_bce,test,0,0.086408


In [25]:
# Directory the per-task config files are read from for this experiment.
base_config_path = os.path.join(project_dir, 'experiments', experiment_name, 'config')
# Selected configs are written to the 'selected_models' subdirectory of it.
selected_config_path = os.path.join(base_config_path, 'selected_models')

In [26]:
# Copy each task's selected config into <selected_config_path>/<task>/,
# setting its 'label_col' entry to the task name on the way.
for selected in best_model_config_df.itertuples(index=False):
    source_config_file = os.path.join(
        base_config_path, selected.task, selected.config_filename
    )
    task_output_dir = os.path.join(selected_config_path, selected.task)

    the_config = yaml_read(source_config_file)
    # Printed before 'label_col' is overwritten, so this shows the config as read.
    print(the_config)
    the_config['label_col'] = selected.task

    # exist_ok=True makes re-running this cell safe when the directory exists.
    os.makedirs(task_output_dir, exist_ok=True)
    yaml_write(the_config, os.path.join(task_output_dir, selected.config_filename))

{'batch_size': 256, 'drop_prob': 0.75, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 128, 'label_col': 'LOS_7', 'lr': 0.0001, 'num_epochs': 150, 'num_hidden': 1}
{'batch_size': 512, 'drop_prob': 0.75, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 256, 'label_col': 'hospital_mortality', 'lr': 0.0001, 'num_epochs': 150, 'num_hidden': 3}
{'batch_size': 512, 'drop_prob': 0.75, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 128, 'label_col': 'readmission_30', 'lr': 1e-05, 'num_epochs': 150, 'num_hidden': 3}
