In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import glob
import os
import matplotlib.pyplot as plt
import shutil
from prediction_utils.util import df_dict_concat, yaml_read, yaml_write

In [2]:
# Root directory of the project; the experiment and config paths below are built from it.
project_dir = "/share/pi/nigam/projects/spfohl/cohorts/admissions/mimic_omop/"
# List the entries found under <project_dir>/experiments.
os.listdir(os.path.join(project_dir, 'experiments'))

['baseline_tuning_fold_1_10']

In [3]:
# Name of the experiment folder (under <project_dir>/experiments) whose results are analysed below.
experiment_name = 'baseline_tuning_fold_1_10'

In [4]:
# Recursively collect every 'result_df_training_eval.parquet' stored anywhere
# below the chosen experiment directory ('**' matches any depth of subfolders).
result_file_pattern = os.path.join(
    project_dir, 'experiments', experiment_name, '**', 'result_df_training_eval.parquet'
)
baseline_files = glob.glob(result_file_pattern, recursive=True)

In [5]:
# Load each results table, keyed by the three directory names that sit directly
# above the parquet file in its path.
baseline_df_dict = {}
for result_path in baseline_files:
    run_key = tuple(result_path.split('/')[-4:-1])
    baseline_df_dict[run_key] = pd.read_parquet(result_path)

# Combine the per-run tables into one frame; the three key components are passed
# to df_dict_concat under the names task / config_filename / fold.
baseline_df = df_dict_concat(baseline_df_dict, ['task', 'config_filename', 'fold'])

In [6]:
# Preview the first rows of the combined results table.
baseline_df.head()

Unnamed: 0,task,config_filename,fold,metric,phase,epoch,performance
0,los_icu_3days,8.yaml,7,auc,val,0,0.729332
1,los_icu_3days,8.yaml,7,auprc,val,0,0.560348
2,los_icu_3days,8.yaml,7,brier,val,0,0.184709
3,los_icu_3days,8.yaml,7,loss_bce,val,0,0.551051
4,los_icu_3days,8.yaml,7,loss,val,0,0.551051


In [7]:
# Mean validation loss of every (config_filename, task) pair, averaged over all
# remaining rows of that pair (i.e. across folds, and epochs if several are logged).
mean_performance = (
    baseline_df
    .query('metric == "loss" and phase == "val"')
    .groupby(['config_filename', 'task'])
    .agg(performance=('performance', 'mean'))
    .reset_index()
)

# For each task, keep the config(s) whose mean validation loss equals the task minimum.
# The join is made on BOTH 'task' and 'performance': after groupby('task') the task
# lives in the index, so a bare .merge() would match on the float 'performance' value
# alone and could pull in a config from a different task that happens to have the
# same mean loss. Configs that tie for the minimum within a task are all kept.
best_model = (
    mean_performance
    .groupby('task')
    .agg(performance=('performance', 'min'))
    .reset_index()
    .merge(mean_performance, on=['task', 'performance'], how='inner')
    # Keep the column order the notebook displayed previously.
    [['performance', 'config_filename', 'task']]
)

In [8]:
# Identifying columns of the selected models: one (config_filename, task) pair per row.
best_model_config_df = best_model.loc[:, ['config_filename', 'task']]
# Restrict the full results to those pairs; with no explicit key the merge joins on
# the columns the two frames share.
best_model_performance = pd.merge(baseline_df, best_model_config_df)

In [9]:
# Average the selected models' performance within each
# task / config / metric / phase / epoch group (i.e. across the remaining rows).
# Only the 'performance' column is aggregated: 'fold' is derived from the file path
# and is presumably a string column, and pandas >= 2.0 raises a TypeError when asked
# for the mean of a non-numeric column instead of silently dropping it.
(
    best_model_performance
    .groupby(['task', 'config_filename', 'metric', 'phase', 'epoch'])
    [['performance']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,performance
task,config_filename,metric,phase,epoch,Unnamed: 5_level_1
los_icu_3days,42.yaml,auc,test,0,0.72107
los_icu_3days,42.yaml,auc,val,0,0.741523
los_icu_3days,42.yaml,auprc,test,0,0.547199
los_icu_3days,42.yaml,auprc,val,0,0.585919
los_icu_3days,42.yaml,brier,test,0,0.189039
los_icu_3days,42.yaml,brier,val,0,0.180962
los_icu_3days,42.yaml,loss,test,0,0.559233
los_icu_3days,42.yaml,loss,val,0,0.54083
los_icu_3days,42.yaml,loss_bce,test,0,0.559233
los_icu_3days,42.yaml,loss_bce,val,0,0.54083


In [10]:
# Show each distinct (task, config_filename) pair present in the selected-model results.
best_model_performance[['task', 'config_filename']].drop_duplicates()

Unnamed: 0,task,config_filename
0,los_icu_3days,42.yaml
100,mortality_hospital,42.yaml
200,los_icu_7days,48.yaml
300,mortality_icu,42.yaml


In [11]:
# Display the selected (config_filename, task) pairs.
best_model_config_df

Unnamed: 0,config_filename,task
0,42.yaml,los_icu_3days
1,48.yaml,los_icu_7days
2,42.yaml,mortality_hospital
3,42.yaml,mortality_icu


In [12]:
# Display the selected configs together with their mean validation loss.
best_model

Unnamed: 0,performance,config_filename,task
0,0.54083,42.yaml,los_icu_3days
1,0.185529,48.yaml,los_icu_7days
2,0.178114,42.yaml,mortality_hospital
3,0.122544,42.yaml,mortality_icu


In [13]:
# Directory that holds the tuning experiment's config files.
base_config_path = os.path.join(project_dir, 'experiments', experiment_name, 'config')
# The selected configs are written to a 'selected_models' subfolder of that directory.
selected_config_path = os.path.join(base_config_path, 'selected_models')

In [14]:
# Copy every selected config into <selected_config_path>/<task>/, printing the task
# name and the config as read from disk before its 'label_col' is set to the task.
for selection in best_model_config_df.itertuples(index=False):
    source_file = os.path.join(base_config_path, selection.task, selection.config_filename)
    target_dir = os.path.join(selected_config_path, selection.task)

    the_config = yaml_read(source_file)
    print(selection.task)
    print(the_config)

    the_config['label_col'] = selection.task
    os.makedirs(target_dir, exist_ok=True)
    yaml_write(the_config, os.path.join(target_dir, selection.config_filename))

los_icu_3days
{'batch_size': 128, 'drop_prob': 0.75, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 256, 'label_col': 'los_icu_3days', 'lr': 1e-05, 'num_epochs': 150, 'num_hidden': 1}
los_icu_7days
{'batch_size': 512, 'drop_prob': 0.75, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 128, 'label_col': 'los_icu_7days', 'lr': 1e-05, 'num_epochs': 150, 'num_hidden': 3}
mortality_hospital
{'batch_size': 128, 'drop_prob': 0.75, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 256, 'label_col': 'mortality_hospital', 'lr': 1e-05, 'num_epochs': 150, 'num_hidden': 1}
mortality_icu
{'batch_size': 128, 'drop_prob': 0.75, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 256, 'label_col': 'mortality_icu', 'lr': 1e-05, 'num_epochs': 150, 'num_hidden': 1}
