In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})

from tqdm import tqdm

In [None]:
# Load the raw per-seed outputs of both runs of the experiment.
# Run '1' is stored as df_pro and run '2' as df_ret (presumably the
# prospective / retrospective cohorts used in the later cells).
simulation_type = 'experiment_2'
cohort_frames = {}
for runid in ('1', '2'):
    # Raises IndexError when the run's output folder does not exist.
    runid_foldername = [foldername for foldername in os.listdir(f'experiments/{simulation_type}') if foldername == f'output_{runid}'][0]
    output_folder = f'experiments/{simulation_type}/{runid_foldername}'

    # Result files are named 'seed_<seed>'. The de-duplicated seeds are sorted
    # so the concatenation order does not depend on set iteration order.
    file_list = os.listdir(output_folder)
    seed_list = sorted({file.split('_')[1] for file in file_list if 'seed' in file})

    results_list = []
    for seed in tqdm(seed_list):
        df_seed = pd.read_csv(f'{output_folder}/seed_{seed}', usecols=['ed_los', 'destination_record'], dtype={'ed_los': int, 'destination_record': str})
        # Rescale by 1/60 (presumably minutes -> hours; TODO confirm the unit).
        df_seed['ed_los'] = df_seed['ed_los'] / 60
        df_seed['case_len'] = df_seed.destination_record.str.len()
        results_list.append(df_seed)
    cohort_frames[runid] = pd.concat(results_list)

df_pro = cohort_frames['1']
df_ret = cohort_frames['2']

In [None]:
# Category levels used to stratify the LOS results. Acuity levels are kept as
# strings because the 'acuity' column is read with dtype str.
list_acuity = [str(level) for level in range(1, 6)]
list_disposition = [
    'HOME',
    'WARD',
    'ICU',
]
list_complexity = [
    'LOW',
    'MODERATE',
    'HIGH',
]

In [None]:
# Get target LOS distributions.
# Every file under params/los/ is parsed as JSON; the group files are
# concatenated with list.extend, so they are expected to hold JSON arrays.
with open('params/los/los_overall.txt') as filehandle:
    target_los_all = json.load(filehandle)

target_los_acuity = {}
for acuity_val in list_acuity:
    with open(f'params/los/los_acuity_{acuity_val}.txt') as filehandle:
        target_los_acuity[acuity_val] = json.load(filehandle)

target_los_disposition = {}
for disposition_val in list_disposition:
    with open(f'params/los/los_disposition_{disposition_val}.txt') as filehandle:
        target_los_disposition[disposition_val] = json.load(filehandle)

target_los_complexity = {}
for complexity_val in list_complexity:
    with open(f'params/los/los_complexity_{complexity_val}.txt') as filehandle:
        target_los_complexity[complexity_val] = json.load(filehandle)

# Two-factor targets are pooled from the acuity-disposition-complexity group
# files. Every key is created up front so that a combination without any
# group file still maps to an empty list.
target_los_acuity_disposition = {
    f'{acuity_val}_{disposition_val}': []
    for acuity_val in list_acuity
    for disposition_val in list_disposition
}
target_los_acuity_complexity = {
    f'{acuity_val}_{complexity_val}': []
    for acuity_val in list_acuity
    for complexity_val in list_complexity
}

# Each group file is read once and contributes to both pooled dictionaries.
for acuity_val in list_acuity:
    for disposition_val in list_disposition:
        for complexity_val in list_complexity:
            group_path = f'params/los/los_groupname_{acuity_val}-{disposition_val}-{complexity_val}.txt'
            try:
                with open(group_path) as filehandle:
                    los_temp = json.load(filehandle)
            except FileNotFoundError:
                # Presumably not every combination occurs in the target data;
                # only a missing file is tolerated. Malformed JSON or other
                # I/O errors are no longer silently ignored.
                continue
            target_los_acuity_disposition[f'{acuity_val}_{disposition_val}'].extend(los_temp)
            target_los_acuity_complexity[f'{acuity_val}_{complexity_val}'].extend(los_temp)

In [None]:
def get_distribution_statistics(results_list):
    """Summarise a sample by its median and quartiles.

    Args:
        results_list: Sized collection of numeric values (list, Series, ...).

    Returns:
        Tuple ``(median, first quartile, third quartile)``, each rounded to
        one decimal place, or ``('N/A', 'N/A', 'N/A')`` when the collection
        is empty.
    """
    if len(results_list) == 0:
        return 'N/A', 'N/A', 'N/A'

    median_val = np.median(results_list)
    lower_quartile = np.quantile(results_list, 0.25)
    upper_quartile = np.quantile(results_list, 0.75)
    return round(median_val, 1), round(lower_quartile, 1), round(upper_quartile, 1)


In [None]:
def print_los(los_all, los_acuity, los_disposition, los_complexity, los_acuity_disposition, los_acuity_complexity):
    """Print median / quartile summaries of LOS for every stratification.

    Args:
        los_all: LOS values of all patients.
        los_acuity: Mapping acuity level -> LOS values.
        los_disposition: Mapping disposition -> LOS values.
        los_complexity: Mapping complexity -> LOS values.
        los_acuity_disposition: Mapping '<acuity>_<disposition>' -> LOS values.
        los_acuity_complexity: Mapping '<acuity>_<complexity>' -> LOS values.

    The levels iterated over come from the module-level ``list_acuity``,
    ``list_disposition`` and ``list_complexity``. Each line shows the tuple
    returned by ``get_distribution_statistics``.
    """
    summarise = get_distribution_statistics
    print('overall', summarise(los_all))

    # One section per single stratification factor.
    single_factor_sections = (
        ('--- Acuity', list_acuity, los_acuity),
        ('--- Disposition', list_disposition, los_disposition),
        ('--- Complexity', list_complexity, los_complexity),
    )
    for header, levels, los_by_level in single_factor_sections:
        print(header)
        for level in levels:
            print(level, summarise(los_by_level[level]))

    # Crossed sections: the second factor is the outer loop, acuity the inner.
    crossed_sections = (
        ('--- Acuity x Disposition', list_disposition, los_acuity_disposition),
        ('--- Acuity x Complexity', list_complexity, los_acuity_complexity),
    )
    for header, second_levels, los_by_group in crossed_sections:
        print(header)
        for second_level in second_levels:
            for acuity_level in list_acuity:
                print(acuity_level, second_level, summarise(los_by_group[f'{acuity_level}_{second_level}']))


In [None]:
def load_results(simulation_type, run_id):
    """Load every per-seed result file of one simulation run.

    Files named ``seed_<seed>`` are read from
    ``experiments/<simulation_type>/output_<run_id>``. In each frame
    ``ed_los`` is divided by 60 (presumably minutes -> hours; TODO confirm)
    and ``case_len`` is set to the string length of ``destination_record``.

    Args:
        simulation_type: Name of the experiment folder under ``experiments/``.
        run_id: Run identifier; the folder ``output_<run_id>`` is loaded.

    Returns:
        Tuple ``(df_results_concatenated, df_results_per_run)``: the
        concatenation of all seed frames and the list of individual frames,
        ordered by sorted seed string so the order is reproducible.

    Raises:
        IndexError: If no ``output_<run_id>`` folder exists.
        ValueError: From ``pd.concat`` if the folder has no seed files.
    """
    runid_foldername = [foldername for foldername in os.listdir(f'experiments/{simulation_type}') if foldername == f'output_{run_id}'][0]
    output_folder = f'experiments/{simulation_type}/{runid_foldername}'

    # De-duplicate the seeds, then sort: iterating a set of strings directly
    # gives an order that can change between interpreter sessions.
    seed_list = sorted({file.split('_')[1] for file in os.listdir(output_folder) if 'seed' in file})

    df_results_per_run = []
    for seed in tqdm(seed_list):
        df_seed = pd.read_csv(f'{output_folder}/seed_{seed}', dtype={'acuity': str, 'disposition': str, 'complexity': str, 'ed_los': float})
        df_seed['ed_los'] = df_seed['ed_los'] / 60
        df_seed['case_len'] = df_seed.destination_record.str.len()

        df_results_per_run.append(df_seed)
    df_results_concatenated = pd.concat(df_results_per_run)

    return df_results_concatenated, df_results_per_run

In [None]:
# Load both runs of experiment_2: run '1' feeds the df_pro_* variables and
# run '2' the df_ret_* variables (pooled frame plus one frame per seed).
pro_results = load_results('experiment_2', '1')
ret_results = load_results('experiment_2', '2')
df_pro_concatenated, df_pro_per_run = pro_results
df_ret_concatenated, df_ret_per_run = ret_results

In [None]:
# Summary statistics of the target LOS distributions loaded from params/los/.
print('=== Target distributions')
print_los(
    los_all=target_los_all,
    los_acuity=target_los_acuity,
    los_disposition=target_los_disposition,
    los_complexity=target_los_complexity,
    los_acuity_disposition=target_los_acuity_disposition,
    los_acuity_complexity=target_los_acuity_complexity,
)

In [None]:
# Print the same LOS summary for each simulated cohort, pooled over all seeds.
cohorts = (('Prospective', df_pro_concatenated), ('Retrospective', df_ret_concatenated))
for cohort_type, df_cohort in cohorts:
    print(f'\n=== {cohort_type} cohort type')

    cohort_los_all = df_cohort['ed_los']

    # Single-factor strata: level -> ed_los of the matching rows.
    cohort_los_acuity = {
        acuity_val: df_cohort.loc[df_cohort['acuity'] == acuity_val, 'ed_los']
        for acuity_val in list_acuity
    }
    cohort_los_disposition = {
        disposition_val: df_cohort.loc[df_cohort['disposition'] == disposition_val, 'ed_los']
        for disposition_val in list_disposition
    }
    cohort_los_complexity = {
        complexity_val: df_cohort.loc[df_cohort['complexity'] == complexity_val, 'ed_los']
        for complexity_val in list_complexity
    }

    # Two-factor strata keyed '<acuity>_<second level>'.
    cohort_los_acuity_disposition = {
        f'{acuity_val}_{disposition_val}': df_cohort.loc[(df_cohort['acuity'] == acuity_val) & (df_cohort['disposition'] == disposition_val), 'ed_los']
        for acuity_val in list_acuity
        for disposition_val in list_disposition
    }
    cohort_los_acuity_complexity = {
        f'{acuity_val}_{complexity_val}': df_cohort.loc[(df_cohort['acuity'] == acuity_val) & (df_cohort['complexity'] == complexity_val), 'ed_los']
        for acuity_val in list_acuity
        for complexity_val in list_complexity
    }

    print_los(cohort_los_all, cohort_los_acuity, cohort_los_disposition, cohort_los_complexity, cohort_los_acuity_disposition, cohort_los_acuity_complexity)

In [None]:
def _median_differences_by_group(df_results_per_run, target_results, group_filters):
    """Per-run median ``ed_los`` minus the target median for each group in ``group_filters``."""
    dict_los_difference = {}
    for group_key, column_values in group_filters.items():
        dict_los_difference[group_key] = []
        for df_run in df_results_per_run:
            # Combine one equality mask per (column, value) pair.
            in_group = None
            for column, value in column_values.items():
                matches = df_run[column] == value
                in_group = matches if in_group is None else in_group & matches
            df_group = df_run[in_group]

            # A run with no patient in the group contributes no value.
            if len(df_group) > 0:
                los_difference = np.median(df_group['ed_los']) - np.median(target_results[group_key])
                dict_los_difference[group_key].append(los_difference)
    return dict_los_difference


def calculate_los_difference(df_results_per_run, target_results, indicator_type):
    """Compute, per run, the simulated median LOS minus the target median LOS.

    Args:
        df_results_per_run: Iterable of DataFrames, one per run, with an
            ``ed_los`` column plus the ``acuity`` / ``disposition`` /
            ``complexity`` columns required by ``indicator_type``.
        target_results: Target LOS values. A flat collection for ``'all'``,
            otherwise a mapping from group key to LOS values.
        indicator_type: One of ``'all'``, ``'acuity'``, ``'disposition'``,
            ``'complexity'``, ``'acuity-disposition'``, ``'acuity-complexity'``.

    Returns:
        Dict mapping group key to a list of differences, one per run. The key
        is ``'all'``, a level name, or ``'<acuity>_<second level>'``. Except
        for ``'all'``, runs with no rows in a group are skipped, so a list can
        be shorter than the number of runs.

    Raises:
        ValueError: If ``indicator_type`` is not one of the supported values
            (previously this surfaced as an UnboundLocalError).
    """
    if indicator_type == 'all':
        target_median = np.median(target_results)
        return {'all': [np.median(df_run['ed_los']) - target_median for df_run in df_results_per_run]}

    # Map every group key to the column values that select its rows.
    if indicator_type == 'acuity':
        group_filters = {acuity_val: {'acuity': acuity_val} for acuity_val in list_acuity}
    elif indicator_type == 'disposition':
        group_filters = {disposition_val: {'disposition': disposition_val} for disposition_val in list_disposition}
    elif indicator_type == 'complexity':
        group_filters = {complexity_val: {'complexity': complexity_val} for complexity_val in list_complexity}
    elif indicator_type == 'acuity-disposition':
        group_filters = {
            f'{acuity_val}_{disposition_val}': {'acuity': acuity_val, 'disposition': disposition_val}
            for acuity_val in list_acuity
            for disposition_val in list_disposition
        }
    elif indicator_type == 'acuity-complexity':
        group_filters = {
            f'{acuity_val}_{complexity_val}': {'acuity': acuity_val, 'complexity': complexity_val}
            for acuity_val in list_acuity
            for complexity_val in list_complexity
        }
    else:
        raise ValueError(f'Unknown indicator_type: {indicator_type!r}')

    return _median_differences_by_group(df_results_per_run, target_results, group_filters)

In [None]:
def plot_los_difference(dict_los_difference, sorted_key, sorted_labels, color, xlabel=None, title=None):
    """Draw one vertical box per group of LOS differences and show the figure.

    Args:
        dict_los_difference: Mapping group key -> list of LOS differences.
        sorted_key: Group keys in plotting order (left to right).
        sorted_labels: Tick labels, one per entry of ``sorted_key``.
        color: Colour of boxes, whiskers and caps (faces drawn at alpha 0.3).
        xlabel: Optional x-axis label.
        title: Optional figure title.

    The y-axis is fixed to [-6, 6] with reference lines at -1, 0 and 1.
    Outliers are not drawn.
    """
    samples = [dict_los_difference[key] for key in sorted_key]

    fig, axes = plt.subplots(1, 1, figsize=(0.4*len(sorted_key),1.5), sharey=True, sharex=False)
    fig.dpi = 600

    axes.boxplot(
        samples,
        vert=True,
        widths=0.2,
        patch_artist=True,
        showfliers=False,
        flierprops={'markersize': 2},
        medianprops={'color': '#D55E00'},
        boxprops={'color': color, 'facecolor': color},
        whiskerprops={'color': color},
        capprops={'color': color},
    )

    # Keep each box's RGB but make the fill translucent.
    for patch in axes.patches:
        red, green, blue, _alpha = patch.get_facecolor()
        patch.set_facecolor((red, green, blue, .3))

    default_grid_color = plt.rcParams['grid.color']
    for level, line_width, line_style in ((1, 0.5, '--'), (0, 1, '-'), (-1, 0.5, '--')):
        axes.axhline(level, color=default_grid_color, linewidth=line_width, zorder=0, linestyle=line_style)

    axes.set_ylim(-6, 6)
    axes.set_xticklabels(sorted_labels)
    axes.set_yticks(np.arange(-6,6.1,2))
    axes.tick_params(axis='both', which='both', labelsize='small')
    axes.set_ylabel('LOS difference', fontsize='small')
    axes.set_xlabel(xlabel, fontsize='small')
    axes.set_title(title, fontsize='small')
    plt.show()

In [None]:
def plot_los_difference_vert(dict_los_difference, sorted_key, sorted_labels, color, xlabel=None, title=None):
    """Draw one horizontal box per group, stacked top to bottom, and show it.

    Args:
        dict_los_difference: Mapping group key -> list of LOS differences.
        sorted_key: Group keys in display order (first key at the top).
        sorted_labels: Tick labels, one per entry of ``sorted_key``.
        color: Colour of boxes, whiskers and caps (faces drawn at alpha 0.3).
        xlabel: Optional label for the group axis. Because the boxes are
            horizontal, it is applied to the y-axis.
        title: Optional figure title.

    The x-axis is fixed to [-5, 5] with reference lines at -1, 0 and 1.
    Outliers are not drawn.
    """
    samples = [dict_los_difference[key] for key in sorted_key]
    # Boxes are drawn bottom-up, so reverse to show the first key at the top.
    samples_bottom_up = samples[::-1]
    labels_bottom_up = sorted_labels[::-1]

    fig, axes = plt.subplots(1, 1, figsize=(1.6,0.35*len(sorted_key)), sharey=True, sharex=False)
    fig.dpi = 600

    axes.boxplot(
        samples_bottom_up,
        vert=False,
        widths=0.35,
        patch_artist=True,
        showfliers=False,
        flierprops={'markersize': 2},
        medianprops={'color': '#D55E00'},
        boxprops={'color': color, 'facecolor': color},
        whiskerprops={'color': color},
        capprops={'color': color},
    )

    # Keep each box's RGB but make the fill translucent.
    for patch in axes.patches:
        red, green, blue, _alpha = patch.get_facecolor()
        patch.set_facecolor((red, green, blue, .3))

    default_grid_color = plt.rcParams['grid.color']
    for level, line_width, line_style in ((1, 0.5, ':'), (0, 1, '-'), (-1, 0.5, ':')):
        axes.axvline(level, color=default_grid_color, linewidth=line_width, zorder=0, linestyle=line_style)

    axes.set_xlim(-5, 5)
    axes.set_yticklabels(labels_bottom_up)
    axes.set_xticks(np.arange(-4,4.1,2))
    axes.tick_params(axis='both', which='both', labelsize='small')
    axes.set_xlabel('Relative median LOS', fontsize='small')
    axes.set_ylabel(xlabel, fontsize='small')
    axes.set_title(title, fontsize='small')
    plt.show()

In [None]:
# Sanity check: pooled (all seeds) median LOS minus the target median, per acuity.
for acuity_val in list_acuity:
    acuity_los = df_pro_concatenated.loc[df_pro_concatenated['acuity'] == acuity_val, 'ed_los']
    median_gap = np.median(acuity_los) - np.median(target_los_acuity[acuity_val])
    print(acuity_val, median_gap)

In [None]:
# Per-run median LOS difference by acuity, one figure per cohort.
cohort_runs_and_colors = ((df_pro_per_run, '#0072B2'), (df_ret_per_run, '#009E73'))
for df_cohort, color in cohort_runs_and_colors:
    dict_los_difference_acuity = calculate_los_difference(df_cohort, target_los_acuity, 'acuity')
    plot_los_difference_vert(
        dict_los_difference_acuity,
        list_acuity,
        ['1', '2', '3', '4', '5'],
        color,
        'Acuity',
        'All patients',
    )

In [None]:
# Per-run median LOS difference by disposition, one figure per cohort.
cohort_runs_and_colors = ((df_pro_per_run, '#0072B2'), (df_ret_per_run, '#009E73'))
for df_cohort, color in cohort_runs_and_colors:
    dict_los_difference_disposition = calculate_los_difference(df_cohort, target_los_disposition, 'disposition')
    plot_los_difference_vert(
        dict_los_difference_disposition,
        list_disposition,
        ['Home', 'Ward', 'ICU'],
        color,
        'Disposition',
        'All patients',
    )

In [None]:
# Per-run median LOS difference by complexity, one figure per cohort.
cohort_runs_and_colors = ((df_pro_per_run, '#0072B2'), (df_ret_per_run, '#009E73'))
for df_cohort, color in cohort_runs_and_colors:
    dict_los_difference_complexity = calculate_los_difference(df_cohort, target_los_complexity, 'complexity')
    plot_los_difference_vert(
        dict_los_difference_complexity,
        list_complexity,
        ['Low', 'Moderate', 'High'],
        color,
        'Complexity',
        'All patients',
    )

In [None]:
# Acuity x disposition: one figure per cohort and acuity level.
for df_cohort, color in zip([df_pro_per_run, df_ret_per_run], ['#0072B2', '#009E73']):
    # The result covers every acuity/disposition pair and does not depend on
    # the acuity being plotted, so compute it once per cohort.
    dict_los_difference_acuity_disposition = calculate_los_difference(df_cohort, target_los_acuity_disposition, 'acuity-disposition')

    # Iterate list_acuity (the levels calculate_los_difference builds keys
    # from) rather than a separate hard-coded copy.
    for acuity_val in list_acuity:
        sorted_key = [f'{acuity_val}_{disposition_val}' for disposition_val in list_disposition]
        plot_los_difference_vert(dict_los_difference_acuity_disposition, sorted_key, ['Home', 'Ward', 'ICU'], color, 'Disposition', f'Acuity {acuity_val}')

In [None]:
# Acuity x complexity: one figure per cohort and acuity level.
for df_cohort, color in zip([df_pro_per_run, df_ret_per_run], ['#0072B2', '#009E73']):
    # The result covers every acuity/complexity pair and does not depend on
    # the acuity being plotted, so compute it once per cohort.
    dict_los_difference_acuity_complexity = calculate_los_difference(df_cohort, target_los_acuity_complexity, 'acuity-complexity')

    # Iterate list_acuity (the levels calculate_los_difference builds keys
    # from) rather than a separate hard-coded copy.
    for acuity_val in list_acuity:
        sorted_key = [f'{acuity_val}_{complexity_val}' for complexity_val in list_complexity]
        plot_los_difference_vert(dict_los_difference_acuity_complexity, sorted_key, ['Low', 'Moderate', 'High'], color, 'Complexity', f'Acuity {acuity_val}')

In [None]:
def get_underrepresented_group(df_cohort, frequency_threshold):
    """Collect relative frequency and median LOS difference per two-factor group.

    For every acuity level, each acuity x disposition group and then each
    acuity x complexity group that has at least one row in ``df_cohort`` is
    reported.

    Args:
        df_cohort: DataFrame with ``acuity``, ``disposition``, ``complexity``
            and ``ed_los`` columns.
        frequency_threshold: Accepted but not used here. All non-empty groups
            are returned and the caller applies the threshold.

    Returns:
        Tuple of three parallel lists:
        group names (``'<acuity>_<level>'``), relative frequencies (size of
        the group's target sample divided by the size of the acuity level's
        target sample), and LOS differences (cohort median ``ed_los`` minus
        the group's target median).
    """
    group_names, relative_frequencies, los_differences = [], [], []

    for acuity_val in list_acuity:
        acuity_rows = df_cohort['acuity'] == acuity_val
        acuity_target_size = None

        # Dispositions first, then complexities, for each acuity level.
        second_factors = (
            ('disposition', list_disposition, target_los_acuity_disposition),
            ('complexity', list_complexity, target_los_acuity_complexity),
        )
        for column, levels, target_by_group in second_factors:
            for level in levels:
                df_group = df_cohort[acuity_rows & (df_cohort[column] == level)]
                if len(df_group) == 0:
                    continue

                group_key = f'{acuity_val}_{level}'
                group_names.append(group_key)

                if acuity_target_size is None:
                    acuity_target_size = len(target_los_acuity[acuity_val])
                relative_frequencies.append(len(target_by_group[group_key]) / acuity_target_size)
                los_differences.append(np.median(df_group['ed_los']) - np.median(target_by_group[group_key]))

    return (group_names, relative_frequencies, los_differences)

# Groups whose target sample is less than this fraction of their acuity
# level's target sample are listed below.
FREQUENCY_THRESHOLD = 0.15
pro_summary = get_underrepresented_group(df_pro_concatenated, FREQUENCY_THRESHOLD)
ret_summary = get_underrepresented_group(df_ret_concatenated, FREQUENCY_THRESHOLD)
name_pro, freq_pro, diff_pro = pro_summary
name_ret, freq_ret, diff_ret = ret_summary

# The rows below pair prospective and retrospective values by position.
assert set(name_pro) == set(name_ret), 'Patient groups names should be the same'
assert set(freq_pro) == set(freq_ret), 'Relative frequency should be the same'

# Columns: group name, relative frequency (%), prospective diff, retrospective diff.
for name_val, freq_val, pro_val, ret_val in zip(name_pro, freq_pro, diff_pro, diff_ret):
    if not freq_val < FREQUENCY_THRESHOLD:
        continue
    print(f'{name_val}\t{freq_val*100}\t{pro_val}\t{ret_val}')


In [None]:
def _split_abs_differences(freq_values, diff_values, threshold):
    """Split absolute LOS differences into (frequency > threshold, all others)."""
    high, low = [], []
    for freq_val, diff_val in zip(freq_values, diff_values):
        (high if freq_val > threshold else low).append(abs(diff_val))
    return high, low


# Absolute LOS differences of well-represented (high relative frequency) and
# under-represented (low relative frequency) patient groups, per cohort.
# NOTE(review): a frequency exactly equal to the threshold counts as 'low'
# here, whereas the printed listing only includes frequencies strictly below it.
pro_high, pro_low = _split_abs_differences(freq_pro, diff_pro, FREQUENCY_THRESHOLD)
ret_high, ret_low = _split_abs_differences(freq_ret, diff_ret, FREQUENCY_THRESHOLD)

pro_mean = [np.mean(pro_high), np.mean(pro_low)]
ret_mean = [np.mean(ret_high), np.mean(ret_low)]

# Each *_mean list is ordered [high, low]; label the rows accordingly
# (the labels were previously swapped).
cohort_col, frequency_col, value_col = [], [], []
for cohort_label, cohort_means in (('Prospective cohort', pro_mean), ('Retrospective cohort', ret_mean)):
    for frequency_label, mean_val in zip(('High', 'Low'), cohort_means):
        cohort_col.append(cohort_label)
        frequency_col.append(frequency_label)
        value_col.append(mean_val)

df_representation = pd.DataFrame({'Cohort type': cohort_col, 'Data frequency': frequency_col, 'Mean LOS difference': value_col})

cohort_palette = ['#0072B2', '#009E73']

fig, ax = plt.subplots(1, 1, figsize=(2,1.5), sharey=True, sharex=False)
fig.dpi = 600

# Categories are drawn in order of first appearance: 'High' first, then 'Low'.
ax = sns.barplot(data=df_representation, y='Data frequency', x='Mean LOS difference', hue='Cohort type', palette=cohort_palette, width=0.6, gap=0.1, zorder=3, ax=ax)
for container, label_color in zip(ax.containers, cohort_palette):
    ax.bar_label(container, fontsize=8, fmt='%.1f', color=label_color, padding=2)

sns.move_legend(ax, 'lower center', bbox_to_anchor=(.5, 1), ncol=2, title='Cohort type', frameon=False, columnspacing=2, handlelength=0.8, handletextpad=0.8, reverse=False, fontsize='small')

plt.xlabel('Mean of the absolute differences between the\ntarget and aggregated median LOS values', fontsize='small')
plt.xlim(0,4)
plt.xticks([0, 1, 2, 3, 4])
ax.set_xticks([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4], minor=True)

plt.ylabel('')
ax.set_yticks(range(2))
# Group counts are taken from the data instead of being hard-coded. The
# prospective lists are used; the relative frequencies are asserted to match
# between cohorts where they are computed.
ax.set_yticklabels([
    f'Well-represented\npatient groups\n(n = {len(pro_high)})',
    f'Under-represented\npatient groups\n(n = {len(pro_low)})',
])

plt.grid(axis='x', which='minor', linewidth=0.5)
plt.grid(axis='x', which='major', linewidth=0.5)

plt.tick_params(axis='both', which='both', labelsize='small')
legend = ax.get_legend()
title = legend.get_title()
title.set_fontsize('small')
ax.grid(True, axis='x', which='both', linestyle=':')

# Draw the axes frame at the same z-order as the bars.
for spine in ax.spines.values():
    spine.set_zorder(3)

plt.show()