In [None]:
import numpy as np
import os
import pandas as pd

from tqdm import tqdm

In [None]:
# Load the ed/edstays table, keyed by stay_id; identifier-like columns are read as strings
edstays_dtypes = {
    'stay_id': str,
    'subject_id': str,
    'arrival_transport': str,
    'disposition': str,
    'hadm_id': str,
}
df_edstays = pd.read_csv('data/ed/edstays.csv', dtype=edstays_dtypes, index_col='stay_id')

# Parse both stay timestamps into datetimes
for time_col in ('intime', 'outtime'):
    df_edstays[time_col] = pd.to_datetime(df_edstays[time_col])

# Length of stay in hours (outtime - intime)
stay_duration = df_edstays['outtime'] - df_edstays['intime']
df_edstays['los'] = stay_duration.dt.total_seconds() / (60 * 60)

In [None]:
# Load the hosp/patients table, keyed by subject_id
patients_dtypes = {'subject_id': str, 'anchor_age': int, 'anchor_year': int}
df_patients = pd.read_csv(
    'data/hosp/patients.csv',
    index_col='subject_id',
    dtype=patients_dtypes,
)

# Per-patient lookup: subject_id -> {column name: value}
dict_patients = df_patients.to_dict(orient='index')

In [None]:
# Add patient age to ed/edstays table:
#   age = (year of intime) - anchor_year + anchor_age
# where anchor_year / anchor_age are looked up in hosp/patients by subject_id.
subject_ids = df_edstays['subject_id']

# Fail fast when a stay refers to a subject with no row in hosp/patients;
# Series.map would otherwise turn the missing lookup into a silent NaN age.
unknown_subjects = ~subject_ids.isin(df_patients.index)
if unknown_subjects.any():
    raise KeyError(subject_ids[unknown_subjects].iloc[0])

val_admityear = df_edstays['intime'].dt.year
val_anchoryear = subject_ids.map(df_patients['anchor_year'])
val_anchorage = subject_ids.map(df_patients['anchor_age'])

df_edstays['age'] = val_admityear - val_anchoryear + val_anchorage

In [None]:
# Load icu/icustays and keep its hadm_id column.
# Despite the "list_" prefix this is a pandas Series, not a Python list.
df_icustays = pd.read_csv('data/icu/icustays.csv', dtype={'hadm_id': str})
list_icustays_hadm_id = df_icustays.loc[:, 'hadm_id']

In [None]:
# Relabel ADMITTED disposition to either WARD or ICU
# Step 1: every ADMITTED stay becomes WARD.
is_admitted = df_edstays['disposition'] == 'ADMITTED'
df_edstays.loc[is_admitted, 'disposition'] = 'WARD'

# Step 2: every stay whose hadm_id appears in icu/icustays becomes ICU.
# NOTE(review): this overrides whatever disposition the stay had, not only
# the ones relabelled to WARD above - confirm that is intended.
has_icu_stay = df_edstays['hadm_id'].isin(list_icustays_hadm_id)
df_edstays.loc[has_icu_stay, 'disposition'] = 'ICU'

df_edstays

In [None]:
# Load the ed/triage table, keyed by stay_id
df_triage = pd.read_csv('data/ed/triage.csv', dtype={'stay_id': str, 'acuity': str}, index_col='stay_id')

# Normalise acuity to an integer-valued string (e.g. '3.0' -> '3');
# missing acuity is encoded as '-1'
acuity_as_float = df_triage['acuity'].fillna('-1').astype(float)
df_triage['acuity'] = acuity_as_float.astype(int).astype(str)

df_triage

In [None]:
# Load the ed/diagnosis table, keyed by stay_id, with all code columns read as strings
diagnosis_dtypes = {
    'stay_id': str,
    'seq_num': str,
    'icd_code': str,
    'icd_version': str,
}
df_diagnosis = pd.read_csv('data/ed/diagnosis.csv', dtype=diagnosis_dtypes, index_col='stay_id')

# Keep only the primary diagnosis (seq_num '1') of each stay
is_primary_diagnosis = df_diagnosis['seq_num'] == '1'
df_diagnosis = df_diagnosis[is_primary_diagnosis]

df_diagnosis

In [None]:
# Keep only the columns of each table that are used in the study
edstays_columns = ['subject_id', 'age', 'arrival_transport', 'disposition', 'los']
triage_columns = ['acuity']
diagnosis_columns = ['icd_code', 'icd_version']

df_edstays = df_edstays.loc[:, edstays_columns]
df_triage = df_triage.loc[:, triage_columns]
df_diagnosis = df_diagnosis.loc[:, diagnosis_columns]

In [None]:
# Merge the three tables on their shared stay_id index.
# Left joins keep every ed/edstays row even without triage or diagnosis data.
df_patientdata = (
    df_edstays
    .merge(df_triage, how='left', left_index=True, right_index=True)
    .merge(df_diagnosis, how='left', left_index=True, right_index=True)
)

df_patientdata

In [None]:
# Limit the study to stays with a length of stay of at most 72 hours
# (rows with a missing LOS fail the comparison and are dropped as well)
within_72_hours = df_patientdata['los'].le(72)
df_patientdata = df_patientdata.loc[within_72_hours].copy()

df_patientdata

In [None]:
# Keep only stays with a valid acuity level (1-5) and a disposition of HOME, WARD or ICU
valid_acuity = df_patientdata['acuity'].isin(['1', '2', '3', '4', '5'])
valid_disposition = df_patientdata['disposition'].isin(['HOME', 'WARD', 'ICU'])
df_patientdata = df_patientdata[valid_acuity & valid_disposition].copy()

df_patientdata

In [None]:
# Drop stays that have no primary diagnosis code
has_icd_code = df_patientdata['icd_code'].notna()
df_patientdata = df_patientdata[has_icd_code].copy()

df_patientdata

In [None]:
# Load mapping for ICD-9 to ICD-10
# (input file obtained from https://www.cms.gov/medicare/coordination-benefits-recovery/overview/icd-code-lists)
# (output file generated using https://github.com/ClancyClark/ICD9to10mapping)
icd9_mapping_raw = pd.read_csv('data/others/icd9_to_icd10.csv', dtype={'ICD9': str, 'ICD10': str, 'FLAG': str})

# Select one-to-one mappings: keep the two accepted FLAG values, then discard
# every ICD-9 code that still appears on more than one row
accepted_flag = icd9_mapping_raw['FLAG'].isin(['00000', '10000'])
df_icd9_to_icd10 = (
    icd9_mapping_raw[accepted_flag]
    .drop(columns=['FLAG'])
    .drop_duplicates(subset=['ICD9'], keep=False)
    .set_index('ICD9')
)

# Lookup: ICD-9 code -> {'ICD10': code}
dict_idc9_to_icd10 = df_icd9_to_icd10.to_dict(orient='index')

In [None]:
# Load mapping for ICD-10 to ICD-10-AM
# (based on the 12th edition mapping table from https://www.ihacpa.gov.au/resources/icd-10-am-and-achi-mapping-tables)
df_icd10_to_icd10am = pd.read_csv('data/others/icd10_to_icd10am.csv')

# Strip the dots from both code columns
for code_col in ('ICD-10', 'ICD-10-AM'):
    df_icd10_to_icd10am[code_col] = df_icd10_to_icd10am[code_col].str.replace('.', '', regex=False)

# Remove one-to-many mappings (any ICD-10 code listed more than once), then key by ICD-10 code
df_icd10_to_icd10am = (
    df_icd10_to_icd10am
    .drop_duplicates(subset='ICD-10', keep=False)
    .set_index('ICD-10')
)

# Lookup: ICD-10 code -> {column name: value}, including 'ICD-10-AM'
dict_icd10_to_icd10am = df_icd10_to_icd10am.to_dict(orient='index')

In [None]:
# Relabel icd_code with ICD-10-AM codes
def _longest_prefix_key(code, mapping):
    """Return the longest of code[:5], code[:4], code[:3] that is a key of mapping.

    Returns None when none of the three prefixes is a key. Membership is tested
    against the dict itself, so each test is a hash lookup rather than a list scan.
    """
    for length in (5, 4, 3):
        prefix = code[:length]
        if prefix in mapping:
            return prefix
    return None


icd10am_col = []
icd_rows = zip(df_patientdata['icd_code'], df_patientdata['icd_version'])

for icd_idx, icd_ver in tqdm(icd_rows, total=len(df_patientdata)):
    # If no entry for ICD code, retain as nan
    if pd.isna(icd_idx):
        icd10am_col.append(np.nan)
        continue

    # If ICD code version is 9, convert to ICD-10 first
    if icd_ver == '9':
        icd9_key = _longest_prefix_key(icd_idx, dict_idc9_to_icd10)
        if icd9_key is None:
            # No usable ICD-9 -> ICD-10 mapping for this code
            icd10am_col.append(np.nan)
            continue
        icd_idx = dict_idc9_to_icd10[icd9_key]['ICD10']

    # Convert ICD-10 to ICD-10-AM
    icd10_key = _longest_prefix_key(icd_idx, dict_icd10_to_icd10am)
    if icd10_key is None:
        icd10am_col.append(np.nan)
    else:
        icd10am_col.append(dict_icd10_to_icd10am[icd10_key]['ICD-10-AM'])

# Unmapped codes are left as nan
df_patientdata.loc[:, 'icd_code'] = icd10am_col

df_patientdata

In [None]:
# Drop stays whose diagnosis code could not be mapped to ICD-10-AM
has_icd10am_code = df_patientdata['icd_code'].notna()
df_patientdata = df_patientdata[has_icd10am_code].copy()

df_patientdata

In [None]:
# Load the AECC mapping table, strip the dots from its icd10 codes and key it by them
df_aecc_mapping = (
    pd.read_csv('data/others/aecc_to_icd10.csv')
    .assign(icd10=lambda table: table['icd10'].str.replace('.', '', regex=False))
    .set_index('icd10')
)

# Lookup: icd10 code -> {column name: value}
dict_aecc_mapping = df_aecc_mapping.to_dict(orient='index')

In [None]:
# Look up the ECDG code and ECDG subgroup of every stay from its icd_code.
# Only the first whitespace-separated token of each mapped value is kept;
# codes without an entry in the AECC mapping get nan in both columns.
ecdg_col, ecdgsub_col = [], []
for icd_code in tqdm(df_patientdata['icd_code']):
    aecc_entry = dict_aecc_mapping.get(icd_code)

    if aecc_entry is None:
        ecdg_col.append(np.nan)
        ecdgsub_col.append(np.nan)
        continue

    ecdg_col.append(aecc_entry['ecdg_code'].split()[0])
    ecdgsub_col.append(aecc_entry['ecdg_subgroup'].split()[0])

df_patientdata.loc[:, 'ecdg'] = ecdg_col
df_patientdata.loc[:, 'ecdg_subgroup'] = ecdgsub_col

In [None]:
# Drop stays that lack an ECDG code or an ECDG subgroup
has_ecdg_labels = df_patientdata[['ecdg', 'ecdg_subgroup']].notna().all(axis=1)
df_patientdata = df_patientdata[has_ecdg_labels].copy()

df_patientdata

In [None]:
# Add complexity label to each records based on the Australian Emergency Care Classification
# (this part loads the lookup tables: coefficients, subgroup intercepts and class thresholds)

# Coefficients, keyed by ecdg_code
df_aecc_coefficients = pd.read_csv('data/others/aecc_coefficients.csv', index_col='ecdg_code')
dict_aecc_coefficients = df_aecc_coefficients.to_dict(orient='index')

# Subgroup intercepts, re-keyed by the first whitespace-separated token of ecdg_subgroup
df_aecc_subgroup_intercept = pd.read_csv('data/others/aecc_subgroup_intercept.csv', index_col='ecdg_subgroup')
dict_aecc_subgroup_intercept_temp = df_aecc_subgroup_intercept.to_dict(orient='index')
dict_aecc_subgroup_intercept = {
    subgroup_label.split()[0]: subgroup_row
    for subgroup_label, subgroup_row in dict_aecc_subgroup_intercept_temp.items()
}

# Score thresholds, regrouped as ecdg_code -> aecc_class -> {'min_score', 'max_score'}
df_aecc_threshold = pd.read_csv('data/others/aecc_threshold.csv', index_col='aecc_class')
dict_aecc_threshold_temp = df_aecc_threshold.to_dict(orient='index')
dict_aecc_threshold = {}
for aecc_class, threshold_row in dict_aecc_threshold_temp.items():
    ecdg_code = threshold_row['ecdg_code'].split()[0]
    class_bounds = {'min_score': threshold_row['min_score'], 'max_score': threshold_row['max_score']}
    dict_aecc_threshold.setdefault(ecdg_code, {})[aecc_class] = class_bounds

# Score every stay and turn the score into a complexity label.
#
# For each stay: build 0/1 indicator variables from disposition, acuity, arrival
# transport and age; combine them with the stay's ECDG coefficients and subgroup
# intercept into predicted_value; transform that into complexity_score; then find
# the AECC class of the stay's ECDG whose [min_score, max_score) band contains
# the score.

# Final letter of the AECC class code -> complexity label.
# A letter that is not listed here is kept unchanged as the label.
complexity_by_suffix = {'A': 'HIGH', 'Z': 'HIGH', 'B': 'MODERATE', 'C': 'LOW', 'D': 'LOW'}

complexity_col = []
for stay_idx in tqdm(df_patientdata.index):
    patient_info = df_patientdata.loc[stay_idx]
    val_age = patient_info['age']
    val_acuity = patient_info['acuity']
    val_disposition = patient_info['disposition']

    # End-status indicators. No disposition value is mapped to "died" or "left",
    # so those two indicators are always 0.
    depst_admitted = int(val_disposition in ['WARD', 'ICU'])
    depst_died = 0
    depst_left = 0
    depst_referred = int(val_disposition in ['TRANSFER'])

    # Triage indicators; any acuity other than '1'-'4' sets none of them
    triag1 = int(val_acuity == '1')
    triag2 = int(val_acuity == '2')
    triag3 = int(val_acuity == '3')
    triag4 = int(val_acuity == '4')

    # Arrival-mode indicator
    transmodB_1 = int(patient_info['arrival_transport'] in ['AMBULANCE', 'HELICOPTER'])

    # Age-band indicators (bands do not overlap; ages below 5 set none of them)
    age0509 = int(5 <= val_age <= 9)
    age1014 = int(10 <= val_age <= 14)
    age1569 = int(15 <= val_age <= 69)
    age7074 = int(70 <= val_age <= 74)
    age7579 = int(75 <= val_age <= 79)
    age8084 = int(80 <= val_age <= 84)
    age85 = int(85 <= val_age)

    # Interaction indicators: only non-zero for admitted stays
    depst_admitted_age_0014 = depst_admitted * int(0 <= val_age <= 14)
    depst_admitted_age_80 = depst_admitted * int(80 <= val_age)
    depst_admitted_triage_1 = depst_admitted * triag1
    depst_admitted_triage_2 = depst_admitted * triag2
    depst_admitted_triage_3 = depst_admitted * triag3
    depst_admitted_triage_4 = depst_admitted * triag4

    patient_coefficients = dict_aecc_coefficients[patient_info['ecdg']]

    # (coefficient column, indicator) pairs, in the order they are added up
    model_terms = [
        ('endstatus_admitted', depst_admitted),
        ('endstatus_died', depst_died),
        ('endstatus_left', depst_left),
        ('endstatus_referred', depst_referred),
        ('arrival_ambulance', transmodB_1),
        ('triage_1', triag1),
        ('triage_2', triag2),
        ('triage_3', triag3),
        ('triage_4', triag4),
        ('age_05_09', age0509),
        ('age_10_14', age1014),
        ('age_15_69', age1569),
        ('age_70_74', age7074),
        ('age_75_79', age7579),
        ('age_80_84', age8084),
        ('age_85+', age85),
        ('admitted_age_0_14', depst_admitted_age_0014),
        ('admitted_age_80+', depst_admitted_age_80),
        ('admitted_triage_1', depst_admitted_triage_1),
        ('admitted_triage_2', depst_admitted_triage_2),
        ('admitted_triage_3', depst_admitted_triage_3),
        ('admitted_triage_4', depst_admitted_triage_4),
    ]

    # Accumulate term by term in a fixed order (rather than with sum()) so the
    # floating-point result does not depend on the summation algorithm.
    # NOTE(review): 5.902, 713, 166 and 3.26 are presumably constants of the
    # AECC model - confirm against the specification.
    predicted_value = 5.902
    for coefficient_name, indicator in model_terms:
        predicted_value += patient_coefficients[coefficient_name] * indicator
    predicted_value += dict_aecc_subgroup_intercept[patient_info['ecdg_subgroup']]['intercept']

    complexity_score = ((np.exp(predicted_value) - 713) / 166) + 3.26

    # Reset the label for every stay: when no band contains the score the stay
    # is labelled nan instead of silently reusing the previous stay's label.
    # NOTE(review): a nan min_score/max_score in the threshold table never
    # matches here - confirm how open-ended bands are encoded in the CSV.
    complexity_val = np.nan
    for aecc_class, bounds in dict_aecc_threshold[patient_info['ecdg']].items():
        if bounds['min_score'] <= complexity_score < bounds['max_score']:
            class_suffix = aecc_class[-1]
            complexity_val = complexity_by_suffix.get(class_suffix, class_suffix)
            break

    complexity_col.append(complexity_val)

df_patientdata.loc[:, 'complexity'] = complexity_col
df_patientdata

In [None]:
# Retain the relevant patient information needed for the study;
# reset_index() turns the index back into a regular column
study_columns = ['subject_id', 'acuity', 'disposition', 'complexity', 'los']
df_patientdata = df_patientdata.loc[:, study_columns].reset_index()
df_patientdata

In [None]:
# Save preprocessed patient records that will be used for the study
outpath = 'data/preprocessed'

# Create the output directory if needed; exist_ok avoids the separate
# exists-then-create check and is a no-op when the directory is already there
os.makedirs(outpath, exist_ok=True)

df_patientdata.to_csv(f'{outpath}/patient_data.csv', index=False)

In [None]:
# Plot frequency distribution of disposition and complexity per acuity
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
# Use a sans-serif font family for all figures, with Helvetica as the sans-serif face
font_settings = {'family': 'sans-serif', 'sans-serif': ['Helvetica']}
rc('font', **font_settings)

In [None]:
# Share of each disposition within every acuity level (bars are filled to 100%)
palette = {
    'HOME': '#009E73',
    'WARD': '#56B4E9',
    'ICU': '#E69F00',
}
fig, ax = plt.subplots(1, 1, figsize=(2, 2))
fig.set_dpi(600)

ax = sns.histplot(
    data=df_patientdata.sort_values(by=['acuity']),
    x='acuity',
    hue='disposition',
    hue_order=['HOME', 'WARD', 'ICU'],
    stat='percent',
    multiple='fill',
    shrink=0.8,
    alpha=0.5,
    palette=palette,
    ax=ax,
)

# Place the legend outside the axes, to the right
sns.move_legend(
    ax,
    'upper left',
    bbox_to_anchor=(1, 1.05),
    ncol=1,
    title='Disposition',
    frameon=False,
    columnspacing=0.9,
    handlelength=0.8,
    handletextpad=0.5,
)

ax.set_xlabel('Acuity')
# The filled bars span 0-1; label the ticks as percentages
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1])
ax.set_yticklabels(['0', '20', '40', '60', '80', '100'])
plt.show()

In [None]:
# Share of each complexity label within every acuity level (bars are filled to 100%)
palette = {
    'HIGH': '#CC79A7',
    'MODERATE': '#D55E00',
    'LOW': '#0072B2',
}
fig, ax = plt.subplots(1, 1, figsize=(2, 2))
fig.set_dpi(600)

ax = sns.histplot(
    data=df_patientdata.sort_values(by=['acuity']),
    x='acuity',
    hue='complexity',
    hue_order=['HIGH', 'MODERATE', 'LOW'],
    stat='percent',
    multiple='fill',
    palette=palette,
    shrink=0.8,
    alpha=0.5,
    ax=ax,
)

# Place the legend outside the axes, to the right, with its entries in reverse order
sns.move_legend(
    ax,
    'upper left',
    bbox_to_anchor=(1, 1.05),
    ncol=1,
    title='Complexity',
    frameon=False,
    columnspacing=0.9,
    handlelength=0.8,
    handletextpad=0.5,
    reverse=True,
)

ax.set_xlabel('Acuity')
# The filled bars span 0-1; label the ticks as percentages
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1])
ax.set_yticklabels(['0', '20', '40', '60', '80', '100'])
plt.show()