In [11]:
import pandas as pd
import numpy as np

In [12]:
# Directory containing the input CSV files. File paths below are built by plain
# string concatenation (f + '<name>.csv'), so the trailing slash is required.
f = "../../data/"

In [13]:
# Identify acute myocardial infarction (AMI) diagnoses.
#
# ICD codes are identifiers, not numbers, so they are read explicitly as
# strings in both tables. Letting pandas infer the dtype can parse
# numeric-looking codes as numbers, which breaks the `.str` prefix test below
# (non-string values yield NaN) and the join to the ICD dictionary.
df_diag = pd.read_csv(f + 'diagnoses_icd.csv', dtype={'icd_code': str})
df_icd = pd.read_csv(f + 'd_icd_diagnoses.csv', dtype={'icd_code': str})

# Merge to get the text description (informational only; it is not used for
# the selection below).
df_diag = df_diag.merge(df_icd, on=['icd_code', 'icd_version'], how='left')

# Select ICD-9 = 410.* or ICD-10 = I21.*
# `na=False` makes rows with a missing code evaluate to False instead of NaN,
# so the combined mask is always a clean boolean Series.
df_ami = df_diag[
    ((df_diag['icd_version'] == 9) & df_diag['icd_code'].str.startswith('410', na=False)) |
    ((df_diag['icd_version'] == 10) & df_diag['icd_code'].str.startswith('I21', na=False))
]

In [14]:
# Load the admission-level and patient-level tables.
df_adm = pd.read_csv(f + 'admissions.csv')
df_pat = pd.read_csv(f + 'patients.csv')

# Link every AMI diagnosis row to its hospital stay and to the patient's
# demographics. Both joins are left joins validated as many-to-one: pandas
# raises if the right-hand table has duplicate keys, so diagnosis rows can
# never be multiplied by these merges.
# The final step keeps adults only (anchor_age >= 18); rows whose anchor_age
# is missing after the left join fail the comparison and are dropped too.
# NOTE(review): the original comment said this threshold is "to account for
# anchor_age being shifted" -- confirm that anchor_age (rather than an age
# computed at admission time) is the intended adult criterion.
df_merged = (
    df_ami
    .merge(df_adm, how='left', on=['subject_id', 'hadm_id'], validate='m:1')
    .merge(df_pat, how='left', on='subject_id', validate='m:1')
    .loc[lambda d: d['anchor_age'] >= 18]
)

In [None]:
df_icu = pd.read_csv(f + 'icustays.csv')

# Merge ICU info. The inner join keeps only AMI admissions that have at least
# one ICU stay. The relationship is many-to-many (df_merged holds one row per
# AMI diagnosis row, and icustays.csv is joined on admission keys, not on a
# stay key), so every diagnosis row is paired with every ICU stay of that
# admission; the de-duplication below collapses this to one row per patient.
df_icu_merged = df_merged.merge(
    df_icu,
    on=['hadm_id', 'subject_id'],
    how='inner',
    validate='m:m'
)

# Keep the first ICU stay per patient (earliest `intime`).
#
# This deliberately uses drop_duplicates instead of groupby(...).first():
# GroupBy.first() returns the first *non-null* value of each column
# independently, so any column that is empty on the earliest row would be
# silently filled from a later stay/admission of the same patient, producing
# a record that mixes several stays. drop_duplicates keeps the earliest row
# intact, nulls included.
df_icu_merged = (
    df_icu_merged.sort_values(['subject_id', 'intime'])
    .drop_duplicates(subset='subject_id', keep='first')
    # Keep subject_id as the leading column, matching the layout previously
    # produced by groupby(...).first().reset_index().
    .pipe(lambda d: d[['subject_id'] + [c for c in d.columns if c != 'subject_id']])
    .reset_index(drop=True)
)

In [22]:
# Lab measurements
#
# Load the lab item dictionary (itemid -> label) and the lab events, keeping
# only the columns listed in `usecols`. No filtering by test name is applied
# at this stage: every lab item present in the dictionary is retained.
df_dlab = pd.read_csv(f + 'd_labitems.csv', usecols=['itemid', 'label'])
df_lab = pd.read_csv(
    f + 'labevents.csv',
    usecols=['subject_id', 'hadm_id', 'itemid', 'charttime', 'value', 'valuenum'],
)

# Attach the human-readable label to every lab event. The inner join discards
# events whose itemid has no entry in the dictionary.
df_lab = pd.merge(df_lab, df_dlab, how='inner', on='itemid')

In [23]:
# Build a wide table with one row per (subject_id, hadm_id) and one column per
# lab label, each cell holding the mean numeric value (`valuenum`) of that lab
# over the admission.
# NOTE(review): grouping is by label text rather than itemid, so if several
# itemids share the same label their values are averaged together -- confirm
# that this is intended.
labs_pivot = (
    df_lab
    .groupby(['subject_id', 'hadm_id', 'label'])['valuenum']
    .agg('mean')
    .unstack(level='label')  # spread labels into columns
    .reset_index()
)

In [44]:
# Attach the per-admission lab means to the ICU cohort (left join, so cohort
# rows without any labs survive the merge), then keep only rows where all
# three counts needed to calculate SIRI are present.
df_final = (
    pd.merge(df_icu_merged, labs_pivot, how='left', on=['subject_id', 'hadm_id'])
    .dropna(
        subset=[
            'Absolute Lymphocyte Count',
            'Absolute Monocyte Count',
            'Absolute Neutrophil Count',
        ]
    )
)

print("Final cohort size:", len(df_final))

Final cohort size: 3520


In [56]:
# Remove columns that contain no data at all for the final cohort (every value
# missing); columns with at least one non-missing value are kept.
df_final = df_final.dropna(axis='columns', how='all')

In [60]:
# Persist the final cohort next to the input data; the DataFrame index is not
# written to the file.
df_final.to_csv(path_or_buf=f + 'final_cohort.csv', index=False)