In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# File locations here
# All three paths are relative to the directory the notebook is run from.
# Later cells use them as follows:
#   mimicNoteFile - source of the note_id -> subject_id / hadm_id lookup
#   mimicFile     - ICD diagnosis codes (filtered to ICD-10 primary codes)
#   mimicBHCFile  - the notes themselves, joined to the lookup on note_id
mimicNoteFile = './data/discharge.csv' # From MIMIC-Note
mimicFile = './data/diagnoses_icd.csv' # From MIMIC 2.2
mimicBHCFile = './data/mimic-iv-bhc.csv' # From MIMIC-Ext-BHC

In [3]:
# Build the note_id -> (subject_id, hadm_id) lookup from the MIMIC-Note discharge file.
# Only the three ID columns are parsed (usecols); the remaining columns of the file
# (presumably the full note text - TODO confirm) are never loaded into memory.
discharge = mimicNoteFile
note_key_columns = ['note_id', 'subject_id', 'hadm_id']
discharge_df = pd.read_csv(discharge, usecols=note_key_columns)

# Re-select explicitly: usecols keeps the file's column order, not the list's order.
combining_df = discharge_df[note_key_columns]

In [4]:
# Load the diagnosis codes table
diagnoses_icd = mimicFile
icd_df = pd.read_csv(diagnoses_icd)

# Keep ICD-10 rows only (drops ICD-9) and, of those, only the primary code (seq_num == 1)
is_icd10_primary = (icd_df['icd_version'] == 10) & (icd_df['seq_num'] == 1)
icd_df = icd_df.loc[is_icd10_primary]

# The 50 most frequent primary codes; the index holds the codes, the values their counts
top50_codes = icd_df['icd_code'].value_counts().iloc[:50]

# hadm_id of every admission whose primary code is one of the top 50
in_top50 = icd_df['icd_code'].isin(top50_codes.index)
hadm_ids_to_keep = icd_df.loc[in_top50, 'hadm_id']

# Keep only those admissions, then drop seq_num and icd_version: after the filter
# above every remaining row has seq_num == 1 and icd_version == 10, so both are redundant
icd_df = (
    icd_df
    .loc[icd_df['hadm_id'].isin(hadm_ids_to_keep)]
    .drop(columns=['seq_num', 'icd_version'])
)


In [5]:
mimic = mimicBHCFile
discharge_df = (
    pd.read_csv(mimic)
    # Inner merge MIMIC-Ext-BHC with the MIMIC-Note lookup so that every note
    # gains its subject_id and hadm_id (notes without a match are dropped)
    .merge(combining_df, on='note_id', how='inner')
    # Keep only notes whose admission has a top-50 primary code
    .loc[lambda notes: notes['hadm_id'].isin(hadm_ids_to_keep)]
)

# Attach the ICD code to each note by matching on admission and patient
final_df = pd.merge(discharge_df, icd_df, on=['hadm_id', 'subject_id'], how='inner')

In [9]:
# Write the top-50 ICD codes to disk, one code per line (no header, no index column).
top50_path = Path('./data/processed/top50.csv')
# to_csv does not create missing directories, so make sure the output folder exists first.
top50_path.parent.mkdir(parents=True, exist_ok=True)
top50_codes.index.to_series().to_csv(top50_path, index=False, header=False)

In [10]:
# Removing underscores from the 'target' text to prevent the LLM from misinterpreting them.
# Work on a copy so final_df itself is left untouched.
no_underscores_df = final_df.copy()
no_underscores_df['target'] = no_underscores_df['target'].str.replace('_', '', regex=False)

# to_csv does not create missing directories, so make sure the output folder exists first.
data_path = Path('./data/processed/data.csv')
data_path.parent.mkdir(parents=True, exist_ok=True)
no_underscores_df.to_csv(data_path, index=False)