In [None]:
import re
from pathlib import Path

import pandas as pd
from IPython.display import display

In [None]:
# Seed passed to any random draw that should be reproducible.
RANDOM_SEED = 23

# Data lives in a "data" directory that is a sibling of the current working directory.
DATA_DIR = Path.cwd().parent.joinpath("data")
MIMIC_DIR = DATA_DIR.joinpath("physionet.org", "files")

# Per-dataset locations underneath the shared MIMIC directory.
MIMIC_III_DIR = MIMIC_DIR.joinpath("mimiciii", "1.4")
MIMIC_IV_DIR = MIMIC_DIR.joinpath("mimiciv", "2.2", "note")

# Destination for the cleaned DataFrame.
CLEAN_DF_SAVE_PATH = DATA_DIR.joinpath("clean_df.pkl")

## Read in MIMIC III notes

In [None]:
# Load the NOTEEVENTS table from the MIMIC-III directory into a single DataFrame.
full_df = pd.read_csv(MIMIC_III_DIR / "NOTEEVENTS.csv")

### Preprocessing

Remove error and duplicate rows

In [None]:
# Remove notes flagged as errors, then the flag column itself, then exact
# duplicate rows. The steps are chained without `inplace=True` so that no
# filtered slice of the original frame is mutated in place.
full_df = (
    full_df.loc[full_df["ISERROR"] != 1]
    .drop(columns="ISERROR")
    .drop_duplicates()
)
full_df.head()

In [None]:
# (number of notes, number of distinct HADM_ID values — a missing HADM_ID counts as one value)
full_df.shape[0], full_df["HADM_ID"].nunique(dropna=False)

Keep only physician notes and discharge summary reports

In [None]:
# List the distinct CATEGORY values so the filter below can match them exactly.
full_df["CATEGORY"].unique()

In [None]:
# Physician notes are kept as-is (the category value carries a trailing space).
is_physician_note = full_df["CATEGORY"].eq("Physician ")
# Discharge summaries are kept only when their description is "Report".
is_discharge_report = full_df["CATEGORY"].eq("Discharge summary") & full_df[
    "DESCRIPTION"
].eq("Report")

full_df = full_df[is_physician_note | is_discharge_report]
print(len(full_df))

Group by HADM_ID and keep only the admissions that have both a discharge summary and a physician note

In [None]:
# Keep an admission only if its notes span exactly two distinct CATEGORY values.
grouped_df = full_df.groupby("HADM_ID")
df = grouped_df.filter(
    lambda admission_notes: admission_notes["CATEGORY"].nunique(dropna=False) == 2
)

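Clean the text, fill missing CHARTTIME values with the CHARTDATE at 23:59:59 (to allow sorting by time), then sort by HADM_ID and CHARTTIME so each admission's notes sit together in time order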

In [None]:
def clean_text(text: str) -> str:
    """Normalise newline-related whitespace in a note.

    The substitutions are applied in this order:

    1. a "." alone between two newlines is removed, leaving a blank line;
    2. a newline followed by two or more spaces becomes a bare newline;
    3. a run of three or more newlines becomes exactly two.

    Args:
        text: Raw note text.

    Returns:
        The text after all three substitutions.
    """
    substitutions = (
        (r"\n\.\n", r"\n\n"),
        (r"\n {2,}", "\n"),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


# Run every note through clean_text; the function is passed directly, no wrapper needed.
df["TEXT"] = df["TEXT"].apply(clean_text)

In [None]:
# Rows with no CHARTTIME fall back to their own CHARTDATE at 23:59:59.
# NOTE(review): assumes both columns are ISO-formatted strings, in which case
# these rows sort after the timestamped notes of the same day — confirm.
# The fill value comes from df's own CHARTDATE column rather than full_df's,
# so the result does not depend on index alignment with a different frame.
df["CHARTTIME"] = df["CHARTTIME"].fillna(df["CHARTDATE"] + " 23:59:59")

# Order notes by admission, then by time, and renumber the rows from zero.
df = df.sort_values(by=["HADM_ID", "CHARTTIME"]).reset_index(drop=True)
len(df), len(df["HADM_ID"].unique())

In [None]:
def extract_summary_sections(row: pd.Series) -> list[str]:
    """Split the "Brief Hospital Course" part of a discharge summary into sections.

    The text between the "Brief Hospital Course:" and "Medications on
    Admission:" header lines is split at every blank line that is followed by
    a line containing a colon. A leading prefix made of digits or "#" (with
    optional ")", "|" or "." and trailing spaces) is stripped from every
    section except the first.

    Args:
        row: A record exposing "CATEGORY" and "TEXT" entries.

    Returns:
        The list of sections, or an empty list when the row is not a discharge
        summary, the two headers are not both found, or the text yields only a
        single section.
    """
    if row["CATEGORY"] != "Discharge summary":
        return []

    # Lazily capture everything between the two header lines.
    course_start = r"\nBrief Hospital Course:\n"
    course_end = r"\nMedications on Admission:\n"
    course = re.search(
        course_start + "(.*?)" + course_end, row["TEXT"], flags=re.DOTALL
    )
    if course is None:
        return []

    # re.split always returns at least one element, so the unpacking is safe.
    first_section, *later_sections = re.split(
        "\n\n(?=[^\n]*:)", course.group(1)
    )
    if not later_sections:
        return []

    # Strip prefixes such as "1)" or "#" from every section after the first.
    return [
        first_section,
        *(
            re.sub(r"\A[0-9#]+[\)|\.]* *", "", section)
            for section in later_sections
        ),
    ]


# Attach the extracted sections to every row (an empty list where nothing was extracted).
df["SUMMARY_SECTIONS"] = df.apply(extract_summary_sections, axis=1)

# Admissions that have at least one row with a non-empty section list.
has_sections = df["SUMMARY_SECTIONS"].map(len) > 0
hadm_ids_w_summary_sections = df.loc[has_sections, "HADM_ID"].unique()

# Keep every note belonging to those admissions and report how many remain.
df = df[df["HADM_ID"].isin(hadm_ids_w_summary_sections)]
df["HADM_ID"].nunique(dropna=False)

In [None]:
# Draw one HADM_ID reproducibly and show every note recorded for it.
sample_hadm_id = df["HADM_ID"].sample(n=1, random_state=RANDOM_SEED).iloc[0]
sample_df = df.loc[df["HADM_ID"] == sample_hadm_id]
display(sample_df)

In [None]:
# Print the text of the last physician note in the sampled admission.
sample_physician_texts = sample_df.loc[sample_df["CATEGORY"] == "Physician ", "TEXT"]
print(sample_physician_texts.iloc[-1])

In [None]:
# Show the first discharge summary of the sampled admission, then each extracted section.
sample_discharge_summary = sample_df.loc[sample_df["CATEGORY"] == "Discharge summary"]
first_summary_row = sample_discharge_summary.iloc[0]
print(first_summary_row["TEXT"])
for section in first_summary_row["SUMMARY_SECTIONS"]:
    print(f"\n{section}")

In [None]:
# Persist the cleaned frame as a pickle file at CLEAN_DF_SAVE_PATH.
df.to_pickle(CLEAN_DF_SAVE_PATH)