In [None]:
import re
from pathlib import Path

import pandas as pd

In [None]:
# Location of the MIMIC-III v1.4 files, resolved relative to the parent of the
# current working directory (so the result depends on where the kernel runs).
MIMIC_III_DIR = Path(
    Path.cwd().parent, "data", "physionet.org", "files", "mimiciii", "1.4"
)

## Read in MIMIC-III notes

In [None]:
# Load the complete notes table from NOTEEVENTS.csv in the MIMIC-III directory.
full_df = pd.read_csv(filepath_or_buffer=MIMIC_III_DIR / "NOTEEVENTS.csv")

### Preprocessing

Remove rows flagged as errors (`ISERROR == 1`) and exact duplicate rows.

In [None]:
# Keep every row whose ISERROR flag is not 1 (rows with a missing flag are
# kept, because NaN != 1), drop the now-redundant flag column, then remove
# exact duplicate rows. Written as one non-mutating chain so that no filtered
# slice is ever modified in place.
full_df = (
    full_df[full_df["ISERROR"] != 1]
    .drop(columns="ISERROR")
    .drop_duplicates()
)
full_df.head()

In [None]:
# (number of notes, number of distinct HADM_ID values; a missing HADM_ID counts as one value)
full_df.shape[0], len(full_df["HADM_ID"].unique())

In [None]:
# Distinct note categories present in the data (exact labels, whitespace included).
full_df.loc[:, "CATEGORY"].unique()

In [None]:
def clean_text(text: str) -> str:
    """Normalise whitespace and strip de-identification markers from a note.

    The substitutions are applied in order:

    1. A run starting at the beginning of a line that contains no ASCII
       letters or digits and ends in a newline is replaced by one newline
       (the character class also matches newlines, so consecutive blank or
       punctuation-only lines are consumed together).
    2. Three or more consecutive newlines are collapsed to two.
    3. The ``[**`` and ``**]`` markers are removed; the text between them
       is kept.
    4. Two or more consecutive spaces are collapsed to one.

    Args:
        text: Raw note text.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement, flags) triples, applied top to bottom.
    substitutions = (
        (r"^[^A-Za-z0-9]*\n", r"\n", re.MULTILINE),
        (r"\n{3,}", "\n\n", 0),
        (r"(?:\[\*\*)|(?:\*\*\])", "", 0),
        (r" {2,}", " ", 0),
    )
    result = text
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result


def clean_mimic_notes_df(mimic_notes_df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a notes frame, ordered by admission and time.

    Steps:

    1. Missing ``CHARTTIME`` values are filled with the same row's
       ``CHARTDATE`` followed by ``" 23:59:59"``.
    2. Rows are sorted by ``HADM_ID`` and then ``CHARTTIME``, and the index
       is reset to a fresh 0..n-1 range.
    3. ``clean_text`` is applied to every value in ``TEXT``.

    The input frame is not modified; all work happens on a copy.

    Args:
        mimic_notes_df: Frame with ``HADM_ID``, ``CHARTDATE``, ``CHARTTIME``
            and ``TEXT`` columns. ``CHARTDATE`` is assumed to hold strings
            (it is concatenated with a string suffix) -- TODO confirm.

    Returns:
        A new, cleaned and sorted DataFrame.
    """
    # Copy first so the caller's frame (possibly a filtered slice of a larger
    # frame) is never mutated.
    notes_df = mimic_notes_df.copy()
    # Fill from this frame's own CHARTDATE rather than a notebook-level global,
    # so the function does not depend on index alignment with another frame.
    notes_df["CHARTTIME"] = notes_df["CHARTTIME"].fillna(
        notes_df["CHARTDATE"] + " 23:59:59"
    )
    notes_df = notes_df.sort_values(by=["HADM_ID", "CHARTTIME"])
    notes_df = notes_df.reset_index(drop=True)
    notes_df["TEXT"] = notes_df["TEXT"].apply(clean_text)
    return notes_df

In [None]:
# Admissions (HADM_ID values) that have at least one note whose CATEGORY is
# exactly "Physician " -- note the trailing space in the literal.
physician_note_hadm_ids = full_df.loc[
    full_df["CATEGORY"] == "Physician ", "HADM_ID"
].unique()
# Every note, of any category, that belongs to one of those admissions.
subset_df = full_df.loc[full_df["HADM_ID"].isin(physician_note_hadm_ids)]
len(physician_note_hadm_ids), len(subset_df["HADM_ID"].unique())

In [None]:
# Select the physician notes (the literal includes a trailing space), copy
# them into an independent frame, and run the shared cleaning step.
physician_note_df = clean_mimic_notes_df(
    subset_df.loc[subset_df["CATEGORY"] == "Physician "].copy()
)
physician_note_df.shape[0], len(physician_note_df["HADM_ID"].unique())

In [None]:
# Preview the first five cleaned physician notes.
physician_note_df.head(n=5)

In [None]:
# Write the cleaned physician notes next to the raw MIMIC-III files. The row
# index is written as the first (unnamed) column.
physician_note_df.to_csv(
    path_or_buf=MIMIC_III_DIR / "physician_notes_mimic.csv", index=True
)

In [None]:
# Discharge-summary notes whose DESCRIPTION is "Report", restricted to the
# admissions already selected in subset_df.
discharge_summary_df = subset_df[
    (subset_df["CATEGORY"] == "Discharge summary")
    & (subset_df["DESCRIPTION"] == "Report")
]

# Admissions with exactly one such note. value_counts() ignores missing
# HADM_ID values, so rows without an admission id are excluded here, just as
# they would be by a groupby on HADM_ID.
discharge_note_counts = discharge_summary_df["HADM_ID"].value_counts()
single_discharge_note_hadm_ids = set(
    discharge_note_counts[discharge_note_counts == 1].index
)

# Copy after filtering so the cleaning step receives an independent frame
# rather than a slice of subset_df.
discharge_summary_df = discharge_summary_df[
    discharge_summary_df["HADM_ID"].isin(single_discharge_note_hadm_ids)
].copy()
discharge_summary_df = clean_mimic_notes_df(discharge_summary_df)

len(discharge_summary_df), len(discharge_summary_df["HADM_ID"].unique())

In [None]:
def extract_bhc(discharge_summary_text: str) -> str:
    """Extract the "Brief Hospital Course" section from a discharge summary.

    The section is the text between a line reading exactly
    ``Brief Hospital Course:`` and the first following line reading exactly
    ``Medications on Admission:``. Both headers must be preceded and
    followed by a newline.

    Args:
        discharge_summary_text: Full text of the discharge summary.

    Returns:
        The section text with surrounding whitespace stripped, or an empty
        string when the two headers are not found in that order.
    """
    section_header = r"\nBrief Hospital Course:\n"
    next_header = r"\nMedications on Admission:\n"
    # Lazy capture so the section stops at the first closing header; DOTALL
    # lets "." span the line breaks inside the section.
    section_regex = section_header + "(.*?)" + next_header
    found = re.search(section_regex, discharge_summary_text, flags=re.DOTALL)
    return found.group(1).strip() if found else ""


# Add the extracted "Brief Hospital Course" text as a BHC column, then keep
# only the summaries where the extraction returned a non-empty string.
discharge_summary_df = discharge_summary_df.assign(
    BHC=discharge_summary_df["TEXT"].apply(extract_bhc)
)
discharge_summary_df = discharge_summary_df.loc[discharge_summary_df["BHC"] != ""]
discharge_summary_df.shape[0], len(discharge_summary_df["HADM_ID"].unique())

In [None]:
# Write the filtered discharge summaries (including the BHC column) next to
# the raw MIMIC-III files. The row index is written as the first (unnamed) column.
discharge_summary_df.to_csv(
    path_or_buf=MIMIC_III_DIR / "discharge_summaries_mimic.csv", index=True
)