In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell runs (edits to project code are picked up live).
%load_ext autoreload
%autoreload 2

In [None]:
import pickle
import re
from pathlib import Path

import pandas as pd
import rich

from discharge_summaries.schemas.mimic import DischargeSummary, Note, Record

In [None]:
# Seed value for any stochastic step; it is not consumed by the cells visible here.
RANDOM_SEED = 23

# All data is expected under `<parent of the current working directory>/data`.
DATA_DIR = Path.cwd().parent.joinpath("data")
MIMIC_DIR = DATA_DIR.joinpath("physionet.org", "files")

# Version-specific MIMIC locations and the output file for the cleaned dataset.
MIMIC_III_DIR = MIMIC_DIR.joinpath("mimiciii", "1.4")
MIMIC_IV_DIR = MIMIC_DIR.joinpath("mimiciv", "2.2", "note")
CLEAN_DATASET_SAVE_PATH = DATA_DIR.joinpath("clean_df.pkl")

## Read in MIMIC III notes

In [None]:
# Load the full NOTEEVENTS table from the MIMIC-III directory into memory.
full_df = pd.read_csv(MIMIC_III_DIR.joinpath("NOTEEVENTS.csv"))

### Pre-processing

Remove error and duplicate rows

In [None]:
# Discard rows whose ISERROR equals 1, then the ISERROR column itself, then
# exact duplicate rows.
# The steps are chained and non-mutating: calling `drop(..., inplace=True)` on
# the result of a boolean filter mutates a derived frame, which is what makes
# pandas emit SettingWithCopyWarning.
full_df = (
    full_df.loc[full_df["ISERROR"] != 1]
    .drop(columns="ISERROR")
    .drop_duplicates()
)
full_df.head()

In [None]:
# Number of rows alongside the number of distinct HADM_ID values
# (a missing HADM_ID is counted as one distinct value).
full_df.shape[0], full_df["HADM_ID"].nunique(dropna=False)

Keep only physician notes and discharge summary reports

In [None]:
# Show the distinct CATEGORY values so the exact strings used for filtering
# can be checked against the data.
full_df["CATEGORY"].unique()

In [None]:
# Retain physician notes plus discharge summaries whose DESCRIPTION is "Report".
# The physician category string matched here ends with a space.
full_df = full_df.loc[
    lambda notes: notes["CATEGORY"].eq("Physician ")
    | (
        notes["CATEGORY"].eq("Discharge summary")
        & notes["DESCRIPTION"].eq("Report")
    )
]
print(len(full_df))

Group by HADM_ID and keep only the admissions that have both a discharge summary and at least one physician note

In [None]:
# Keep every row of an admission (HADM_ID group) only when that admission's
# notes span exactly two distinct CATEGORY values; other admissions are dropped
# entirely.
grouped_df = full_df.groupby("HADM_ID")
df = grouped_df.filter(
    lambda admission_notes: admission_notes["CATEGORY"].nunique(dropna=False) == 2
)

In [None]:
def clean_text(text: str) -> str:
    """Normalise line-break noise in a note's text.

    Three substitutions are applied in order:

    1. a line consisting solely of ``.`` becomes an empty line;
    2. two or more spaces directly after a newline are removed;
    3. runs of three or more newlines are collapsed to exactly two.

    Args:
        text: Raw note text.

    Returns:
        The text after all three substitutions.
    """
    substitutions = (
        (r"\n\.\n", r"\n\n"),
        (r"\n {2,}", "\n"),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


# Clean every note body; the function is passed directly rather than wrapped
# in a lambda.
df["TEXT"] = df["TEXT"].map(clean_text)

In [None]:
# Rows without a CHARTTIME get one built from their own CHARTDATE with
# " 23:59:59" appended (assumes CHARTDATE holds date strings - TODO confirm).
# The fill values come from `df` itself rather than from `full_df`, so the
# result does not depend on `df` still sharing `full_df`'s index labels.
df["CHARTTIME"] = df["CHARTTIME"].fillna(df["CHARTDATE"] + " 23:59:59")

# Order notes by admission and then by chart time, and renumber rows from zero.
df = df.sort_values(by=["HADM_ID", "CHARTTIME"]).reset_index(drop=True)
len(df), len(df["HADM_ID"].unique())

In [None]:
# Build one Record per admission (HADM_ID group) from its physician notes and
# its discharge summary.
dataset = []
for hadm_id, group_df in df.groupby("HADM_ID"):
    # Wrap each physician note row of this admission in a Note.
    physician_notes = [
        Note(
            text=row.TEXT,
            datetime=row.CHARTTIME,
            category=row.CATEGORY,
            description=row.DESCRIPTION,
        )
        for row in group_df.loc[group_df["CATEGORY"].eq("Physician ")].itertuples(
            index=False
        )
    ]

    # Only the first discharge summary row of the admission is used.
    discharge_summary_row = group_df.loc[
        group_df["CATEGORY"].eq("Discharge summary")
    ].iloc[0]
    discharge_summary = DischargeSummary(
        text=discharge_summary_row["TEXT"],
        datetime=discharge_summary_row["CHARTTIME"],
        category=discharge_summary_row["CATEGORY"],
        description=discharge_summary_row["DESCRIPTION"],
    )

    # Admissions whose summary exposes at most one entry in `bhc_sections` are
    # left out (presumably "brief hospital course" sections - confirm against
    # the DischargeSummary schema).
    if len(discharge_summary.bhc_sections) > 1:
        record = Record(
            physician_notes=sorted(physician_notes),
            discharge_summary=discharge_summary,
            hadm_id=hadm_id,
            subject_id=group_df["SUBJECT_ID"].iloc[0],
        )
        dataset.append(record)
len(dataset)

In [None]:
# Pretty-print the first record as a visual sanity check.
rich.print(dataset[0])

In [None]:
# Inspect the `bhc_sections` of the first record's discharge summary.
dataset[0].discharge_summary.bhc_sections

In [None]:
# Serialise the result of each record's `.dict()` call (not the Record objects
# themselves) as a single pickled list.
with CLEAN_DATASET_SAVE_PATH.open("wb") as out_file:
    pickle.dump(
        [entry.dict() for entry in dataset],
        out_file,
    )