In [None]:
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Directory holding the MIMIC-III v1.4 files, resolved relative to the parent
# of the current working directory.
MIMIC_III_DIR = Path.cwd().parent.joinpath(
    "data", "physionet.org", "files", "mimiciii", "1.4"
)

## Read in MIMIC III notes

In [None]:
# Load the full notes table from the MIMIC-III directory.
full_df = pd.read_csv(MIMIC_III_DIR.joinpath("NOTEEVENTS.csv"))

### Preprocessing

Remove error and duplicate rows

In [None]:
# Drop notes flagged as errors, then the flag column itself. The column check
# keeps this cell re-runnable: after the first run ISERROR no longer exists,
# so an unconditional filter on it would raise a KeyError.
if "ISERROR" in full_df.columns:
    full_df = full_df.loc[full_df["ISERROR"] != 1].drop(columns="ISERROR")

# Remove rows that are exact duplicates across all remaining columns.
full_df = full_df.drop_duplicates()
full_df.head()

In [None]:
# Number of rows, and number of distinct HADM_ID values (a missing HADM_ID
# counts as one distinct value).
full_df.shape[0], full_df["HADM_ID"].nunique(dropna=False)

In [None]:
# List the distinct note categories present in the data.
pd.unique(full_df["CATEGORY"])

In [None]:
def clean_mimic_notes_df(mimic_notes_df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing chart times and order notes by admission and time.

    Rows whose ``CHARTTIME`` is missing receive that same row's ``CHARTDATE``
    with ``" 23:59:59"`` appended. The frame is then sorted by ``HADM_ID`` and
    ``CHARTTIME`` and given a fresh ``0..n-1`` index.

    Args:
        mimic_notes_df: Notes table with ``HADM_ID``, ``CHARTDATE`` and
            ``CHARTTIME`` columns. ``CHARTDATE`` must support string
            concatenation, since the time suffix is appended with ``+``.

    Returns:
        A new, sorted DataFrame. The input frame is left unmodified.
    """
    # Use the argument's own CHARTDATE column rather than a notebook-level
    # frame, so the function works on any notes table and has no hidden state.
    filled_charttime = mimic_notes_df["CHARTTIME"].fillna(
        mimic_notes_df["CHARTDATE"] + " 23:59:59"
    )
    return (
        mimic_notes_df.assign(CHARTTIME=filled_charttime)
        .sort_values(by=["HADM_ID", "CHARTTIME"])
        .reset_index(drop=True)
    )

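Keep only admissions (`HADM_ID`) that have at least one physician note and at least one discharge summary report. This selection step could be written better.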

In [None]:
# Select admissions that have BOTH a physician note and a discharge summary
# whose DESCRIPTION is "Report". Boolean masks plus a set intersection avoid
# iterating over every admission group in Python.
# NOTE(review): the trailing space in "Physician " is part of the compared
# value; presumably it matches the raw CATEGORY strings — confirm against the
# CATEGORY listing.
is_physician_note = full_df["CATEGORY"] == "Physician "
is_discharge_report = (full_df["CATEGORY"] == "Discharge summary") & (
    full_df["DESCRIPTION"] == "Report"
)

# Rows with a missing HADM_ID never qualify, so they are dropped before the
# admission ids are collected.
keep_hadm_id = set(full_df.loc[is_physician_note, "HADM_ID"].dropna()) & set(
    full_df.loc[is_discharge_report, "HADM_ID"].dropna()
)

# Copy before cleaning so the result is independent of full_df.
discharge_and_physician_notes_df = clean_mimic_notes_df(
    full_df[full_df["HADM_ID"].isin(keep_hadm_id)].copy()
)

In [None]:
# Physician notes only (the CATEGORY value is compared with its trailing space).
physician_note_df = discharge_and_physician_notes_df.loc[
    discharge_and_physician_notes_df["CATEGORY"].eq("Physician ")
].copy()
# Number of notes and number of distinct admissions they belong to.
physician_note_df.shape[0], physician_note_df["HADM_ID"].nunique(dropna=False)

In [None]:
# Preview the first five physician notes.
physician_note_df.head(5)

In [None]:
# Write the physician notes next to the source data (the index is written too).
physician_note_df.to_csv(MIMIC_III_DIR.joinpath("physician_notes.csv"))

In [None]:
# Discharge summaries for the selected admissions.
# NOTE(review): every DESCRIPTION under "Discharge summary" is kept here,
# whereas the admission filter requires DESCRIPTION == "Report" — confirm
# this is intended.
discharge_summary_df = discharge_and_physician_notes_df.loc[
    discharge_and_physician_notes_df["CATEGORY"].eq("Discharge summary")
].copy()
# Number of summaries and number of distinct admissions they belong to.
discharge_summary_df.shape[0], discharge_summary_df["HADM_ID"].nunique(dropna=False)

In [None]:
# Preview the first five discharge summaries.
discharge_summary_df.head(5)

In [None]:
# Write the discharge summaries next to the source data (the index is written too).
discharge_summary_df.to_csv(MIMIC_III_DIR.joinpath("discharge_summaries.csv"))