# Create Physician Note Dataset

In [None]:
import re
from pathlib import Path

import pandas as pd

In [None]:
# Directory holding the MIMIC-III 1.4 CSV files, resolved relative to the
# current working directory.
MIMIC_III_DIR = Path.cwd().joinpath(
    "inputs", "physionet.org", "files", "mimiciii", "1.4"
)

## Read in MIMIC-III notes

In [None]:
# Load the complete note-events table into memory.
full_df = pd.read_csv(MIMIC_III_DIR.joinpath("NOTEEVENTS.csv"))

## Preprocessing

Remove error and duplicate rows

In [None]:
# Keep only notes that are not flagged as errors, then discard the flag column
# and any exact duplicate rows. Because the filter is `!= 1`, rows whose
# ISERROR value is missing compare unequal to 1 and are kept.
# The steps are chained so nothing is modified in place on a filtered frame
# (the pattern pandas may flag with SettingWithCopyWarning).
full_df = (
    full_df[full_df["ISERROR"] != 1]
    .drop(columns="ISERROR")
    .drop_duplicates()
)
full_df.head()

In [None]:
# (number of notes, number of distinct HADM_ID values — a missing HADM_ID
# counts as one distinct value, as it does with len(unique())).
len(full_df), full_df["HADM_ID"].nunique(dropna=False)

In [None]:
# List the distinct note categories. Category strings must be matched exactly
# later on; note that the physician category used below is "Physician " with a
# trailing space.
full_df["CATEGORY"].unique()

Only keep hadm_ids with at least 1 physician note and 1 discharge summary

In [None]:
# An admission is kept only if its notes include both of these categories.
# "Physician " carries a trailing space, matching the value in the data.
required_categories = {"Physician ", "Discharge summary"}

keep_hadm_ids = {
    hadm_id
    for hadm_id, categories in full_df.groupby("HADM_ID")["CATEGORY"]
    if required_categories <= set(categories)
}

# Restrict the notes to the qualifying admissions.
subset_df = full_df[full_df["HADM_ID"].isin(keep_hadm_ids)]

In [None]:
# Select the physician notes (category string has a trailing space) and take
# an independent copy so later edits do not touch subset_df.
is_physician_note = subset_df["CATEGORY"] == "Physician "
physician_notes_df = subset_df.loc[is_physician_note].copy()
len(physician_notes_df), physician_notes_df["HADM_ID"].nunique(dropna=False)

In [None]:
def clean_text(text: str) -> str:
    """Normalise whitespace in a note and strip de-identification brackets.

    The substitutions are applied in order:

    1. A run of non-alphanumeric characters sitting between two newlines
       becomes a single blank line.
    2. Spaces directly after a newline are removed.
    3. Three or more consecutive newlines collapse to two.
    4. The ``[**`` and ``**]`` delimiters are deleted; the text they
       enclose is kept.
    5. Runs of two or more spaces collapse to one.

    Args:
        text: Raw note text.

    Returns:
        The cleaned note text.
    """
    substitutions = (
        (r"\n[^A-Za-z0-9]+\n", r"\n\n"),
        (r"\n +", r"\n"),
        (r"\n{3,}", "\n\n"),
        (r"(?:\[\*\*)|(?:\*\*\])", ""),
        (r" {2,}", " "),
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result


def clean_mimic_notes_df(mimic_notes_df: pd.DataFrame) -> pd.DataFrame:
    """Return a time-ordered copy of the notes with cleaned text.

    Missing ``CHARTTIME`` values are filled with the row's own ``CHARTDATE``
    followed by `` 23:59:59``. The rows are then sorted by ``HADM_ID`` and
    ``CHARTTIME``, the index is reset, and ``clean_text`` is applied to every
    ``TEXT`` value.

    The input frame is not modified, and the fill value is taken from
    ``mimic_notes_df`` itself rather than from any notebook-level frame.

    Args:
        mimic_notes_df: Notes with ``HADM_ID``, ``CHARTDATE``, ``CHARTTIME``
            and ``TEXT`` columns. ``CHARTDATE`` must support concatenation
            with a string (presumably a date string — confirm for your data).

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # End-of-day fallback so notes without a time sort after timed notes
    # recorded on the same date.
    fallback_charttime = mimic_notes_df["CHARTDATE"] + " 23:59:59"
    charttime = mimic_notes_df["CHARTTIME"].fillna(fallback_charttime)

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    cleaned_df = (
        mimic_notes_df.assign(CHARTTIME=charttime)
        .sort_values(by=["HADM_ID", "CHARTTIME"])
        .reset_index(drop=True)
    )
    cleaned_df["TEXT"] = cleaned_df["TEXT"].apply(clean_text)
    return cleaned_df

In [None]:
# Clean and order the physician notes, then report
# (number of notes, number of distinct HADM_ID values).
physician_note_df = clean_mimic_notes_df(physician_notes_df)
len(physician_note_df), physician_note_df["HADM_ID"].nunique(dropna=False)

In [None]:
# Preview the first rows of the cleaned physician notes.
physician_note_df.head()

## Save cleaned notes

In [None]:
# Write the cleaned notes into the MIMIC-III directory. The row index is
# written as the first CSV column (pandas default, made explicit here).
physician_note_df.to_csv(MIMIC_III_DIR / "physician_notes_mimic.csv", index=True)