In [None]:
import re
from pathlib import Path

import pandas as pd
import tiktoken
from IPython.display import display
from sklearn.model_selection import train_test_split

In [None]:
# Seed for reproducible random operations (passed as random_state below).
RANDOM_SEED = 23

# Data root: ../data/physionet.org/files relative to the current working directory.
MIMIC_DIR = Path.cwd().parent.joinpath("data", "physionet.org", "files")
MIMIC_III_DIR = MIMIC_DIR.joinpath("mimiciii", "1.4")
MIMIC_IV_DIR = MIMIC_DIR.joinpath("mimiciv", "2.2", "note")

## Read in MIMIC-III notes

In [None]:
# Load the note events table from the MIMIC-III directory.
df = pd.read_csv(MIMIC_III_DIR.joinpath("NOTEEVENTS.csv"))

### Preprocessing

Remove error and duplicate rows

In [None]:
# Drop rows flagged as errors (ISERROR == 1; rows with a missing flag are kept),
# then the ISERROR column itself, then exact duplicate rows.
# Written as a single non-mutating chain: the previous `drop(..., inplace=True)`
# acted on a frame that had just been produced by boolean filtering, which is
# the pattern that can raise SettingWithCopyWarning.
df = (
    df[df["ISERROR"] != 1]
    .drop(columns="ISERROR")
    .drop_duplicates()
)
df.head()

In [None]:
# (number of remaining notes, number of distinct HADM_ID values among them)
df.shape[0], df["HADM_ID"].unique().size

Keep only physician notes and discharge summary reports

In [None]:
# List the distinct note categories present before filtering on them.
df.loc[:, "CATEGORY"].unique()

In [None]:
# NOTE: the "Physician " literal includes a trailing space — presumably it
# matches the raw label listed by the previous cell; verify before changing it.
physician_mask = df["CATEGORY"].eq("Physician ")
# Discharge summaries are kept only when their DESCRIPTION is "Report".
discharge_report_mask = df["CATEGORY"].eq("Discharge summary") & df["DESCRIPTION"].eq("Report")
df = df.loc[physician_mask | discharge_report_mask]
print(len(df))
df.head()

Group by HADM_ID and keep only admissions that have both a discharge summary and a physician note

In [None]:
# Keep every row of an admission (HADM_ID) only if its notes cover exactly two
# distinct CATEGORY values; the result is a plain DataFrame, not a GroupBy.
grouped_df = df.groupby("HADM_ID").filter(
    lambda admission_notes: admission_notes["CATEGORY"].nunique(dropna=False) == 2
)

Clean the text, fill missing CHARTTIME values with the chart date at 23:59:59, i.e. just before midnight (to allow sorting by time), then regroup by HADM_ID

In [None]:
def clean_text(text: str) -> str:
    """Normalise blank lines and leading indentation in a note.

    Two substitutions are applied in order:

    1. runs of three or more newlines are collapsed to exactly two;
    2. a newline followed by two or more spaces is reduced to a bare newline.

    Args:
        text: Raw note text.

    Returns:
        The text after both substitutions.
    """
    substitutions = (
        (r"\n{3,}", "\n\n"),
        (r"\n {2,}", "\n"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


# Rows without a CHARTTIME get "<CHARTDATE> 23:59:59" so every row can be
# ordered by time; TEXT is normalised with clean_text. The frame is then
# regrouped per admission, so from here on grouped_df is a GroupBy object.
grouped_df = grouped_df.assign(
    CHARTTIME=grouped_df["CHARTTIME"].fillna(df["CHARTDATE"] + " 23:59:59"),
    TEXT=grouped_df["TEXT"].apply(clean_text),
).groupby("HADM_ID")

Train/test split. The test fraction is deliberately very large for now, because the training data is currently needed only for method validation rather than model training.

In [None]:
# Split at the admission level: the items being split are the HADM_ID group keys.
train_keys, test_keys = train_test_split(
    list(grouped_df.groups),
    test_size=0.8,
    random_state=RANDOM_SEED,
)

In [None]:
# One DataFrame per training admission, in the same order as train_keys.
train_dfs = list(map(grouped_df.get_group, train_keys))

In [None]:
# Show the first training admission in time order, then print each note's
# category and text under an 80-character separator line.
sample = train_dfs[0].sort_values("CHARTTIME")
display(sample)  # type: ignore
for category, text in zip(sample["CATEGORY"], sample["TEXT"]):
    print("=" * 80, category, text, "", sep="\n")

## Create cleaned dataframe

Create a dataframe with all physician notes concatenated together (ordered in time), the brief hospital course as the summary, and the HADM_ID (column `hadim`) in case linking is needed later

In [None]:
def create_continuous_physician_note(df: pd.DataFrame) -> str:
    """Concatenate an admission's non-discharge notes in chronological order.

    Every row whose CATEGORY is not "Discharge summary" is kept. Rows are
    ordered by CHARTDATE and, when the column is present, by CHARTTIME within
    each date. Each note is prefixed with "Physician Note <idx>:" (numbered
    from 0) and the notes are joined with a "\\n****\\n" separator.

    Args:
        df: Notes for one admission. Must contain CATEGORY, CHARTDATE and
            TEXT columns; CHARTTIME is used as a tie-breaker if available.

    Returns:
        The combined note text, or "" when there are no non-discharge notes.
    """
    notes_df = df[df["CATEGORY"] != "Discharge summary"]

    # CHARTDATE alone only orders notes by day, so notes written on the same
    # day need CHARTTIME to end up in the order they were charted.
    sort_columns = ["CHARTDATE"]
    if "CHARTTIME" in notes_df.columns:
        sort_columns.append("CHARTTIME")
    # A stable sort keeps the input order for rows that still tie.
    notes_df = notes_df.sort_values(sort_columns, kind="mergesort")

    notes = [
        f"Physician Note {idx}:\n{note}" for idx, note in enumerate(notes_df["TEXT"])
    ]
    return "\n****\n".join(notes)


def extract_brief_hospital_course(df: pd.DataFrame) -> str:
    """Extract the "Brief Hospital Course" section from a discharge summary.

    The first row whose CATEGORY is "Discharge summary" is used. The section
    is the text between a "Brief Hospital Course:" line and the next
    "Medications on Admission:" line (both must occupy a full line).

    Args:
        df: Notes for one admission, with CATEGORY and TEXT columns.

    Returns:
        The section text without its surrounding header lines, or "" when the
        frame has no discharge summary or the section markers are not found.
    """
    summaries = df.loc[df["CATEGORY"] == "Discharge summary", "TEXT"]
    # Without this guard, indexing the first summary raises IndexError when no
    # discharge summary is present; "" matches the no-match result below.
    if summaries.empty:
        return ""
    full_summary = summaries.iloc[0]

    start_pattern = r"\nBrief Hospital Course:\n"
    end_pattern = r"\nMedications on Admission:\n"
    # DOTALL lets the non-greedy group span multiple lines of the section.
    match = re.search(f"{start_pattern}(.*?){end_pattern}", full_summary, re.DOTALL)
    return match.group(1) if match else ""


# Build three parallel lists, one entry per training admission (same order as train_dfs).
physician_notes = list(map(create_continuous_physician_note, train_dfs))
brief_hospital_courses = list(map(extract_brief_hospital_course, train_dfs))
hadim_ids = [admission["HADM_ID"].iloc[0] for admission in train_dfs]
cleaned_df = pd.DataFrame(
    {"notes": physician_notes, "summary": brief_hospital_courses, "hadim": hadim_ids}
)
# Remove empty summaries
cleaned_df = cleaned_df.loc[cleaned_df["summary"].ne("")]
cleaned_df.head()

## Token lengths

In [None]:
# Token counts per row for the concatenated notes and for the summaries,
# using the cl100k_base encoding.
tokenizer = tiktoken.get_encoding("cl100k_base")
note_tokens, summary_tokens = (
    cleaned_df[column].apply(lambda text: len(tokenizer.encode(text)))
    for column in ("notes", "summary")
)

In [None]:
# One row of describe() statistics per text column (without "count"), plus a
# "total" row holding the column-wise sum of the two rows.
tokens_df = pd.concat(
    [tokens.describe() for tokens in (note_tokens, summary_tokens)], axis=1
).T.drop(columns="count")
tokens_df.loc["total"] = tokens_df.sum(numeric_only=True)
tokens_df