In [None]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import tiktoken
from tqdm.notebook import tqdm

In [None]:
# Location of the MIMIC-III v1.4 files, resolved relative to the parent of the
# current working directory.
MIMIC_III_DIR = Path.cwd().parent.joinpath(
    "data", "physionet.org", "files", "mimiciii", "1.4"
)

# Azure OpenAI settings. NOTE(review): neither value is referenced in the cells
# visible here -- presumably consumed further down or in a sibling notebook.
AZURE_ENGINE = "gpt-35-turbo-16k"
AZURE_API_VERSION = "2023-07-01-preview"

# Tokenizer used below to measure prompt length in tokens.
TOKENIZER = tiktoken.get_encoding("cl100k_base")

In [None]:
# Load the physician notes table and group its rows by HADM_ID.
physician_notes = pd.read_csv(MIMIC_III_DIR.joinpath("physician_notes.csv"))
grouped_physician_notes_df = physician_notes.groupby("HADM_ID")

# Take the first (HADM_ID, notes) group as a sample to experiment with.
hadm_id, physician_notes_hadm_id_df = next(iter(grouped_physician_notes_df))

In [None]:
def generate_notes_string(physician_notes_df: pd.DataFrame) -> str:
    """Render physician notes as one chronologically ordered string.

    Exact duplicate (CHARTTIME, TEXT) rows are dropped, the remaining notes are
    sorted by CHARTTIME, and each is emitted under a header of the form
    ``Physician Note <n>: <CHARTTIME>`` where ``n`` counts from 1 in sorted
    order. Notes are separated by a blank line.

    This is the naive rendering: text repeated across different notes is kept
    verbatim (only fully identical rows are removed).

    Args:
        physician_notes_df: Frame with at least ``CHARTTIME`` and ``TEXT``
            columns; its index is ignored.

    Returns:
        The concatenated notes, or an empty string for an empty frame.
    """
    notes_sorted = (
        physician_notes_df[["CHARTTIME", "TEXT"]]
        .drop_duplicates()
        # Stable sort: notes sharing a CHARTTIME keep their input order, so
        # the output is deterministic.
        .sort_values("CHARTTIME", kind="mergesort")
    )
    # Number notes by their position after sorting. The DataFrame index label
    # is the row label of the source table, not a per-call note counter.
    return "\n\n".join(
        f"Physician Note {note_number}: {chart_time}\n{text}"
        for note_number, (chart_time, text) in enumerate(
            zip(notes_sorted["CHARTTIME"], notes_sorted["TEXT"]), start=1
        )
    )

In [None]:
# Token count of the naive rendering for every HADM_ID group.
# Reuse the GroupBy built when the data was loaded, and take the progress-bar
# total from `ngroups`: it is exactly the number of groups the loop yields,
# whereas counting unique HADM_ID values also counts NaN, which groupby drops.
naive_string_lengths = [
    len(TOKENIZER.encode(generate_notes_string(group)))
    for _, group in tqdm(
        grouped_physician_notes_df,
        total=grouped_physician_notes_df.ngroups,
    )
]

In [None]:
def generate_notes_stringv2(physician_notes_df: pd.DataFrame) -> str:
    """Render physician notes chronologically, omitting repeated sections.

    Exact duplicate (CHARTTIME, TEXT) rows are dropped and the remaining notes
    are sorted by CHARTTIME. Each note's text is split into sections at every
    newline that is followed by a non-empty line containing a colon. A section
    is emitted only the first time its exact text is seen across all notes, so
    later notes carry only sections that differ from everything before them.

    Every note still gets its ``Physician Note <n>: <CHARTTIME>`` header, with
    ``n`` counting from 1 in sorted order, even when all its sections were
    already emitted.

    Args:
        physician_notes_df: Frame with at least ``CHARTTIME`` and ``TEXT``
            columns; its index is ignored.

    Returns:
        The concatenated notes separated by blank lines, or an empty string
        for an empty frame.
    """
    notes_sorted = (
        physician_notes_df[["CHARTTIME", "TEXT"]]
        .drop_duplicates()
        # Stable sort: for notes sharing a CHARTTIME the input order decides
        # which one "owns" a repeated section, so keep that order fixed.
        .sort_values("CHARTTIME", kind="mergesort")
    )
    seen_sections = set()
    rendered_notes = []
    # Number notes by their position after sorting so headers read 1..N in
    # chronological order regardless of the incoming row order.
    for note_number, (chart_time, text) in enumerate(
        zip(notes_sorted["CHARTTIME"], notes_sorted["TEXT"]), start=1
    ):
        new_sections = ""
        for note_section in re.split("\n(?=^[^\n].*?:)", text, flags=re.MULTILINE):
            if note_section not in seen_sections:
                new_sections += "\n" + note_section
                seen_sections.add(note_section)
        rendered_notes.append(
            f"Physician Note {note_number}: {chart_time}{new_sections}"
        )
    return "\n\n".join(rendered_notes)

In [None]:
# Token count of the section-deduplicated rendering for every HADM_ID group.
# Reuse the GroupBy built when the data was loaded, and take the progress-bar
# total from `ngroups`: it is exactly the number of groups the loop yields,
# whereas counting unique HADM_ID values also counts NaN, which groupby drops.
v2_string_lengths = [
    len(TOKENIZER.encode(generate_notes_stringv2(group)))
    for _, group in tqdm(
        grouped_physician_notes_df,
        total=grouped_physician_notes_df.ngroups,
    )
]

In [None]:
# Median tokens per admission: (naive, v2).
tuple(np.median(lengths) for lengths in (naive_string_lengths, v2_string_lengths))

In [None]:
# Mean tokens per admission: (naive, v2).
tuple(np.mean(lengths) for lengths in (naive_string_lengths, v2_string_lengths))

In [None]:
# Largest token count over all admissions: (naive, v2).
tuple(np.max(lengths) for lengths in (naive_string_lengths, v2_string_lengths))

In [None]:
# 95th-percentile token count: (naive, v2).
tuple(
    np.percentile(lengths, 95)
    for lengths in (naive_string_lengths, v2_string_lengths)
)

In [None]:
# Fraction of admissions whose v2 rendering is under 15000 tokens.
# NOTE(review): 15000 presumably leaves headroom below the 16k context implied
# by AZURE_ENGINE -- confirm and consider naming it as a config constant.
sum(length < 15000 for length in v2_string_lengths) / len(v2_string_lengths)

In [None]:
# Maximum token counts via the ndarray method: (naive, v2).
# NOTE(review): this repeats the np.max cell above and could be dropped.
np.asarray(naive_string_lengths).max(), np.asarray(v2_string_lengths).max()