In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import tiktoken

In [None]:
# Input folders, resolved relative to the directory the notebook is run from.
ANNOTATED_DIR = Path.cwd().joinpath("outputs", "annotated")
RAW_GENERATED_DIR = Path.cwd().joinpath("outputs", "output_eval_v3")

# Prices per 1,000 tokens, used by the cost estimate further down.
GPT_4_TURBO_INPUT_COST_PER_1K = 0.01
GPT_4_TURBO_OUTPUT_COST_PER_1K = 0.03

# Name of the tiktoken encoding used for all token counts.
TOKENIZER_NAME = "cl100k_base"

In [None]:
# Map each hadm_id (taken from the sub-directory name) to the parsed contents of
# that directory's `discharge_summary.json`.
# Only directories are visited, so a stray file in the output folder (for
# example an OS-generated `.DS_Store`) does not abort the load. Sorting makes
# the dictionary order reproducible, since `iterdir()` order is arbitrary.
hadm_id_to_discharge_summary = {
    directory.stem: json.loads((directory / "discharge_summary.json").read_text())
    for directory in sorted(RAW_GENERATED_DIR.iterdir())
    if directory.is_dir()
}

In [None]:
# A line of 80 asterisks separates consecutive entries in `raw_messages.txt`.
delimiter = "\n" + "*" * 80 + "\n"

# Map each hadm_id (taken from the sub-directory name) to the list of entries in
# its raw message transcript.
# Only directories are visited, so stray files in the output folder are ignored.
# Sorting makes the order reproducible, which matters because a later cell reads
# the first entry of this dictionary.
hadm_id_to_messages = {
    directory.stem: (directory / "raw_messages.txt").read_text().split(delimiter)
    for directory in sorted(RAW_GENERATED_DIR.iterdir())
    if directory.is_dir()
}

## Extractive sentence fraction

In [None]:
def find_json_strings(json_object):
    """Collect every string value nested anywhere inside a decoded JSON value.

    Lists and dict *values* are traversed depth-first in their natural order;
    dict keys are not collected. Any other type (numbers, booleans, ``None``,
    tuples, ...) is ignored.

    Args:
        json_object: A string, list, dict, or any other object.

    Returns:
        A list of the strings found, in traversal order.
    """

    def iter_strings(node):
        if isinstance(node, str):
            yield node
            return
        if isinstance(node, list):
            children = node
        elif isinstance(node, dict):
            children = node.values()
        else:
            # Not a string and not a container we descend into.
            return
        for child in children:
            yield from iter_strings(child)

    return list(iter_strings(json_object))

In [None]:
# Fraction of generated sentences that appear verbatim (case-insensitively) in
# the notes text. Sentences are approximated by splitting every string of the
# discharge summary JSON on ". ".
num_extractive_sentences = 0
num_generated_sentences = 0

for hadm_id, admission_messages in hadm_id_to_messages.items():
    summary_strings = find_json_strings(hadm_id_to_discharge_summary[hadm_id])
    # Entry 3 of the transcript is treated as the physician notes text
    # (per the original variable naming -- TODO confirm the transcript layout).
    notes_lowercase = admission_messages[3].lower()

    summary_sentences_lowercase = []
    for text in summary_strings:
        for sentence in text.split(". "):
            if sentence:  # skip the empty pieces produced by the split
                summary_sentences_lowercase.append(sentence.lower())

    num_generated_sentences += len(summary_sentences_lowercase)
    for candidate in summary_sentences_lowercase:
        if candidate in notes_lowercase:
            num_extractive_sentences += 1

num_extractive_sentences / num_generated_sentences

## Token lengths

## Prompt Lengths

In [None]:
# Encoding used for every token count in this notebook; its name comes from
# the TOKENIZER_NAME constant.
tokenizer = tiktoken.get_encoding(TOKENIZER_NAME)

In [None]:
# Token count of the first three transcript entries of the first admission.
# NOTE(review): presumably these three entries are the fixed prompt shared by
# every admission, which is why a single transcript is inspected -- confirm.
messages = next(iter(hadm_id_to_messages.values()))
prompt_entries = messages[:3]
prompt_token_length = sum(map(len, map(tokenizer.encode, prompt_entries)))
prompt_token_length

In [None]:
# Length of transcript entry 3 (the note message) for every admission, measured
# both in characters and in tokens.
note_messages = [messages[3] for messages in hadm_id_to_messages.values()]
note_message_length_char = [len(note) for note in note_messages]
note_message_length_token = [len(tokenizer.encode(note)) for note in note_messages]

# Quartiles followed by the maximum: characters first, then tokens.
for lengths in (note_message_length_char, note_message_length_token):
    print(np.percentile(lengths, [25, 50, 75]), np.max(lengths))

## Completion metrics

In [None]:
# The second-to-last transcript entry is parsed as "<label>: <number>"; the
# number after the first ": " is taken as the completion time.
# NOTE(review): the unit is not visible here -- confirm against the writer of
# raw_messages.txt.
completion_times = [
    float(messages[-2].split(": ")[1]) for messages in hadm_id_to_messages.values()
]

# Quartiles followed by the maximum, printed once.
print(np.percentile(completion_times, [25, 50, 75]), np.max(completion_times))

In [None]:
# Estimated cost per admission: entries 0-3 of the transcript are billed as
# input tokens and entry 4 as output tokens, each at its per-1K-token price.
costs = []
for messages in hadm_id_to_messages.values():
    input_entries, output_entry = messages[:4], messages[4]
    num_input_tokens = sum(map(len, map(tokenizer.encode, input_entries)))
    num_output_tokens = len(tokenizer.encode(output_entry))

    input_cost = num_input_tokens / 1000 * GPT_4_TURBO_INPUT_COST_PER_1K
    output_cost = num_output_tokens / 1000 * GPT_4_TURBO_OUTPUT_COST_PER_1K
    costs.append(input_cost + output_cost)

# Quartiles followed by the maximum.
print(np.percentile(costs, [25, 50, 75]), np.max(costs))

## Precision and recall

In [None]:
# Annotation columns holding per-row error counts; blank cells mean "no error".
error_count_columns = [
    "Missed- Severe",
    "Missed- Minor",
    "Added- Hallucination",
    "Added- Not relevant",
]
# Fields that appear with a numeric suffix and are collapsed to one name.
numbered_fields = ("Causative Agent", "Description Of Reaction")

# One cleaned DataFrame per annotated spreadsheet, read from
# ANNOTATED_DIR/<annotator>/<hadm_id>/discharge_summary_<hadm_id>.xlsx.
eval_dfs = []
for annotator_dir in sorted(ANNOTATED_DIR.iterdir()):
    if not annotator_dir.is_dir():
        # Skip stray files (e.g. `.DS_Store`) next to the annotator folders.
        continue
    for hadm_id_dir in sorted(annotator_dir.iterdir()):
        if not hadm_id_dir.is_dir():
            continue
        hadm_id = hadm_id_dir.stem
        df = pd.read_excel(
            hadm_id_dir / f"discharge_summary_{hadm_id}.xlsx",
            engine="openpyxl",
            header=1,  # column names are on the second spreadsheet row
        )

        # Drop the unnamed fourth column and any completely empty rows.
        df = df.drop(columns=["Unnamed: 3"]).dropna(axis=0, how="all")

        # Section / Field are only filled on the first row of each group
        # (presumably merged cells in the sheet), so carry them downwards.
        # `.ffill()` replaces the deprecated `fillna(method="ffill")`.
        df["Section"] = df["Section"].ffill()
        df["Field"] = df["Field"].ffill()

        # Rows whose value is "Autopopulated" are excluded from the evaluation.
        # `.copy()` gives an independent frame, so the column assignments below
        # do not trigger SettingWithCopyWarning.
        df = df.loc[df["Value"] != "Autopopulated"].copy()

        df[error_count_columns] = df[error_count_columns].fillna(0)

        # "Causative Agent 2" -> "Causative Agent", and likewise for reactions.
        for field_name in numbered_fields:
            df["Field"] = df["Field"].str.replace(
                rf"{field_name} [0-9]+", field_name, regex=True
            )
        eval_dfs.append(df)

In [None]:
error_types = [
    "Missed- Severe",
    "Missed- Minor",
    "Added- Hallucination",
    "Added- Not relevant",
]

if not eval_dfs:
    raise ValueError("eval_dfs is empty: no annotated discharge summaries were loaded")

# Per-spreadsheet error counts and value counts, indexed by section.
per_sheet_section_errors = []
for eval_df in eval_dfs:
    by_section = eval_df.groupby("Section")
    df_section_errors = by_section[error_types].sum()
    df_section_errors["Num Values"] = by_section["Value"].count()

    # Clinical summary is a free text paragraph, so we estimate each sentence as a value
    is_clinical_summary = (eval_df["Section"] == "Clinical Summary") & (
        eval_df["Field"] == "Clinical Summary"
    )
    clinical_summary_values = eval_df.loc[is_clinical_summary, "Value"].dropna()
    # A sheet without a clinical summary is left as counted instead of crashing.
    if not clinical_summary_values.empty:
        estimated_num_sentences = len(clinical_summary_values.iloc[0].split(". "))
        # -1 as already counted once
        df_section_errors.loc["Clinical Summary", "Num Values"] += (
            estimated_num_sentences - 1
        )

    per_sheet_section_errors.append(df_section_errors)

# Sum over all spreadsheets. Stacking and re-grouping by section (rather than
# `+=` on the frames) keeps a section's totals intact when it is missing from
# some spreadsheets; index-aligned addition would turn them into NaN.
section_errors = pd.concat(per_sheet_section_errors).groupby(level=0).sum()

In [None]:
# Per-section confusion counts derived from the annotation totals:
#   false positives = added values (hallucinated or not relevant)
#   false negatives = missed values (severe or minor)
#   true positives  = generated values that were not flagged as added
section_false_positives = section_errors["Added- Hallucination"].add(
    section_errors["Added- Not relevant"]
)
section_false_negatives = section_errors["Missed- Severe"].add(
    section_errors["Missed- Minor"]
)
section_true_positives = section_errors["Num Values"].sub(section_false_positives)

In [None]:
# Per-section recall, precision and F1 from the confusion counts.
section_relevant_total = section_true_positives + section_false_negatives
section_generated_total = section_true_positives + section_false_positives

section_recall = section_true_positives / section_relevant_total
section_precision = section_true_positives / section_generated_total

# F1 is the harmonic mean of precision and recall.
section_f1_numerator = 2 * (section_precision * section_recall)
section_f1 = section_f1_numerator / (section_precision + section_recall)

In [None]:
# Display the per-section precision values.
section_precision

In [None]:
# Column totals over all sections, used for the micro-averaged metrics.
total_errors = section_errors.sum(axis=0)

total_false_positives = (
    total_errors.loc["Added- Hallucination"] + total_errors.loc["Added- Not relevant"]
)
total_false_negatives = (
    total_errors.loc["Missed- Severe"] + total_errors.loc["Missed- Minor"]
)
# Generated values that were not flagged as added.
total_true_positives = total_errors.loc["Num Values"] - total_false_positives

In [None]:
# Micro-averaged metrics: computed once from the counts pooled over all sections.
total_relevant = total_true_positives + total_false_negatives
total_generated = total_true_positives + total_false_positives

micro_recall = total_true_positives / total_relevant
micro_precision = total_true_positives / total_generated

micro_f1_numerator = 2 * (micro_precision * micro_recall)
micro_f1 = micro_f1_numerator / (micro_precision + micro_recall)

In [None]:
# One row per section, one column per metric. Passing a mapping to `pd.concat`
# uses its keys as the column labels, in insertion order.
metric_columns = {
    "Recall": section_recall,
    "Precision": section_precision,
    "F1": section_f1,
}
df_metrics = pd.concat(metric_columns, axis=1)

In [None]:
# Macro average = unweighted mean of the per-section rows. Any "Macro"/"Micro"
# rows left by a previous run of this cell are excluded first, so re-running the
# cell gives the same result instead of averaging the summary rows back in.
section_metric_rows = df_metrics.drop(index=["Macro", "Micro"], errors="ignore")
df_metrics.loc["Macro"] = section_metric_rows.mean(axis=0)

# Micro average, assigned by column label rather than by position.
df_metrics.loc["Micro"] = pd.Series(
    {"Recall": micro_recall, "Precision": micro_precision, "F1": micro_f1}
)

In [None]:
# Display the metrics table: per-section rows plus the Macro and Micro rows.
df_metrics