## Metrics

This notebook produces all of the metrics presented in the results section.

Unfortunately, the annotated data cannot be publicly released due to MIMIC license restrictions.

In [None]:
import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import tiktoken

In [None]:
# Show floats in rendered DataFrames with a thousands separator and two decimals
pd.set_option("display.float_format", "{:,.2f}".format)

In [None]:
# All inputs are read from ./outputs, relative to the current working directory
OUTPUT_DIR = Path.cwd().joinpath("outputs")
ANNOTATED_DIR = OUTPUT_DIR.joinpath("annotators_eval")
RAW_GENERATED_DIR = OUTPUT_DIR.joinpath("llm_responses")

# Prices per 1,000 tokens, used for the "Inference Cost / $" estimate
GPT_4_TURBO_INPUT_COST_PER_1K = 0.01
GPT_4_TURBO_OUTPUT_COST_PER_1K = 0.03
# Name of the tiktoken encoding used to count tokens
TOKENIZER_NAME = "cl100k_base"

# Annotation sheet columns holding error counts.
# Order matters downstream: the first two are misses, the last two are additions.
ERROR_COLUMNS = [
    # Misses
    "Missed- Severe",
    "Missed- Minor",
    # Additions
    "Added- Hallucination",
    "Added- Not relevant",
]

## Load data

Load all of the evaluation DataFrames. They need some pre-processing to get them into a 'nice' pandas format.

In [None]:
# Load one DataFrame per (annotator, admission) evaluation spreadsheet.
# Expected layout: ANNOTATED_DIR/<annotator>/<hadm_id>/discharge_summary_<hadm_id>.xlsx
eval_dfs = []
# Sorted so that the order of `eval_dfs` is reproducible (iterdir order is arbitrary)
for annotator_dir in sorted(ANNOTATED_DIR.iterdir()):
    if not annotator_dir.is_dir():
        continue

    for hadm_id_dir in sorted(annotator_dir.iterdir()):
        if not hadm_id_dir.is_dir():
            continue

        hadm_id = hadm_id_dir.stem
        df = pd.read_excel(
            (hadm_id_dir / f"discharge_summary_{hadm_id}.xlsx"),
            engine="openpyxl",
            header=1,
        )

        # Drop empty rows
        df = df.dropna(axis=0, how="all")

        # TODO: Remove when fixed in annotator
        mask = (df["Section"] == "Allergies And Adverse Reaction") & (
            df["Field"].isnull()
        )
        if mask.any():
            df.loc[mask, "Field"] = "Causative Agent"
            df.loc[mask, "Value"] = "No known drug allergies or adverse reactions"
            # Append the companion "Description Of Reaction" row under a new index
            # label (-1). Its Section is set explicitly: the row lands at the end of
            # the frame, so the forward fill below would otherwise give it the
            # section of the last row in the sheet.
            df.loc[-1, "Section"] = "Allergies And Adverse Reaction"
            df.loc[-1, "Field"] = "Description Of Reaction"
            df.loc[-1, "Value"] = "No known drug allergies or adverse reactions"

        # Fill empty sections and fields with whatever is above
        df["Section"] = df["Section"].ffill()
        df["Field"] = df["Field"].ffill()
        # Autopopulated fields are not generated by LLM so we can drop them
        # from evaluation. Copy so the assignments below act on an independent
        # frame instead of a slice (avoids SettingWithCopyWarning).
        df = df.loc[df["Value"] != "Autopopulated"].copy()

        # Empty cells are not errors so set to 0
        df[ERROR_COLUMNS] = df[ERROR_COLUMNS].fillna(0)

        # Help with grouping: strip the numeric suffix from repeated allergy fields
        df["Field"] = df["Field"].str.replace(
            r"Causative Agent [0-9]+", "Causative Agent", regex=True
        )
        df["Field"] = df["Field"].str.replace(
            r"Description Of Reaction [0-9]+", "Description Of Reaction", regex=True
        )

        # Help with identification downstream. These are plain Python attributes
        # (not columns), so they only live on this exact DataFrame object.
        df.hadm_id = hadm_id
        df.annotator = annotator_dir.stem
        eval_dfs.append(df)
len(eval_dfs)

In [None]:
# Distinct admission ids that have at least one annotated evaluation
eval_hadm_ids = set()
for eval_df in eval_dfs:
    eval_hadm_ids.add(eval_df.hadm_id)
len(eval_hadm_ids)

In [None]:
# Parsed discharge summary JSON for every evaluated admission, keyed by admission id
hadm_id_to_discharge_summary = {}
for directory in RAW_GENERATED_DIR.iterdir():
    # Skip stray files and admissions that were never annotated
    if not directory.is_dir() or directory.stem not in eval_hadm_ids:
        continue
    summary_path = directory / "discharge_summary.json"
    hadm_id_to_discharge_summary[directory.stem] = json.loads(summary_path.read_text())

This could have been stored more cleanly, but the raw messages are delimited by a line of 80 asterisks and appear in the following order:
- system message
- one shot example
- user physician notes
- llm response
- time taken

In [None]:
# Messages in raw_messages.txt are separated by a line of 80 asterisks
delimiter = "".join(("\n", "*" * 80, "\n"))

# List of raw message strings for every evaluated admission, keyed by admission id
hadm_id_to_messages = {}
for directory in RAW_GENERATED_DIR.iterdir():
    if directory.is_dir() and directory.stem in eval_hadm_ids:
        raw_text = (directory / "raw_messages.txt").read_text()
        hadm_id_to_messages[directory.stem] = raw_text.split(delimiter)

## Completion Metrics
### Token Lengths

In [None]:
# Encoder used for every token count below (encoding chosen by TOKENIZER_NAME)
tokenizer = tiktoken.get_encoding(TOKENIZER_NAME)

In [None]:
# The messages before the physician note (indices 0-2) are the same for all
# inputs, so measuring them on any single admission is enough
messages = next(iter(hadm_id_to_messages.values()))
prompt_token_length = 0
for message in messages[:3]:
    prompt_token_length += len(tokenizer.encode(message))
prompt_token_length

In [None]:
# Token count of the physician note message (message 3) for each admission
input_note_message_length = []
for admission_messages in hadm_id_to_messages.values():
    physician_note_tokens = tokenizer.encode(admission_messages[3])
    input_note_message_length.append(len(physician_note_tokens))

In [None]:
# Token count of the generated summary for each admission
generated_summary_message_length = [
    # Message 4 is the LLM response (message 3 is the physician note message)
    len(tokenizer.encode(messages[4]))
    for messages in hadm_id_to_messages.values()
]

### Calc average time and costs

In [None]:
# Inference time per admission, parsed from the second-to-last message:
# the number is whatever follows the first ": " in that message
inference_times = []
for admission_messages in hadm_id_to_messages.values():
    time_message_parts = admission_messages[-2].split(": ")
    inference_times.append(float(time_message_parts[1]))

In [None]:
# Estimated cost per admission: messages 0-3 are billed as input tokens,
# message 4 (the LLM response) as output tokens
costs = []
for messages in hadm_id_to_messages.values():
    input_token_counts = [len(tokenizer.encode(message)) for message in messages[:4]]
    num_input_tokens = sum(input_token_counts)
    num_output_tokens = len(tokenizer.encode(messages[4]))

    input_cost = num_input_tokens / 1000 * GPT_4_TURBO_INPUT_COST_PER_1K
    output_cost = num_output_tokens / 1000 * GPT_4_TURBO_OUTPUT_COST_PER_1K
    costs.append(input_cost + output_cost)

In [None]:
# Summary table: one row per completion metric, one column per percentile
intervals = [25, 50, 75, 100]
completion_metrics_df = pd.DataFrame.from_records(
    (
        (np.percentile(input_note_message_length, intervals)),
        (np.percentile(generated_summary_message_length, intervals)),
        (np.percentile(inference_times, intervals)),
        (np.percentile(costs, intervals)),
    ),
)
# A flat list of labels gives a regular Index; a nested list ([[...]]) would be
# interpreted as the levels of a (single-level) MultiIndex instead.
completion_metrics_df.index = [
    "De-Duplicated Physician Note Length / Tokens",
    "Output Note Length / Tokens",
    "Inference Time / secs",
    "Inference Cost / $",
]
completion_metrics_df.columns = [f"{interval}th Percentile" for interval in intervals]
completion_metrics_df

## Extractive

Find % of entries that were extractive i.e. directly from source text.

In [None]:
def find_json_strings(json_object):
    """Collect every string value found in a decoded JSON structure.

    Lists and dict values are searched recursively, depth first and in their
    original order. Dict keys are not collected, and any value that is not a
    string, list or dict is ignored.

    Returns:
        A list of the strings found, in traversal order.
    """

    def _iter_strings(node):
        if isinstance(node, str):
            yield node
        elif isinstance(node, list):
            for element in node:
                yield from _iter_strings(element)
        elif isinstance(node, dict):
            for element in node.values():
                yield from _iter_strings(element)

    return list(_iter_strings(json_object))

In [None]:
# Count generated sentences, and how many of them appear verbatim
# (case-insensitively) in the physician notes message of the same admission
num_extractive_sentences = 0
num_generated_sentences = 0

for hadm_id, admission_messages in hadm_id_to_messages.items():
    summary_strings = find_json_strings(hadm_id_to_discharge_summary[hadm_id])
    notes_lowercase = admission_messages[3].lower()

    for summary_string in summary_strings:
        # Sentences are tokenized naively on ". "
        for sentence in summary_string.split(". "):
            if sentence == "":
                continue
            num_generated_sentences += 1
            if sentence.lower() in notes_lowercase:
                num_extractive_sentences += 1

# Fraction of generated sentences that are extractive
num_extractive_sentences / num_generated_sentences

## Precision Recall

Choose one of the two grouping keys below to aggregate on a per-field or per-section basis. Variable names assume that aggregation is done on a per-field basis.

In [None]:
# Columns used to group annotation rows: one group per field within a section
grouping_key = ["Section", "Field"]
# Alternative: uncomment to aggregate per section instead
# grouping_key = ["Section"]

Per dataframe error analysis

In [None]:
# Aggregate annotation errors per group for every evaluation, then sum the
# per-evaluation tables into `field_errors`.
total_errors_list = []
per_eval_field_errors = []
for eval_df in eval_dfs:
    # Sum num errors per group
    eval_df_grouped = eval_df.groupby(grouping_key)
    df_field_errors = eval_df_grouped[ERROR_COLUMNS].sum()

    # Add column for number of values per group
    # Value is counted as either an element in a list or a sentence (tokenized by ". ")
    df_field_errors["Num Values"] = [
        len(". ".join(values.values).split(". "))
        for _, values in eval_df_grouped["Value"]
    ]

    # Add column for number of values not found per group
    not_found_count = (
        eval_df[eval_df["Value"] == "Information not found in notes"]
        .groupby(grouping_key)["Value"]
        .count()
    )
    df_field_errors["Not Found"] = not_found_count
    # If no values were found, set to 0. Assign the result back rather than
    # using a chained `fillna(..., inplace=True)`, which acts on a temporary
    # under pandas Copy-on-Write and would leave the NaNs in place.
    df_field_errors["Not Found"] = df_field_errors["Not Found"].fillna(0)

    # Separate tracking of total errors for median calculation
    total_errors_list.append(df_field_errors[ERROR_COLUMNS].sum().sum())

    per_eval_field_errors.append(df_field_errors)

# Sum over evaluations per group. Unlike an in-place `+=` (which reindexes the
# result to the first table), this keeps groups that are absent from the first
# evaluation and does not turn groups missing from a later one into NaN.
field_errors = pd.concat(per_eval_field_errors).groupby(level=grouping_key).sum()

In [None]:
# Display the error counts summed over all evaluations, one row per group
field_errors

GP Practice is never filled (47 values, 47 not found). This is because there are no GPs in America! So we drop it from the analysis.

In [None]:
# Remove the "Gp Practice" rows from the aggregated errors. Rebinding instead of
# dropping in place, together with errors="ignore", keeps this cell safe to
# re-run: a second run is a no-op rather than a KeyError.
field_errors = field_errors.drop("Gp Practice", errors="ignore")

Median number of errors (mean is affected by extreme values)

In [None]:
# Median of the per-evaluation total error counts (all error columns, all groups)
np.median(total_errors_list)

Calc per field metrics

In [None]:
# Additions count as false positives, misses as false negatives
field_false_positives = field_errors["Added- Hallucination"].add(
    field_errors["Added- Not relevant"]
)
field_false_negatives = field_errors["Missed- Severe"].add(
    field_errors["Missed- Minor"]
)
# Every generated value that was not flagged as an addition is a true positive
field_true_positives = field_errors["Num Values"].sub(field_false_positives)

In [None]:
# Denominators: TP + FN for recall, TP + FP for precision
field_tp_plus_fn = field_true_positives + field_false_negatives
field_tp_plus_fp = field_true_positives + field_false_positives

field_recall = field_true_positives.div(field_tp_plus_fn)
field_precision = field_true_positives.div(field_tp_plus_fp)
# Harmonic mean of precision and recall
field_f1 = (field_precision * field_recall).mul(2).div(field_precision + field_recall)
# TP over TP + FN + FP
field_accuracy = field_true_positives.div(field_tp_plus_fn + field_false_positives)

Calc mean num elements and not found elements

In [None]:
# Per-group averages over the number of evaluations
num_evaluations = len(eval_dfs)
field_mean_num_elements = field_errors["Num Values"].div(num_evaluations)
field_mean_not_found = field_errors["Not Found"].div(num_evaluations)

In [None]:
# One row per group, one column per metric; the mapping keys become column names
df_metrics = pd.concat(
    {
        "Average Number of Elements": field_mean_num_elements,
        "Proportion Not Found in Notes": field_mean_not_found,
        "Recall": field_recall,
        "Precision": field_precision,
        "F1": field_f1,
        "Accuracy": field_accuracy,
    },
    axis=1,
)

In [None]:
# Display the per-group metrics table
df_metrics

Repeat same calculations but summed across all evaluations to give micro averages

In [None]:
# Column totals over all groups
total_errors = field_errors.sum()

# Averages per evaluation
num_evaluations = len(eval_dfs)
total_num_elements = total_errors["Num Values"] / num_evaluations
total_not_found = total_errors["Not Found"] / num_evaluations

# Additions count as false positives, misses as false negatives
total_hallucinated = total_errors["Added- Hallucination"]
total_not_relevant = total_errors["Added- Not relevant"]
total_false_positives = total_hallucinated + total_not_relevant

total_missed_severe = total_errors["Missed- Severe"]
total_missed_minor = total_errors["Missed- Minor"]
total_false_negatives = total_missed_severe + total_missed_minor

total_true_positives = total_errors["Num Values"] - total_false_positives

In [None]:
# Micro-averaged metrics computed from the counts summed over all groups
total_tp_plus_fn = total_true_positives + total_false_negatives
total_tp_plus_fp = total_true_positives + total_false_positives

micro_recall = total_true_positives / total_tp_plus_fn
micro_precision = total_true_positives / total_tp_plus_fp
micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
micro_accuracy = total_true_positives / (total_tp_plus_fn + total_false_positives)

In [None]:
# Share of each error type among all recorded errors, as a percentage
error_type_totals = total_errors[ERROR_COLUMNS]
error_proportions = error_type_totals / error_type_totals.sum() * 100
error_proportions

In [None]:
# Column means of the per-group metrics table
macro_averages = df_metrics.mean(axis=0)
print(macro_averages)

# Micro averages, in the order: recall, precision, F1, accuracy
micro_averages = [micro_recall, micro_precision, micro_f1, micro_accuracy]
print(micro_averages)

## Inter annotator agreement

Get paired annotations

In [None]:
# Group the evaluation frames by the admission they annotate
hadm_id_to_annotator_dfs = defaultdict(list)
for eval_df in eval_dfs:
    annotations_for_admission = hadm_id_to_annotator_dfs[eval_df.hadm_id]
    annotations_for_admission.append(eval_df)

# Keep only admissions that have exactly two evaluations
paired_hadm_id_to_annotator_dfs = {}
for hadm_id, dfs in hadm_id_to_annotator_dfs.items():
    if len(dfs) == 2:
        paired_hadm_id_to_annotator_dfs[hadm_id] = dfs

Each annotator is a 4-d vector of the number of errors of each type

In [None]:
# For every doubly-annotated admission, pair up the two annotators' error
# vectors (one count per entry of ERROR_COLUMNS) group by group.
multi_annotated_fields: list[tuple[tuple, tuple]] = []
for annotator_dfs in paired_hadm_id_to_annotator_dfs.values():
    # Select the error columns before summing so that free-text columns are
    # never aggregated (same pattern as the per-dataframe error analysis).
    annotator_1_errors = annotator_dfs[0].groupby(grouping_key)[ERROR_COLUMNS].sum()
    annotator_2_errors = annotator_dfs[1].groupby(grouping_key)[ERROR_COLUMNS].sum()

    # Align on the group labels instead of pairing rows by position: if the two
    # sheets do not contain exactly the same groups, a positional zip would
    # silently compare different fields. Only groups present for both
    # annotators are compared.
    annotator_1_errors, annotator_2_errors = annotator_1_errors.align(
        annotator_2_errors, join="inner", axis=0
    )

    annotator_1_fields = annotator_1_errors.values.astype(int).tolist()
    annotator_2_fields = annotator_2_errors.values.astype(int).tolist()

    multi_annotated_fields.extend(
        (tuple(annotator_1_field), tuple(annotator_2_field))
        for annotator_1_field, annotator_2_field in zip(
            annotator_1_fields, annotator_2_fields
        )
    )
len(multi_annotated_fields)

In [None]:
# Collapse each 4-element error vector into (misses, additions):
# the first two entries are summed together, and so are the last two
fn_and_fp_counts = []
for anno_0, anno_1 in multi_annotated_fields:
    counts_0 = (sum(anno_0[:2]), sum(anno_0[2:]))
    counts_1 = (sum(anno_1[:2]), sum(anno_1[2:]))
    fn_and_fp_counts.append((counts_0, counts_1))

In [None]:
# Fraction of paired groups where both annotators recorded identical counts
num_agreeing_pairs = sum(y_0 == y_1 for y_0, y_1 in fn_and_fp_counts)
fn_and_fp_agreement = num_agreeing_pairs / len(fn_and_fp_counts)
fn_and_fp_agreement