In [None]:
import json
from pathlib import Path

import numpy as np
import tiktoken

In [None]:
# --- Paths (resolved against the directory the notebook is launched from) ---
ANNOTATED_DIR = Path.cwd().joinpath("outputs", "annotated")
RAW_GENERATED_DIR = Path.cwd().joinpath("outputs", "output_eval_v3")

# --- Pricing, applied per 1,000 tokens in the cost-estimate cell ---
# NOTE(review): currency is not stated in this notebook (presumably USD) — confirm.
GPT_4_TURBO_INPUT_COST_PER_1K = 0.01
GPT_4_TURBO_OUTPUT_COST_PER_1K = 0.03

# --- Name of the tiktoken encoding used for every token count below ---
TOKENIZER_NAME = "cl100k_base"

In [None]:
# Parse the discharge summary JSON found in each entry of RAW_GENERATED_DIR.
# Every entry is expected to be a directory holding "discharge_summary.json";
# the entry's stem is used as the key (presumably the hospital admission id,
# going by the variable name — confirm against the generation script).
hadm_id_to_discharge_summary = {}
for admission_dir in RAW_GENERATED_DIR.iterdir():
    summary_path = admission_dir / "discharge_summary.json"
    hadm_id_to_discharge_summary[admission_dir.stem] = json.loads(
        summary_path.read_text()
    )

In [None]:
# Separator written between messages in raw_messages.txt: a line of 80
# asterisks with a newline on each side. (The variable name is misspelled
# but kept as-is because other cells may refer to it.)
delimter = f"\n{'*' * 80}\n"

# Split each entry's raw_messages.txt into its individual messages, keyed by
# the entry's stem — the same keying as the discharge-summary mapping.
hadm_id_to_messages = {}
for admission_dir in RAW_GENERATED_DIR.iterdir():
    raw_messages_text = (admission_dir / "raw_messages.txt").read_text()
    hadm_id_to_messages[admission_dir.stem] = raw_messages_text.split(delimter)

## Extractive

In [None]:
def find_json_strings(json_object):
    """Collect every string value found anywhere inside a decoded JSON value.

    The value is walked depth-first: lists are visited element by element and
    dicts value by value (dict keys are never collected). Anything that is not
    a ``str``, ``list`` or ``dict`` — numbers, booleans, ``None``, tuples — is
    ignored.

    Args:
        json_object: A decoded JSON value (string, list, dict, or scalar).

    Returns:
        A list of the strings encountered, in traversal order.
    """

    def iter_strings(node):
        # Leaf case: a string is yielded as-is.
        if isinstance(node, str):
            yield node
        elif isinstance(node, list):
            for element in node:
                yield from iter_strings(element)
        elif isinstance(node, dict):
            for child in node.values():
                yield from iter_strings(child)

    return list(iter_strings(json_object))

In [None]:
# Fraction of generated "sentences" that appear verbatim (case-insensitively)
# in the text of message index 3 for the same admission.
num_extractive_sentences = 0
num_generated_sentences = 0

for hadm_id in hadm_id_to_messages:
    discharge_summary_json = hadm_id_to_discharge_summary[hadm_id]
    # Message index 3 is the text the generated sentences are matched against.
    physician_notes_text_lowercase = hadm_id_to_messages[hadm_id][3].lower()

    json_strings = find_json_strings(discharge_summary_json)

    # Naive sentence segmentation: split every string on ". " and drop the
    # empty pieces that the split can leave behind.
    json_sentences_lowercase = []
    for json_string in json_strings:
        for fragment in json_string.split(". "):
            if fragment:
                json_sentences_lowercase.append(fragment.lower())

    # A sentence counts as extractive when it is a substring of the notes.
    extractive_sentences = [
        candidate
        for candidate in json_sentences_lowercase
        if candidate in physician_notes_text_lowercase
    ]
    num_extractive_sentences += len(extractive_sentences)
    num_generated_sentences += len(json_sentences_lowercase)

num_extractive_sentences / num_generated_sentences

## Token lengths

In [None]:
# Reference table: abstract length by percentile
# percentile:     25th    50th    75th
# # characters     825   1,263   1,679
# # tokens         177     275     383

## Prompt lengths

In [None]:
# Load the tiktoken encoding named by TOKENIZER_NAME; every token count in
# the cells below is computed with this encoder.
tokenizer = tiktoken.get_encoding(TOKENIZER_NAME)

In [None]:
# Total token count of the first three messages, measured on whichever
# admission the mapping yields first.
# NOTE(review): presumably these three messages are the fixed prompt shared
# by every admission, so a single sample is representative — confirm.
messages = next(iter(hadm_id_to_messages.values()))
prompt_token_length = sum(
    len(token_ids) for token_ids in map(tokenizer.encode, messages[:3])
)
prompt_token_length

In [None]:
# Length of message index 3 for every admission, measured both in characters
# and in tokens.
note_messages = [messages[3] for messages in hadm_id_to_messages.values()]
note_message_length_char = [len(note) for note in note_messages]
note_message_length_token = [len(tokenizer.encode(note)) for note in note_messages]

# Print the quartiles and the maximum: characters first, then tokens.
for lengths in (note_message_length_char, note_message_length_token):
    print(np.percentile(lengths, [25, 50, 75]), np.max(lengths))

## Completion metrics

In [None]:
# The second-to-last message of each admission carries the completion time as
# "<label>: <number>"; keep the part after the first ": " and parse it as a
# float.
# NOTE(review): the unit is not stated in this notebook (presumably seconds)
# — confirm against the script that wrote raw_messages.txt.
completion_times = [
    float(messages[-2].split(": ")[1]) for messages in hadm_id_to_messages.values()
]

# Quartiles and maximum of the completion times. The original cell printed
# this identical line twice; the copy-pasted duplicate has been removed.
print(np.percentile(completion_times, [25, 50, 75]), np.max(completion_times))

In [None]:
# Estimated cost per admission: the first four messages are billed as input
# tokens and message index 4 as output tokens, each at its per-1K rate.
costs = []
for messages in hadm_id_to_messages.values():
    input_token_counts = [len(tokenizer.encode(message)) for message in messages[:4]]
    num_input_tokens = sum(input_token_counts)
    num_output_tokens = len(tokenizer.encode(messages[4]))

    input_cost = num_input_tokens / 1000 * GPT_4_TURBO_INPUT_COST_PER_1K
    output_cost = num_output_tokens / 1000 * GPT_4_TURBO_OUTPUT_COST_PER_1K
    costs.append(input_cost + output_cost)

# Quartiles and maximum of the per-admission cost.
cost_quartiles = np.percentile(costs, [25, 50, 75])
print(cost_quartiles, np.max(costs))