# LLM as a judge for Summarization Engine - version 2.2

Some improvements to tackle over 2.0:
- Give better understanding of common symptoms and emphasize its inclusion in symptoms list. Specifically orthopnea (requiring pillows during sleep) is often missed in symptoms list.
- Better grasp of sections in a summary JSON output.
- Clarify which types of outputs do not count as diagnosis and are considered okay (should not penalize in `no_diagnosis` output).

In [56]:
import time
import json
from pprint import pprint

import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from dotenv import load_dotenv

# Load API keys from .env file
load_dotenv()

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 300)

In [9]:
# Create the ChatOpenAI model instance

# model_name = "gpt-3.5-turbo-0125"  # release date: 2024-01-25
# model_name = "gpt-3.5-turbo-1106"  # release date: 2023-11-06
# model_name = "gpt-4o-2024-05-13"  # release date: 2024-05-13
model_name = "gpt-4o-mini"  # release date: 2024-07-18

model = ChatOpenAI(temperature=0.0, model_name=model_name)

### Specify patient transcript file to read in
transcripts_version = 1.0

## System Prompt

In [158]:
# Define the system message for the evaluation

presence_description = "Does the SUMMARY json output contain a section with the key '{topic}'? Note that this criteria looks for the presence of a section, and not whether the section agrees or matches with TRANSCRIPT."

judge_criteria = {
    "patient_overview": {
        "intro_patient_present": presence_description.format(topic="patient_overview") + " Also, does the SUMMARY introduce patient by name?",
    },

    "current_symptoms": {
        "current_symptoms_present": presence_description.format(topic="current_symptoms"),
        "current_symptoms_agree": "Does 'current_symptoms' section in the SUMMARY loosely agree with TRANSCRIPT? If so, output as 1. Specifically, in SUMMARY, please look in the 'current_symptoms' section only and ignore the 'patient_overview' and 'summary' sections. If there's a symptom that the patient is actively experiencing in 'summary' or 'patient_overview', that symptom must be present in the 'current_symptoms' too. This includes synonym phrases (e.g. 'prop up with pillows' equals to 'orthopnea'). Please look for a loose match only, and ignore the accuracy of mentioned symptoms, i.e. please ignore judging on descriptions of symptom's frequency/severity/details (e.g. 'on and off', 'severe', 'only when moving'). E.g., if TRANSCRIPT says 'chest pain on and off' and SUMMARY says 'chest pain' only, the criteria is still met (output as 1). Note that symptoms not related to heart failure should be included.",
        # special case: orthopnea
        "orthopnea_agree": "Does the 'current_symptoms' section in the SUMMARY accurately represent the patient's claim of needing to prop up with pillows to breathe comfortably? If patient says they need pillows to prop up while laying flat, mark as 1 if 'orthopnea'/'pillows' is mentioned in 'current_symptoms', otherwise mark as 0. Alternatively, if the patient says they do not need pillows to prop up while laying flat, mark as 1 if 'orthopnea'/'pillows' is not mentioned in 'current_symptoms', otherwise mark as 0. Specifically, in SUMMARY, please look in the 'current_symptoms' section only and ignore the 'patient_overview' and 'summary' sections. In your assessment, first describe if patient needs pillows, then reiterate what's listed in 'current_symptoms' section (do not refer to other sections), then mention if 'orthopnea'/'pillows' is mentioned in 'current_symptoms' section, then make your comparison.",
    },

    "vital_signs": {
        "vital_signs_present": presence_description.format(topic="vital_signs"),
        "vital_signs_agree": "Do the vital signs in the SUMMARY and TRANSCRIPT agree?",
    },

    "current_medications": {
        "medications_present": presence_description.format(topic="current_medications"),
        "medications_agree": "Do the medications in the SUMMARY and TRANSCRIPT agree?",
    },

    "summary": {
        "summary_overview_present": presence_description.format(topic="summary") + " Also, does this section give an overview of the content of the TRANSCRIPT",
        "no_diagnose": "The SUMMARY is free from interpretation of results (avoided words like 'stable') and is free from diagnosis. Note that narration of patient's words is allowed (like 'patient thinks that they have...' or 'patient is experiencing...' or 'patient confirms that...'); still output a result of 1. Notes of advice like reminder to take meds or monitor symptoms are also allowed. However, predictions of future events (patient is likely/unlikely to...) are interpretations and are not allowed (mark as 0). These specific phrases are also not allowed: 'vital signs are stable', 'within normal range', 'heart rate is higher than normal', 'vital signs are within normal limits'.",
        # special cases: normality and stability
        "no_normality": "The SUMMARY does not contain any mention of 'normal' or 'within normal limits' in the context of patient's health. If the patient's health is described as 'normal' or 'within normal limits' in SUMMARY, mark as 0. Otherwise, mark as 1. In your assessment, please say if the idea of 'normal' is mentioned, then make your comparison.",
        "no_stability": "The SUMMARY does not contain any mention of 'stable' in the context of patient's health. If the patient's health is described as 'stable' in SUMMARY, mark as 0. Otherwise, mark as 1. In your assessment, please say if the idea of 'stable' is mentioned, then make your comparison.",
    },
}

system_message_summary_judge = """You are evaluating a summarization engine that has generated a SUMMARY of a doctor-patient dialogue TRANSCRIPT based on a set of criteria. Your evaluation will consist of answering specific questions about the SUMMARY with 1 (Yes) and 0 (No) responses. The SUMMARY quality will depend on the TRANSCRIPT.
{output_format}

CRITERIA (CSV column names, then a description):
{criteria}

ADDITIONAL INFORMATION: the following are common heart failure symptoms and their descriptions. Any mention of these medical terms or similar phrases do not count as a "diagnosis" in the context of this evaluation. If the patient claims some of these phrases for themselves (e.g. "I need to prop myself up with pillows"), it is a symptom and not a diagnosis, and the symptom should have been included in the symptoms list (e.g. "orthopnea" or "needs pillows" should be present).
- Dyspnea: shortness of breath, whether occurring at rest, walking, or climbing stairs
- Paroxysmal Nocturnal Dyspnea (PND): sudden shortness of breath that wakes patient up at night
- Orthopnea: needing to prop up with pillows to breathe comfortably while lying down
- Edema: swelling in your ankles or legs
- Nocturnal Cough: a cough especially at night
- Chest Pain
- Fatigue and Mental Status: feeling more tired than usual ("feeling tired" and "fatigue" are synonyms), or experience sudden changes in mental clarity (or mental status)

ADDITIONAL INFORMATION: "Do you need to prop yourself up with pillows to breathe comfortably" is a direct question on orthopnea (which is a term that patients don't understand), and as such, "Orthopnea" and "needing to prop up with pillows" are considered as one and the same.
- Orthopnea is often omitted in 'current_symptoms' by the summarizer when a patient claims they need pillows to prop up while laying flat; when this happens, it is a criteria violation for `current_symptoms_agree`.
- However, if the 'current_symptoms' section mentions 'orthopnea' but the TRANSCRIPT says only needing pillows (no explicit mention of 'orthopnea'), it is not a criteria violation (keep the score as 1).

"""

output_csv_format = """Generate a CSV row with the appropriate 1 or 0 for each criteria in the order specified below."""

output_reasoning_format = """In separate lines, first make a brief assessment of the criteria on the SUMMARY to justify your decision, then state each criteria's value (1 or 0). When explaining your assessment/reasoning, if there are issues that result in a 0, be very specific and please refer to texts in SUMMARY that is the offender. If it's a 1 (yes/no issues), keep your assessment very short.
Lastly, in one last new line, please provide any short additional observations or suggestions for improvement (2 sentences), but do not repeat evaluation points previously made. Be specific with examples, and be concise with words.
For example:
intro_patient_present,"Patient name is introduced; criteria passed hence the score is 1",1
current_symptoms_present,"No symptoms are reported in SUMMARY; criteria failed hence the score is 0",0
orthopnea_agree,"Patient needs pillows to prop up while laying flat; current_symptoms listed: 'dyspnea, fatigue'; 'orthopnea' is not listed; criteria failed hence the score is 0",0
vital_signs_agree,"Heart rate in SUMMARY is 130, but in TRANSCRIPT it's 131; criteria failed hence the score is 0",0
current_symptoms_agree,"Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1",1
medications_agree,"Vitamins reported in SUMMARY is not in TRANSCRIPT; criteria failed hence the score is 0",0
OBSERVATION:write your two-sentence observation/improvement here
"""

human_message_summary_judge = """
TRANSCRIPT: {transcript}

SUMMARY: {summary}
"""

def get_system_message_summary_judge(judge_criteria: dict):
    return system_message_summary_judge.format(
        output_format=output_reasoning_format,
        criteria="\n".join([f"{key},{value}" for key, value in judge_criteria.items()])
    )


def parse_response(response_content: str):
    """Function to validate and parse the response.

    Example response:
        'intro_patient,"",1\n'
        'current_symptoms,"",1\n'
        'symptoms_agree,"Nose bleeding was mentioned in the summary, but not in the transcript.",0\n'

    Desired output
        {"intro_patient": {"value": 1, "reasoning": ""}, "current_symptoms": {"value": 1, "reasoning": ""}, ...}
    """
    response_list = response_content.split("\n")
    response_dict = {}
    for response in response_list:
        if response:
            try:
                (criteria, back_split) = response.split(",", 1)
                (reasoning, value) = back_split.rsplit(",", 1)
                response_dict[criteria] = {"value": int(value), "reasoning": reasoning.strip('"')}
            except ValueError:
                # print(f"Error parsing response: {response}")
                pass

    # additional wrangling on all: find phrase 'criteria passed hence the score is 1' and 'criteria failed hence the score is 0'
    # if found, override the value with 1 or 0
    for key, value in response_dict.items():
        if "criteria passed hence the score is 1" in value["reasoning"]:
            response_dict[key]["value"] = 1
        elif "criteria failed hence the score is 0" in value["reasoning"]:
            response_dict[key]["value"] = 0

    # remainder text is the observations
    # response_dict["observations"] = "\n".join(response_content.split("\n")[expected_fields:]).strip()
    # find a line that starts with 'observation:' and use it as the observation
    response_dict["observations"] = ""
    for line in response_content.split("\n"):
        if line.lower().startswith("observation:"):
            response_dict["observations"] = line.split(":", 1)[1].strip()
            break

    return response_dict

pprint(get_system_message_summary_judge(judge_criteria["current_symptoms"]), width=120)

('You are evaluating a summarization engine that has generated a SUMMARY of a doctor-patient dialogue TRANSCRIPT based '
 'on a set of criteria. Your evaluation will consist of answering specific questions about the SUMMARY with 1 (Yes) '
 'and 0 (No) responses. The SUMMARY quality will depend on the TRANSCRIPT.\n'
 'In separate lines, first make a brief assessment of the criteria on the SUMMARY to justify your decision, then state '
 "each criteria's value (1 or 0). When explaining your assessment/reasoning, if there are issues that result in a 0, "
 "be very specific and please refer to texts in SUMMARY that is the offender. If it's a 1 (yes/no issues), keep your "
 'assessment very short.\n'
 'Lastly, in one last new line, please provide any short additional observations or suggestions for improvement (2 '
 'sentences), but do not repeat evaluation points previously made. Be specific with examples, and be concise with '
 'words.\n'
 'For example:\n'
 'intro_patient_present,"Patient 

## Import Transcript & Summary Files

In [159]:
# Specify the path to your JSON file
transcripts_json_file_path = f"../../data/patients/patients_{transcripts_version}_with_transcripts_terminated_manual_correct.json"

# Open and read the JSON file
with open(transcripts_json_file_path, 'r') as json_file:
    transcripts = json.load(json_file)

# Specify the path to your summaries JSON file
summaries_json_file_path = f"../../data/patients/patients_{transcripts_version}_summaries.json"

# Open and read the JSON file
with open(summaries_json_file_path, 'r') as json_file:
    summaries = json.load(json_file)

# Specify the CSV file path (make sure it is a file, not a directory)
csv_file_path = f"../../data/evaluations/summaries_{transcripts_version}_evaluation_2.2_split.csv"

# Additionally make tweaks to JSON keys for this evaluation
keys_to_convert = {
    "Patient Overview": "patient_overview",
    "Current Symptoms": "current_symptoms",
    "Vital Signs": "vital_signs",
    "Current Medications": "current_medications",
    "Summary": "summary",
}
for patient_id, summary in summaries.items():
    new_summary = {}
    for key, value in summary["summary"].items():
        if key in keys_to_convert:
            new_summary[keys_to_convert[key]] = value
        else:
            new_summary[key] = value
    summaries[patient_id]["summary"] = new_summary

# # -----
# # for testing only; comment when not needed -- only try to generate one summary

# # # to get a specific patient's transcript
# # specific_patient_id = 18136989
# # sample_patient_idx = list(transcripts.keys()).index(str(specific_patient_id))

# # to get a random patient's transcript
# sample_patient_idx = 0

# transcripts = {k: v for k, v in transcripts.items() if k == list(transcripts.keys())[sample_patient_idx]}
# summaries = {k: v for k, v in summaries.items() if k == list(transcripts.keys())[0]}
# sole_patient_id = list(summaries.keys())[0]

# # tinker -- corrupt the summary to test the evaluator
# summaries[sole_patient_id]["summary"]["current_medications"] = "Furosemide and Spironolactone"
# summaries[sole_patient_id]["summary"]["current_symptoms"] += "\nNose bleeding"
# summaries[sole_patient_id]["summary"]["vital_signs"] = 'Temperature: 98.0 degrees\nHeart Rate: 63 beats per minute\nRespiratory Rate: 23 breaths per minute\nOxygen Saturation: 94.0%\nBlood Pressure: 123/56\nWeight: N/A'
# summaries[sole_patient_id]["summary"]["summary"] += " Patient is unlikely to relapse."
# # -----

In [160]:
# Cell to actually invoke the model and write the results to a CSV file

start_time = time.time()
# for each criteria, add two columns: one for the value (same name) and one for the reasoning (suffix _reasoning)
column_order = ["transcript_number"]
all_criteria = []
for section_name in judge_criteria.keys():
    for criteria in judge_criteria[section_name].keys():
        column_order.append(criteria)
        all_criteria.append(criteria)
        column_order.append(f"{criteria}_reasoning")
column_order.append("observations")

all_rows_series: list[pd.Series] = []
all_responses = []
# Loop through each transcript number, invoke the model, and write the results
for i, patient_number in enumerate(transcripts.keys()):
    print(".", end="")
    if (i + 1) % 10 == 0:
        print("|", end="")

    transcript = transcripts[patient_number]['chat_transcript']
    summary = summaries[patient_number]['summary']

    combined_response = {}
    response_dict = {}
    all_observations = ""
    for section_name in summary.keys():
        # print(f"\n\nEvaluating {section_name} for patient {patient_number}")
        prompt = (
            SystemMessage(content=get_system_message_summary_judge(judge_criteria[section_name]))
            + human_message_summary_judge
        )

        # subset of summary
        summary_subset = {section_name: summary[section_name]}

        # Get the response
        response = model.invoke(prompt.format_messages(transcript=transcript, summary=summary_subset))
        combined_response[section_name] = response
        subset_response_dict = parse_response(response.content)

        # filter to only the criteria + observations
        subset_response_dict = {k: v for k, v in subset_response_dict.items() if k in judge_criteria[section_name].keys() or k == "observations"}

        # pprint(subset_response_dict)
        # extract observations
        if "observations" in subset_response_dict:
            observation = subset_response_dict.pop("observations")
            all_observations += f"Observations for {section_name}:\n{observation}\n\n"
        response_dict.update(subset_response_dict)

    all_responses.append(combined_response)
    response_dict["observations"] = all_observations

    # add to dataframe
    row_to_add = {
        "transcript_number": patient_number,
        **{k: v["value"] for k, v in response_dict.items() if k in all_criteria},
        **{f"{k}_reasoning": v["reasoning"] for k, v in response_dict.items() if k in all_criteria},
        "observations": response_dict["observations"]
    }
    all_rows_series.append(pd.Series(row_to_add))

total_time = time.time() - start_time
print(f"\n\nTime taken to evaluate {len(transcripts)} transcripts: {total_time:.2f} seconds ({total_time / len(transcripts):.2f} seconds per transcript)")

# create dataframe
df = pd.DataFrame(all_rows_series)[column_order]

pd.set_option('display.max_columns', None)

display(df)

# Write the dataframe to a CSV file
df.to_csv(csv_file_path, index=False)
print(f"CSV file has been created at: {csv_file_path}")

..........|..........|

Time taken to evaluate 20 transcripts: 281.95 seconds (14.10 seconds per transcript)


Unnamed: 0,transcript_number,intro_patient_present,intro_patient_present_reasoning,current_symptoms_present,current_symptoms_present_reasoning,current_symptoms_agree,current_symptoms_agree_reasoning,orthopnea_agree,orthopnea_agree_reasoning,vital_signs_present,vital_signs_present_reasoning,vital_signs_agree,vital_signs_agree_reasoning,medications_present,medications_present_reasoning,medications_agree,medications_agree_reasoning,summary_overview_present,summary_overview_present_reasoning,no_diagnose,no_diagnose_reasoning,no_normality,no_normality_reasoning,no_stability,no_stability_reasoning,observations
0,12305811,1,Patient is introduced by name and the section 'patient_overview' is present; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient states they do not need pillows to prop up while lying down, and 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,The vital signs in the SUMMARY match those reported in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications in the SUMMARY (Furosemide, Spironolactone, Fish oil) match those mentioned in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's words; criteria passed hence the score is 1,1,The SUMMARY does not mention 'normal' or 'within normal limits'; criteria passed hence the score is 1,0,"The SUMMARY states 'your vital signs seem stable', which is a direct mention of 'stable'; criteria failed hence the score is 0","Observations for patient_overview:\nThe SUMMARY lacks critical details about the patient's symptoms and medications, which are essential for a comprehensive understanding of their condition. Including specific symptoms like ""swelling"" and medications such as ""Furosemide"" and ""Spironolactone"" wou..."
1,14185111,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,0,Current symptoms in SUMMARY do not fully match TRANSCRIPT; criteria failed hence the score is 0,0,"The patient states they need to use extra pillows to breathe better at night, but 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria failed hence the score is 0",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,The vital signs in the SUMMARY match those reported in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications in the SUMMARY (Beta-blockers, Diuretics) match those mentioned in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; criteria passed hence the score is 1,0,The phrase 'within normal limits' is present in the SUMMARY regarding vital signs; criteria failed hence the score is 0,1,There is no mention of 'stable' in the context of the patient's health; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks detail on current symptoms, vital signs, and medications, which are crucial for a comprehensive understanding of the patient's condition. Including specific symptoms like ""fatigue"" and ""ankle swelling"" along with vital signs and medications w..."
2,10339317,1,SUMMARY contains the key 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,"The 'current_symptoms' section only lists 'chest pain', which is consistent with the TRANSCRIPT; criteria passed hence the score is 1",1,The patient does not need pillows to breathe comfortably as they stated they haven't had trouble lying flat; 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1,1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,"The heart rate in the SUMMARY is 91, which matches the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (Enoxaparin, Nifedipine, Metoprolol, Atorvastatin) match those in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the TRANSCRIPT; criteria passed hence the score is 1,0,"The SUMMARY includes the phrase 'his vital signs are stable', which is an interpretation of results; criteria failed hence the score is 0",0,"The SUMMARY states 'his vital signs are stable', which implies a normal condition; criteria failed hence the score is 0",0,The SUMMARY mentions 'stable' in the context of vital signs; criteria failed hence the score is 0,"Observations for patient_overview:\nThe SUMMARY lacks critical details about the patient's symptoms and medications, which are essential for a comprehensive understanding of the patient's condition. Including a more detailed account of the patient's symptoms and medications would significantly e..."
3,14807966,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient states they do not need to prop up with pillows to breathe comfortably, and 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,"The heart rate in the SUMMARY is 106, which matches the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,The medications listed in the SUMMARY match those in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's words and experiences; criteria passed hence the score is 1,0,The SUMMARY mentions 'normal respiratory rate' and 'good oxygen saturation'; criteria failed hence the score is 0,1,The SUMMARY does not mention 'stable' in the context of the patient's health; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks critical details about the patient's symptoms and medications, which are essential for a comprehensive understanding of their condition. Including specific symptoms like ""shortness of breath"" and ""fatigue,"" as well as the medications listed i..."
4,13912736,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient states they do not need to prop up with pillows while lying down, and 'orthopnea' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,The vital signs in the SUMMARY match those reported in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,The medications listed in the SUMMARY match those in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's words and experiences; criteria passed hence the score is 1,1,The SUMMARY does not mention 'normal' or 'within normal limits' in the context of the patient's health; criteria passed hence the score is 1,1,The SUMMARY does not mention 'stable' in the context of the patient's health; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks detail and fails to capture critical symptoms and medications discussed in the TRANSCRIPT. Including specific symptoms like ""swelling in ankles"" and the list of medications would significantly enhance the quality of the summary.\n\nObservatio..."
5,15338322,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient states they do not need to prop up with pillows while lying down, and 'orthopnea' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,The vital signs in the SUMMARY match those reported in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (rosuvastatin, furosemide, metoprolol) match those in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's statements; criteria passed hence the score is 1,1,The SUMMARY does not mention 'normal' or 'within normal limits'; criteria passed hence the score is 1,1,The SUMMARY does not mention 'stable' in the context of the patient's health; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks detail and fails to capture critical symptoms and vital signs that were discussed in the TRANSCRIPT. Including specific symptoms like ""shortness of breath"" and ""leg swelling,"" as well as vital signs, would significantly enhance the quality of..."
6,13166275,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,0,"The patient claims they need to prop up with pillows to breathe comfortably, but 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria failed hence the score is 0",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,0,"The heart rate in the SUMMARY is 122, but in the TRANSCRIPT it's 125; criteria failed hence the score is 0",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,The medications in the SUMMARY match those in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reports the patient's words; criteria passed hence the score is 1,0,The SUMMARY mentions 'normal respiratory rate'; this is a direct reference to normality in the context of health; criteria failed hence the score is 0,0,"The SUMMARY states 'stable temperature', which is a mention of stability in the context of health; criteria failed hence the score is 0","Observations for patient_overview:\nThe SUMMARY lacks critical details such as vital signs and medications, which are essential for a comprehensive understanding of the patient's condition. Including these elements would enhance the clarity and usefulness of the summary for future reference.\n\n..."
7,18136989,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,0,"The patient states they need to prop themselves up with pillows to breathe comfortably, but 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria failed hence the score is 0",1,The SUMMARY contains a section with the key 'vital_signs'; criteria passed hence the score is 1,1,"The heart rate in the SUMMARY is 89, which matches the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (ACE inhibitors, Beta-Blockers) match those in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's statements; criteria passed hence the score is 1,0,The SUMMARY mentions 'stable' in the context of vital signs; criteria failed hence the score is 0,0,The SUMMARY mentions 'stable' in the context of vital signs; criteria failed hence the score is 0,"Observations for patient_overview:\nThe SUMMARY lacks critical details such as current symptoms, vital signs, and medications, which are essential for a comprehensive overview of the patient's condition. Including these elements would enhance the clarity and usefulness of the summary for future ..."
8,15345003,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient claims they need to prop themselves up with pillows to breathe comfortably at night. The 'current_symptoms' section mentions 'Need to prop self up with pillows at night', which aligns with the patient's statement; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,"The heart rate in the SUMMARY is 92, which matches the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (beta-blocker, diuretic, statin, spironolactone) match those mentioned in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's words; criteria passed hence the score is 1,0,The phrase 'within normal limits' is present in the SUMMARY regarding vital signs; criteria failed hence the score is 0,1,There is no mention of 'stable' in the context of the patient's health in the SUMMARY; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks critical details about the patient's symptoms and medications, which are essential for a comprehensive understanding of the patient's condition. Including specific symptoms like 'orthopnea' and the full list of medications would significantly..."
9,17707918,1,SUMMARY contains the key 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,0,Current symptoms in SUMMARY do not match TRANSCRIPT; criteria failed hence the score is 0,1,"The patient does not mention needing pillows to breathe comfortably, and 'orthopnea' or 'pillows' are not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'vital_signs'; criteria passed hence the score is 1,1,Vital signs in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (Hydralazine, Metoprolol, Torsemide) match those in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's statements; criteria passed hence the score is 1,0,The SUMMARY mentions 'within normal limits' in the context of vital signs; criteria failed hence the score is 0,1,The SUMMARY does not mention 'stable' in the context of patient's health; criteria passed hence the score is 1,Observations for patient_overview:\nThe SUMMARY is overly simplistic and fails to capture critical information regarding the patient's heart failure status and vital signs. Including a more comprehensive overview of the patient's symptoms and medications would significantly enhance the quality o...


CSV file has been created at: ../../data/evaluations/summaries_1.0_evaluation_2.2_split.csv


In [161]:
# parse_response(response.content)
parse_response(all_responses[0]["current_symptoms"].content)

{'current_symptoms_present': {'value': 1,
  'reasoning': "The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1"},
 'current_symptoms_agree': {'value': 1,
  'reasoning': 'Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1'},
 'orthopnea_agree': {'value': 1,
  'reasoning': "The patient states they do not need pillows to prop up while lying down, and 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1"},
 'vital_signs_agree': {'value': 0,
  'reasoning': 'Vital signs are not mentioned in the SUMMARY; criteria failed hence the score is 0'},
 'medications_agree': {'value': 0,
  'reasoning': 'Medications listed in SUMMARY do not match those in TRANSCRIPT; criteria failed hence the score is 0'},
 'observations': "The SUMMARY could be improved by including the patient's vital signs and medications to provide a more comprehensive overview of the patient's condition. A

In [162]:
# what's an average response resource usage?
print("Average response resource usage:")
pprint(response.response_metadata)
print()

# what's the total resource usage? For 4o-mini
prompt_cost = 0.15 / 1e6  # $0.15 per 1M tokens
completion_cost = 0.60 / 1e6  # $0.60 per 1M tokens
total_prompt_tokens = sum(
    [sum([r.response_metadata["token_usage"]["prompt_tokens"] for r in res.values()]) for res in all_responses]
)
total_completion_tokens = sum(
    [sum([r.response_metadata["token_usage"]["completion_tokens"] for r in res.values()]) for res in all_responses]
)

print(f"Total prompt tokens: {total_prompt_tokens}, cost: ${(total_prompt_cost := total_prompt_tokens * prompt_cost):0.3f}, average: {total_prompt_tokens / len(all_responses)} tokens per response")
print(f"Total completion tokens: {total_completion_tokens}, cost: ${(total_completion_cost := total_completion_tokens * completion_cost):0.3f}, average: {total_completion_tokens / len(all_responses)} tokens per response")
print(f"Total cost: ${total_prompt_cost + total_completion_cost:0.3f}")

Average response resource usage:
{'finish_reason': 'stop',
 'logprobs': None,
 'model_name': 'gpt-4o-mini-2024-07-18',
 'system_fingerprint': 'fp_ba606877f9',
 'token_usage': {'completion_tokens': 166,
                 'prompt_tokens': 2092,
                 'total_tokens': 2258}}

Total prompt tokens: 208763, cost: $0.031, average: 10438.15 tokens per response
Total completion tokens: 18320, cost: $0.011, average: 916.0 tokens per response
Total cost: $0.042


In [None]:
patient_id = "13727871"
pprint(transcripts[patient_id]["chat_transcript"], width=120)
pprint(summaries[patient_id]["summary"], width=120)

In [163]:
# viewer: open the csv file as a pandas dataframe
df = pd.read_csv(csv_file_path)
display(df)

Unnamed: 0,transcript_number,intro_patient_present,intro_patient_present_reasoning,current_symptoms_present,current_symptoms_present_reasoning,current_symptoms_agree,current_symptoms_agree_reasoning,orthopnea_agree,orthopnea_agree_reasoning,vital_signs_present,vital_signs_present_reasoning,vital_signs_agree,vital_signs_agree_reasoning,medications_present,medications_present_reasoning,medications_agree,medications_agree_reasoning,summary_overview_present,summary_overview_present_reasoning,no_diagnose,no_diagnose_reasoning,no_normality,no_normality_reasoning,no_stability,no_stability_reasoning,observations
0,12305811,1,Patient is introduced by name and the section 'patient_overview' is present; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient states they do not need pillows to prop up while lying down, and 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,The vital signs in the SUMMARY match those reported in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications in the SUMMARY (Furosemide, Spironolactone, Fish oil) match those mentioned in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's words; criteria passed hence the score is 1,1,The SUMMARY does not mention 'normal' or 'within normal limits'; criteria passed hence the score is 1,0,"The SUMMARY states 'your vital signs seem stable', which is a direct mention of 'stable'; criteria failed hence the score is 0","Observations for patient_overview:\nThe SUMMARY lacks critical details about the patient's symptoms and medications, which are essential for a comprehensive understanding of their condition. Including specific symptoms like ""swelling"" and medications such as ""Furosemide"" and ""Spironolactone"" wou..."
1,14185111,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,0,Current symptoms in SUMMARY do not fully match TRANSCRIPT; criteria failed hence the score is 0,0,"The patient states they need to use extra pillows to breathe better at night, but 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria failed hence the score is 0",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,The vital signs in the SUMMARY match those reported in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications in the SUMMARY (Beta-blockers, Diuretics) match those mentioned in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; criteria passed hence the score is 1,0,The phrase 'within normal limits' is present in the SUMMARY regarding vital signs; criteria failed hence the score is 0,1,There is no mention of 'stable' in the context of the patient's health; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks detail on current symptoms, vital signs, and medications, which are crucial for a comprehensive understanding of the patient's condition. Including specific symptoms like ""fatigue"" and ""ankle swelling"" along with vital signs and medications w..."
2,10339317,1,SUMMARY contains the key 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,"The 'current_symptoms' section only lists 'chest pain', which is consistent with the TRANSCRIPT; criteria passed hence the score is 1",1,The patient does not need pillows to breathe comfortably as they stated they haven't had trouble lying flat; 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1,1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,"The heart rate in the SUMMARY is 91, which matches the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (Enoxaparin, Nifedipine, Metoprolol, Atorvastatin) match those in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the TRANSCRIPT; criteria passed hence the score is 1,0,"The SUMMARY includes the phrase 'his vital signs are stable', which is an interpretation of results; criteria failed hence the score is 0",0,"The SUMMARY states 'his vital signs are stable', which implies a normal condition; criteria failed hence the score is 0",0,The SUMMARY mentions 'stable' in the context of vital signs; criteria failed hence the score is 0,"Observations for patient_overview:\nThe SUMMARY lacks critical details about the patient's symptoms and medications, which are essential for a comprehensive understanding of the patient's condition. Including a more detailed account of the patient's symptoms and medications would significantly e..."
3,14807966,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient states they do not need to prop up with pillows to breathe comfortably, and 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,"The heart rate in the SUMMARY is 106, which matches the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,The medications listed in the SUMMARY match those in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's words and experiences; criteria passed hence the score is 1,0,The SUMMARY mentions 'normal respiratory rate' and 'good oxygen saturation'; criteria failed hence the score is 0,1,The SUMMARY does not mention 'stable' in the context of the patient's health; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks critical details about the patient's symptoms and medications, which are essential for a comprehensive understanding of their condition. Including specific symptoms like ""shortness of breath"" and ""fatigue,"" as well as the medications listed i..."
4,13912736,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient states they do not need to prop up with pillows while lying down, and 'orthopnea' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,The vital signs in the SUMMARY match those reported in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,The medications listed in the SUMMARY match those in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's words and experiences; criteria passed hence the score is 1,1,The SUMMARY does not mention 'normal' or 'within normal limits' in the context of the patient's health; criteria passed hence the score is 1,1,The SUMMARY does not mention 'stable' in the context of the patient's health; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks detail and fails to capture critical symptoms and medications discussed in the TRANSCRIPT. Including specific symptoms like ""swelling in ankles"" and the list of medications would significantly enhance the quality of the summary.\n\nObservatio..."
5,15338322,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient states they do not need to prop up with pillows while lying down, and 'orthopnea' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,The vital signs in the SUMMARY match those reported in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (rosuvastatin, furosemide, metoprolol) match those in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's statements; criteria passed hence the score is 1,1,The SUMMARY does not mention 'normal' or 'within normal limits'; criteria passed hence the score is 1,1,The SUMMARY does not mention 'stable' in the context of the patient's health; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks detail and fails to capture critical symptoms and vital signs that were discussed in the TRANSCRIPT. Including specific symptoms like ""shortness of breath"" and ""leg swelling,"" as well as vital signs, would significantly enhance the quality of..."
6,13166275,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,0,"The patient claims they need to prop up with pillows to breathe comfortably, but 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria failed hence the score is 0",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,0,"The heart rate in the SUMMARY is 122, but in the TRANSCRIPT it's 125; criteria failed hence the score is 0",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,The medications in the SUMMARY match those in the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reports the patient's words; criteria passed hence the score is 1,0,The SUMMARY mentions 'normal respiratory rate'; this is a direct reference to normality in the context of health; criteria failed hence the score is 0,0,"The SUMMARY states 'stable temperature', which is a mention of stability in the context of health; criteria failed hence the score is 0","Observations for patient_overview:\nThe SUMMARY lacks critical details such as vital signs and medications, which are essential for a comprehensive understanding of the patient's condition. Including these elements would enhance the clarity and usefulness of the summary for future reference.\n\n..."
7,18136989,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,0,"The patient states they need to prop themselves up with pillows to breathe comfortably, but 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria failed hence the score is 0",1,The SUMMARY contains a section with the key 'vital_signs'; criteria passed hence the score is 1,1,"The heart rate in the SUMMARY is 89, which matches the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (ACE inhibitors, Beta-Blockers) match those in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's statements; criteria passed hence the score is 1,0,The SUMMARY mentions 'stable' in the context of vital signs; criteria failed hence the score is 0,0,The SUMMARY mentions 'stable' in the context of vital signs; criteria failed hence the score is 0,"Observations for patient_overview:\nThe SUMMARY lacks critical details such as current symptoms, vital signs, and medications, which are essential for a comprehensive overview of the patient's condition. Including these elements would enhance the clarity and usefulness of the summary for future ..."
8,15345003,1,SUMMARY contains 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"The patient claims they need to prop themselves up with pillows to breathe comfortably at night. The 'current_symptoms' section mentions 'Need to prop self up with pillows at night', which aligns with the patient's statement; criteria passed hence the score is 1",1,The SUMMARY contains a section labeled 'vital_signs'; criteria passed hence the score is 1,1,"The heart rate in the SUMMARY is 92, which matches the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (beta-blocker, diuretic, statin, spironolactone) match those mentioned in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's words; criteria passed hence the score is 1,0,The phrase 'within normal limits' is present in the SUMMARY regarding vital signs; criteria failed hence the score is 0,1,There is no mention of 'stable' in the context of the patient's health in the SUMMARY; criteria passed hence the score is 1,"Observations for patient_overview:\nThe SUMMARY lacks critical details about the patient's symptoms and medications, which are essential for a comprehensive understanding of the patient's condition. Including specific symptoms like 'orthopnea' and the full list of medications would significantly..."
9,17707918,1,SUMMARY contains the key 'patient_overview' and introduces the patient by name; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_symptoms'; criteria passed hence the score is 1,0,Current symptoms in SUMMARY do not match TRANSCRIPT; criteria failed hence the score is 0,1,"The patient does not mention needing pillows to breathe comfortably, and 'orthopnea' or 'pillows' are not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'vital_signs'; criteria passed hence the score is 1,1,Vital signs in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY contains a section with the key 'current_medications'; criteria passed hence the score is 1,1,"The medications listed in the SUMMARY (Hydralazine, Metoprolol, Torsemide) match those in the TRANSCRIPT; criteria passed hence the score is 1",1,The SUMMARY contains a section with the key 'summary' that provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,The SUMMARY does not contain any interpretations or diagnoses; it only reflects the patient's statements; criteria passed hence the score is 1,0,The SUMMARY mentions 'within normal limits' in the context of vital signs; criteria failed hence the score is 0,1,The SUMMARY does not mention 'stable' in the context of patient's health; criteria passed hence the score is 1,Observations for patient_overview:\nThe SUMMARY is overly simplistic and fails to capture critical information regarding the patient's heart failure status and vital signs. Including a more comprehensive overview of the patient's symptoms and medications would significantly enhance the quality o...


## Idea: prompt improvement

Can we take learnings from this round of evals to improve the original prompt?

In [40]:
[col_reasoning for col_reasoning in [" ".join([str(v) for v in _series if not pd.isna(v)]) for k, _series in df.items() if k.endswith("_reasoning")] if col_reasoning] + list(df["observations"])

["SUMMARY contains 'patient_overview' and introduces the patient by name. Patient name is introduced. The SUMMARY contains a section with the key 'patient_overview' and introduces the patient by name. SUMMARY contains 'patient_overview' and introduces the patient by name. The SUMMARY contains a section with the key 'patient_overview' and introduces the patient by name. Patient name is introduced. SUMMARY contains 'patient_overview' and introduces the patient by name. Patient name is introduced. Patient name is introduced. Patient name is introduced. SUMMARY contains 'patient_overview' and introduces the patient by name. Patient name is introduced. Patient name is introduced. Patient name is introduced. Patient name is introduced. The SUMMARY contains 'patient_overview' and introduces the patient by name. The SUMMARY contains a section with 'patient_overview' and introduces the patient by name. Patient name is introduced. Patient name is introduced. SUMMARY contains 'patient_overview' a

In [50]:
original_summarization_engine_prompt = """You are a medical assistant tasked with reviewing a transcript of a conversation between a patient and their doctor. You will be provided a transcript. The doctor has asked you to write up a summary of the transcript in the format outlined below. Return your summary as a Python dictionary as follows: {{"patient_overview": "", "current_symptoms": "", "vital_signs": "", "current_medications": "", "summary: ""}}. Ensure the output is in proper dictionary format. The value for each key is a string which contains the text of the summary, including new line characters where appropriate. Add context to symptoms where appropriate, but be brief. List specific medications by name under the appropriate medication category. Do not add any information that is not present in the transcript.

"patient_overview":
    Write a one sentence summary like "[Patient Name] is experiencing [primary symptom or chief complaint]"

"current_symptoms" (Note: Separate each symptom with a new line. Determine if the patient is experiencing any of the following: Dyspnea, Paroxysmal Nocturnal Dyspnea (PND), Orthopnea, Edema, Nocturnal Cough, Chest Pain, Fatigue, Sudden Change in Mental Status.):
    List the symptoms the patient is currently experiencing

"vital_signs" (Note: Separate each vital sign with a new line. Put N/A if not reported in transcript):
    Temperature:
    Heart Rate:
    Respiratory Rate:
    Oxygen Saturation:
    Blood Pressure:
    Weight:

"current_medications" (Note: separate each medication with a new line):
    List the medications the patient is taking.

"summary":
    At a high level, summarize a few key points from the transcript. Include the symptoms that the patient confirms, and the symptoms that the patient denies. Do not list vital sign details in this section.
"""

# individual learnings: df["observations"]
# original prompt: summarization_engine_prompt

improvement_prompt_text = f"""You are tasked with improving a summarization engine's prompt, which generates summaries of doctor-patient dialogues. You will be given a list of learnings generated from an automated evaluation of the engine's summaries. Your task is to provide a revised prompt that addresses the learnings. Return your revised prompt as a string.

ORIGINAL PROMPT:
```
{original_summarization_engine_prompt}
```
"""
improvement_prompt = SystemMessage(content=improvement_prompt_text)

learnings_message = HumanMessage(
    content=(
        "LEARNINGS:\n" + "\n".join(
            [
                col_reasoning for col_reasoning in [
                    " ".join([str(v) for v in _series if not pd.isna(v)])
                    for k, _series in df.items() if k.endswith("_reasoning")
                ]
                if col_reasoning
            ]
            + list(df["observations"])
        )
    )
)

additional_instructions = HumanMessage("""Additionally, apply JSON best practices to keep the outputs processable by downstream systems.
Before generating the new prompt, summarize a "KEY GOALS" section for the prompt improvement for what you're about to do, pulling in specific examples from the learnings.
Then, write a "REVISED PROMPT" section making changes to ORIGINAL PROMPT. Be specific on what the new prompt should do by referring to the learnings.
After generating the new prompt, please summarize the key changes you made to the prompt under a "KEY CHANGES" section in the response.""")

# Get the response
response = model.invoke((improvement_prompt + learnings_message + additional_instructions).format_messages())

In [51]:
print(response.content)

### KEY GOALS
1. **Ensure Comprehensive Symptom Reporting**: The summary must include all relevant symptoms mentioned in the transcript, particularly critical symptoms like 'orthopnea' and 'chest pain', which were frequently omitted in previous summaries. For example, the summary should explicitly state the patient's need for pillows to breathe comfortably, as this is a significant indicator of their condition.

2. **Accurate Vital Signs Representation**: The summary should accurately reflect the vital signs reported in the transcript, including blood pressure and weight, which were often missing or incorrect. For instance, if the transcript states a heart rate of 75 beats per minute, the summary should reflect this without discrepancies.

3. **Avoid Interpretations or Diagnoses**: The summary must refrain from including any language that implies a diagnosis or clinical assessment, such as stating that symptoms are "well managed" or "stable." Instead, it should focus solely on the pati

In [52]:
response.response_metadata

{'token_usage': {'completion_tokens': 825,
  'prompt_tokens': 3969,
  'total_tokens': 4794},
 'model_name': 'gpt-4o-mini-2024-07-18',
 'system_fingerprint': 'fp_611b667b19',
 'finish_reason': 'stop',
 'logprobs': None}