# LLM as a judge for Summarization Engine - version 2.2

Some improvements to tackle over 2.0:
- Give better understanding of common symptoms and emphasize its inclusion in symptoms list. Specifically orthopnea (requiring pillows during sleep) is often missed in symptoms list.
- Better grasp of sections in a summary JSON output.
- Clarify which types of outputs do not count as diagnosis and are considered okay (should not penalize in `no_diagnosis` output).

In [56]:
import time
import json
from pprint import pprint

import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from dotenv import load_dotenv

# Load API keys from .env file
load_dotenv()

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 300)

In [9]:
# Create the ChatOpenAI model instance

# model_name = "gpt-3.5-turbo-0125"  # release date: 2024-01-25
# model_name = "gpt-3.5-turbo-1106"  # release date: 2023-11-06
# model_name = "gpt-4o-2024-05-13"  # release date: 2024-05-13
model_name = "gpt-4o-mini"  # release date: 2024-07-18

model = ChatOpenAI(temperature=0.0, model_name=model_name)

### Specify patient transcript file to read in
transcripts_version = 1.0

## System Prompt

In [123]:
# Define the system message for the evaluation

presence_description = "Does the SUMMARY json output contain a section with the key '{topic}'? Note that this criteria looks for the presence of a section, and not whether the section agrees or matches with TRANSCRIPT."

judge_criteria = {
    # summary content
    "intro_patient_present": presence_description.format(topic="patient_overview") + " Also, does the SUMMARY introduce patient by name?",
    "current_symptoms_present": presence_description.format(topic="current_symptoms"),
    "vital_signs_present": presence_description.format(topic="vital_signs"),
    "medications_present": presence_description.format(topic="current_medications"),
    "summary_overview_present": presence_description.format(topic="summary") + " Also, does this section give an overview of the content of the TRANSCRIPT",

    # accuracy of summary
    "current_symptoms_agree": "Does 'current_symptoms' section in the SUMMARY loosely agree with TRANSCRIPT? If so, output as 1. Specifically, in SUMMARY, please look in the 'current_symptoms' section only and ignore the 'patient_overview' and 'summary' sections. If there's a symptom that the patient is actively experiencing in 'summary' or 'patient_overview', that symptom must be present in the 'current_symptoms' too. This includes synonym phrases (e.g. 'prop up with pillows' equals to 'orthopnea'). Please look for a loose match only, and ignore the accuracy of mentioned symptoms, i.e. please ignore judging on descriptions of symptom's frequency/severity/details (e.g. 'on and off', 'severe', 'only when moving'). E.g., if TRANSCRIPT says 'chest pain on and off' and SUMMARY says 'chest pain' only, the criteria is still met (output as 1). Note that symptoms not related to heart failure should be included.",
    # special case: orthopnea
    "orthopnea_agree": "Does the 'current_symptoms' section in the SUMMARY accurately represent the patient's claim of needing to prop up with pillows to breathe comfortably? If patient says they need pillows to prop up while laying flat, mark as 1 if 'orthopnea'/'pillows' is mentioned in 'current_symptoms', otherwise mark as 0. Alternatively, if the patient says they do not need pillows to prop up while laying flat, mark as 1 if 'orthopnea'/'pillows' is not mentioned in 'current_symptoms', otherwise mark as 0. Specifically, in SUMMARY, please look in the 'current_symptoms' section only and ignore the 'patient_overview' and 'summary' sections. In your assessment, first describe if patient needs pillows, then reiterate what's listed in 'current_symptoms' section (do not refer to other sections), then mention if 'orthopnea'/'pillows' is mentioned in 'current_symptoms' section, then make your comparison.",
    "vital_signs_agree": "Do the vital signs in the SUMMARY and TRANSCRIPT agree?",
    "medications_agree": "Do the medications in the SUMMARY and TRANSCRIPT agree?",

    # quality of summary
    "no_diagnose": "The SUMMARY is free from interpretation of results (avoided words like 'stable') and is free from diagnosis. Note that narration of patient's words is allowed (like 'patient thinks that they have...' or 'patient is experiencing...' or 'patient confirms that...'); still output a result of 1. Notes of advice like reminder to take meds or monitor symptoms are also allowed. However, predictions of future events (patient is likely/unlikely to...) are interpretations and are not allowed (mark as 0). These specific phrases are also not allowed: 'vital signs are stable', 'within normal range', 'heart rate is higher than normal', 'vital signs are within normal limits'.",
    # special cases: normality and stability
    "no_normality": "The SUMMARY does not contain any mention of 'normal' or 'within normal limits' in the context of patient's health. If the patient's health is described as 'normal' or 'within normal limits' in SUMMARY, mark as 0. Otherwise, mark as 1. In your assessment, please say if the idea of 'normal' is mentioned, then make your comparison.",
    "no_stability": "The SUMMARY does not contain any mention of 'stable' in the context of patient's health. If the patient's health is described as 'stable' in SUMMARY, mark as 0. Otherwise, mark as 1. In your assessment, please say if the idea of 'stable' is mentioned, then make your comparison.",
}

system_message_summary_judge = """You are evaluating a summarization engine that has generated a SUMMARY of a doctor-patient dialogue TRANSCRIPT based on a set of criteria. Your evaluation will consist of answering specific questions about the SUMMARY with 1 (Yes) and 0 (No) responses. The SUMMARY quality will depend on the TRANSCRIPT.
{output_format}

CRITERIA (CSV column names, then a description):
""" + "\n".join([f"`{k}`: {v}" for k, v in judge_criteria.items()]) + """

ADDITIONAL INFORMATION: the following are common heart failure symptoms and their descriptions. Any mention of these medical terms or similar phrases do not count as a "diagnosis" in the context of this evaluation. If the patient claims some of these phrases for themselves (e.g. "I need to prop myself up with pillows"), it is a symptom and not a diagnosis, and the symptom should have been included in the symptoms list (e.g. "orthopnea" or "needs pillows" should be present).
- Dyspnea: shortness of breath, whether occurring at rest, walking, or climbing stairs
- Paroxysmal Nocturnal Dyspnea (PND): sudden shortness of breath that wakes patient up at night
- Orthopnea: needing to prop up with pillows to breathe comfortably while lying down
- Edema: swelling in your ankles or legs
- Nocturnal Cough: a cough especially at night
- Chest Pain
- Fatigue and Mental Status: feeling more tired than usual ("feeling tired" and "fatigue" are synonyms), or experience sudden changes in mental clarity (or mental status)

ADDITIONAL INFORMATION: "Do you need to prop yourself up with pillows to breathe comfortably" is a direct question on orthopnea (which is a term that patients don't understand), and as such, "Orthopnea" and "needing to prop up with pillows" are considered as one and the same.
- Orthopnea is often omitted in 'current_symptoms' by the summarizer when a patient claims they need pillows to prop up while laying flat; when this happens, it is a criteria violation for `current_symptoms_agree`.
- However, if the 'current_symptoms' section mentions 'orthopnea' but the TRANSCRIPT says only needing pillows (no explicit mention of 'orthopnea'), it is not a criteria violation (keep the score as 1).

"""

output_csv_format = """Generate a CSV row with the appropriate 1 or 0 for each criteria in the order specified below."""

output_reasoning_format = """In separate lines, first make a brief assessment of the criteria on the SUMMARY to justify your decision, then state each criteria's value (1 or 0). When explaining your assessment/reasoning, if there are issues that result in a 0, be very specific and please refer to texts in SUMMARY that is the offender. If it's a 1 (yes/no issues), keep your assessment very short.
Lastly, in one last new line, please provide any short additional observations or suggestions for improvement (2 sentences), but do not repeat evaluation points previously made. Be specific with examples, and be concise with words.
For example:
intro_patient_present,"Patient name is introduced; criteria passed hence the score is 1",1
current_symptoms_present,"No symptoms are reported in SUMMARY; criteria failed hence the score is 0",0
orthopnea_agree,"Patient needs pillows to prop up while laying flat; current_symptoms listed: 'dyspnea, fatigue'; 'orthopnea' is not listed; criteria failed hence the score is 0",0
vital_signs_agree,"Heart rate in SUMMARY is 130, but in TRANSCRIPT it's 131; criteria failed hence the score is 0",0
current_symptoms_agree,"Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1",1
medications_agree,"Vitamins reported in SUMMARY is not in TRANSCRIPT; criteria failed hence the score is 0",0
write your two-sentence observation/improvement here
"""

human_message_summary_judge = """
TRANSCRIPT: {transcript}

SUMMARY: {summary}
"""

def parse_response(response_content: str, expected_fields=len(judge_criteria)):
    """Function to validate and parse the response.

    Example response:
        'intro_patient,"",1\n'
        'current_symptoms,"",1\n'
        'symptoms_agree,"Nose bleeding was mentioned in the summary, but not in the transcript.",0\n'

    Desired output
        {"intro_patient": {"value": 1, "reasoning": ""}, "current_symptoms": {"value": 1, "reasoning": ""}, ...}
    """
    response_list = response_content.split("\n")[0:expected_fields]
    if len(response_list) != expected_fields:
        return {"error": "Invalid response count"}
    response_dict = {}
    for response in response_list:
        if response:
            (criteria, back_split) = response.split(",", 1)
            (reasoning, value) = back_split.rsplit(",", 1)
            response_dict[criteria] = {"value": int(value), "reasoning": reasoning.strip('"')}

    # additional wrangling on all: find phrase 'criteria passed hence the score is 1' and 'criteria failed hence the score is 0'
    # if found, override the value with 1 or 0
    for key, value in response_dict.items():
        if "criteria passed hence the score is 1" in value["reasoning"]:
            response_dict[key]["value"] = 1
        elif "criteria failed hence the score is 0" in value["reasoning"]:
            response_dict[key]["value"] = 0

    # remainder text is the observations
    response_dict["observations"] = "\n".join(response_content.split("\n")[expected_fields:]).strip()

    return response_dict

pprint(system_message_summary_judge.format(output_format=output_reasoning_format), width=120)

('You are evaluating a summarization engine that has generated a SUMMARY of a doctor-patient dialogue TRANSCRIPT based '
 'on a set of criteria. Your evaluation will consist of answering specific questions about the SUMMARY with 1 (Yes) '
 'and 0 (No) responses. The SUMMARY quality will depend on the TRANSCRIPT.\n'
 'In separate lines, first make a brief assessment of the criteria on the SUMMARY to justify your decision, then state '
 "each criteria's value (1 or 0). When explaining your assessment/reasoning, if there are issues that result in a 0, "
 "be very specific and please refer to texts in SUMMARY that is the offender. If it's a 1 (yes/no issues), keep your "
 'assessment very short.\n'
 'Lastly, in one last new line, please provide any short additional observations or suggestions for improvement (2 '
 'sentences), but do not repeat evaluation points previously made. Be specific with examples, and be concise with '
 'words.\n'
 'For example:\n'
 'intro_patient_present,"Patient 

## Import Transcript & Summary Files

In [124]:
# Specify the path to your JSON file
transcripts_json_file_path = f"../../data/patients/patients_{transcripts_version}_with_transcripts_terminated_manual_correct.json"

# Open and read the JSON file
with open(transcripts_json_file_path, 'r') as json_file:
    transcripts = json.load(json_file)

# Specify the path to your summaries JSON file
summaries_json_file_path = f"../../data/patients/patients_{transcripts_version}_summaries.json"

# Open and read the JSON file
with open(summaries_json_file_path, 'r') as json_file:
    summaries = json.load(json_file)

# Specify the CSV file path (make sure it is a file, not a directory)
csv_file_path = f"../../data/evaluations/summaries_{transcripts_version}_evaluation_2.2.csv"

# Additionally make tweaks to JSON keys for this evaluation
keys_to_convert = {
    "Patient Overview": "patient_overview",
    "Current Symptoms": "current_symptoms",
    "Vital Signs": "vital_signs",
    "Current Medications": "current_medications",
    "Summary": "summary",
}
for patient_id, summary in summaries.items():
    new_summary = {}
    for key, value in summary["summary"].items():
        if key in keys_to_convert:
            new_summary[keys_to_convert[key]] = value
        else:
            new_summary[key] = value
    summaries[patient_id]["summary"] = new_summary

# # -----
# # for testing only; comment when not needed -- only try to generate one summary

# # # to get a specific patient's transcript
# # specific_patient_id = 18136989
# # sample_patient_idx = list(transcripts.keys()).index(str(specific_patient_id))

# # to get a random patient's transcript
# sample_patient_idx = 0

# transcripts = {k: v for k, v in transcripts.items() if k == list(transcripts.keys())[sample_patient_idx]}
# summaries = {k: v for k, v in summaries.items() if k == list(transcripts.keys())[0]}
# sole_patient_id = list(summaries.keys())[0]

# # tinker -- corrupt the summary to test the evaluator
# summaries[sole_patient_id]["summary"]["current_medications"] = "Furosemide and Spironolactone"
# summaries[sole_patient_id]["summary"]["current_symptoms"] += "\nNose bleeding"
# summaries[sole_patient_id]["summary"]["vital_signs"] = 'Temperature: 98.0 degrees\nHeart Rate: 63 beats per minute\nRespiratory Rate: 23 breaths per minute\nOxygen Saturation: 94.0%\nBlood Pressure: 123/56\nWeight: N/A'
# summaries[sole_patient_id]["summary"]["summary"] += " Patient is unlikely to relapse."
# # -----

In [125]:
# Cell to actually invoke the model and write the results to a CSV file

start_time = time.time()
# for each criteria, add two columns: one for the value (same name) and one for the reasoning (suffix _reasoning)
column_order = ["transcript_number"]
for criteria in judge_criteria.keys():
    column_order.append(criteria)
    column_order.append(f"{criteria}_reasoning")
column_order.append("observations")

all_rows_series: list[pd.Series] = []
all_responses = []
# Loop through each transcript number, invoke the model, and write the results
for i, patient_number in enumerate(transcripts.keys()):
    print(".", end="")
    if (i + 1) % 10 == 0:
        print("|", end="")

    transcript = transcripts[patient_number]['chat_transcript']
    summary = summaries[patient_number]['summary']

    prompt = (
        SystemMessage(content=system_message_summary_judge.format(output_format=output_reasoning_format))
        + human_message_summary_judge
    )

    # Get the response
    response = model.invoke(prompt.format_messages(transcript=transcript, summary=summary))
    all_responses.append(response)
    # response = all_responses[i]  # for testing only
    response_dict = parse_response(response.content)

    # add to dataframe
    row_to_add = {
        "transcript_number": patient_number,
        **{k: v["value"] for k, v in response_dict.items() if k in judge_criteria.keys()},
        **{f"{k}_reasoning": v["reasoning"] for k, v in response_dict.items() if k in judge_criteria.keys()},
        "observations": response_dict["observations"]
    }
    all_rows_series.append(pd.Series(row_to_add))

total_time = time.time() - start_time
print(f"\n\nTime taken to evaluate {len(transcripts)} transcripts: {total_time:.2f} seconds ({total_time / len(transcripts):.2f} seconds per transcript)")

# create dataframe
df = pd.DataFrame(all_rows_series)[column_order]

pd.set_option('display.max_columns', None)

display(df)

# Write the dataframe to a CSV file
df.to_csv(csv_file_path, index=False)
print(f"CSV file has been created at: {csv_file_path}")

..........|..........|

Time taken to evaluate 20 transcripts: 118.54 seconds (5.93 seconds per transcript)


Unnamed: 0,transcript_number,intro_patient_present,intro_patient_present_reasoning,current_symptoms_present,current_symptoms_present_reasoning,vital_signs_present,vital_signs_present_reasoning,medications_present,medications_present_reasoning,summary_overview_present,summary_overview_present_reasoning,current_symptoms_agree,current_symptoms_agree_reasoning,orthopnea_agree,orthopnea_agree_reasoning,vital_signs_agree,vital_signs_agree_reasoning,medications_agree,medications_agree_reasoning,no_diagnose,no_diagnose_reasoning,no_normality,no_normality_reasoning,no_stability,no_stability_reasoning,observations
0,12305811,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY (Fatigue, Edema) loosely agree with TRANSCRIPT (tiredness, leg swelling); criteria passed hence the score is 1",1,Patient does not need pillows to prop up while lying down; 'orthopnea'/'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1,1,"Vital signs in SUMMARY (Temperature: 97.4, Heart Rate: 74, Respiratory Rate: 18, Oxygen Saturation: 95, Blood Pressure: 127/70) match TRANSCRIPT; criteria passed hence the score is 1",1,"Medications in SUMMARY (Furosemide, Spironolactone, Fish oil) match TRANSCRIPT; criteria passed hence the score is 1",1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,0,The idea of 'stable' is mentioned in SUMMARY; criteria failed hence the score is 0,"The summary could improve by avoiding the use of the term ""stable"" when discussing vital signs, as it violates the no_stability criteria. Additionally, it would be beneficial to include more specific details about the patient's symptoms, such as the context of tiredness and leg swelling, to enha..."
1,14185111,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,0,"Current symptoms in SUMMARY include 'ankle swelling' and 'fatigue', but 'orthopnea' is not mentioned; criteria failed hence the score is 0",0,"Patient needs pillows to prop up while lying flat; 'orthopnea' is not mentioned in 'current_symptoms', criteria failed hence the score is 0",1,Vital signs in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,0,The term 'normal' is mentioned in the context of vital signs; criteria failed hence the score is 0,1,The term 'stable' is not mentioned in the context of patient's health; criteria passed hence the score is 1,"The summary could improve by explicitly including 'orthopnea' in the current symptoms section to accurately reflect the patient's need for pillows. Additionally, avoiding the phrase ""within normal limits"" would enhance the clarity and adherence to the evaluation criteria."
2,10339317,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,0,"Current symptoms in SUMMARY only mention 'chest pain', while TRANSCRIPT includes 'chest pain' and 'denies shortness of breath, swelling in legs, orthopnea'; criteria failed hence the score is 0",1,"Patient does not need pillows to breathe comfortably, and 'orthopnea' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,Vital signs in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not interpret results or provide a diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,0,The idea of 'stable' is mentioned in SUMMARY; criteria failed hence the score is 0,"The summary could improve by including all current symptoms mentioned in the TRANSCRIPT, such as the denial of shortness of breath and swelling in the legs, to provide a more comprehensive view of the patient's condition. Additionally, it should avoid using the term ""stable"" to adhere to the cri..."
3,14807966,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY include dyspnea and fatigue, which match the TRANSCRIPT; criteria passed hence the score is 1",1,"Patient denies needing to prop up with pillows; 'orthopnea' is not mentioned in current_symptoms, which is correct; criteria passed hence the score is 1",1,Vital signs in SUMMARY match those reported in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those reported in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,1,The idea of 'stable' is not mentioned in SUMMARY; criteria passed hence the score is 1,"The SUMMARY is well-structured and accurately reflects the TRANSCRIPT without any critical omissions. To improve, it could include more detail on the patient's emotional state or coping strategies, as this would provide a more holistic view of Gregory's health."
4,13912736,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,Patient does not need pillows to breathe comfortably; 'orthopnea' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1,1,Vital signs in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,1,The idea of 'stable' is not mentioned in SUMMARY; criteria passed hence the score is 1,"The SUMMARY is comprehensive and accurately reflects the patient's symptoms, vital signs, and medications. To improve, consider including more details about the patient's experience with symptoms, such as the frequency of shortness of breath, to provide a clearer picture of their condition."
5,15338322,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,Patient does not need pillows to breathe comfortably; 'orthopnea' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1,1,Vital signs in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,1,The idea of 'stable' is not mentioned in SUMMARY; criteria passed hence the score is 1,"The SUMMARY is comprehensive and accurately reflects the TRANSCRIPT. To improve, consider including more details about the patient's experience with symptoms, such as the frequency of shortness of breath, to provide a clearer picture of their condition."
6,13166275,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,0,Patient needs pillows to prop up while laying flat; current_symptoms listed: 'Paroxysmal nocturnal dyspnea'; 'orthopnea' is not explicitly mentioned; criteria failed hence the score is 0,0,"Heart rate in SUMMARY is 122, but in TRANSCRIPT it's 125; criteria failed hence the score is 0",1,Medications in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,0,Summary contains interpretation of results with 'stable' and 'elevated'; criteria failed hence the score is 0,0,The term 'normal' is mentioned in the context of respiratory rate; criteria failed hence the score is 0,0,The term 'stable' is mentioned in the context of temperature; criteria failed hence the score is 0,"The summary could improve by ensuring that all relevant symptoms, such as 'orthopnea', are explicitly mentioned to avoid confusion. Additionally, it should refrain from using terms like 'stable' and 'normal' to maintain clarity and avoid interpretations."
7,18136989,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY mention 'dyspnea', which loosely agrees with the TRANSCRIPT where the patient reports shortness of breath; criteria passed hence the score is 1",1,Patient needs pillows to prop up while lying down; 'pillows' is mentioned in current_symptoms; criteria passed hence the score is 1,1,Vital signs in SUMMARY (heart rate 89 bpm) match TRANSCRIPT (heart rate 89 bpm); criteria passed hence the score is 1,1,"Medications in SUMMARY (ACE inhibitors, Beta-Blockers) match TRANSCRIPT; criteria passed hence the score is 1",1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,0,The idea of 'stable' is mentioned in the context of vital signs in SUMMARY; criteria failed hence the score is 0,"The summary could improve by avoiding the use of the term ""stable"" when discussing vital signs, as it violates the no_stability criteria. Additionally, it would be beneficial to include more detail about the patient's experience with shortness of breath to enhance the understanding of their cond..."
8,15345003,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"Patient needs pillows to prop up while laying flat; current_symptoms listed: 'Need to prop self up with pillows at night'; 'orthopnea' is not explicitly mentioned, but the need for pillows is; criteria passed hence the score is 1",1,"Vital signs in SUMMARY (e.g., heart rate 92) match those in TRANSCRIPT; criteria passed hence the score is 1",1,Medications in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,0,The idea of 'normal' is mentioned in the context of vital signs; criteria failed hence the score is 0,1,The idea of 'stable' is not mentioned in the context of patient's health; criteria passed hence the score is 1,"The summary could improve by avoiding the phrase ""vital signs are within normal limits"" to ensure compliance with the criteria regarding normality. Additionally, it would be beneficial to include a more detailed account of the patient's experience with fatigue and forgetfulness to provide a clea..."
9,17707918,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY only mention 'left knee pain', while TRANSCRIPT indicates no new symptoms related to heart failure; criteria passed hence the score is 1",1,Patient does not mention needing pillows to breathe comfortably; 'orthopnea' or 'pillows' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1,1,Vital signs in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,0,The phrase 'within normal limits' is present in the context of vital signs; criteria failed hence the score is 0,1,The term 'stable' is not mentioned in the context of patient's health; criteria passed hence the score is 1,"The summary could improve by avoiding phrases like ""vital signs are within normal limits"" to ensure compliance with the criteria. Additionally, including any mention of heart failure symptoms, even if the patient denies them, would provide a more comprehensive overview."


CSV file has been created at: ../../data/evaluations/summaries_1.0_evaluation_2.2.csv


In [126]:
# parse_response(response.content)
parse_response(all_responses[5].content)

{'intro_patient_present': {'value': 1,
  'reasoning': 'Patient name is introduced; criteria passed hence the score is 1'},
 'current_symptoms_present': {'value': 1,
  'reasoning': 'Current symptoms section is present in SUMMARY; criteria passed hence the score is 1'},
 'vital_signs_present': {'value': 1,
  'reasoning': 'Vital signs section is present in SUMMARY; criteria passed hence the score is 1'},
 'medications_present': {'value': 1,
  'reasoning': 'Current medications section is present in SUMMARY; criteria passed hence the score is 1'},
 'summary_overview_present': {'value': 1,
  'reasoning': 'Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1'},
 'current_symptoms_agree': {'value': 1,
  'reasoning': 'Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1'},
 'orthopnea_agree': {'value': 1,
  'reasoning': "Patient does not need pillows to breathe comfortably; 'orthopnea' is not men

In [122]:
# what's an average response resource usage?
print("Average response resource usage:")
pprint(response.response_metadata)
print()

# what's the total resource usage? For 4o-mini
prompt_cost = 0.15 / 1e6  # $0.15 per 1M tokens
completion_cost = 0.60 / 1e6  # $0.60 per 1M tokens
total_prompt_tokens = sum([r.response_metadata["token_usage"]["prompt_tokens"] for r in all_responses])
total_completion_tokens = sum([r.response_metadata["token_usage"]["completion_tokens"] for r in all_responses])
print(f"Total prompt tokens: {total_prompt_tokens}, cost: ${(total_prompt_cost := total_prompt_tokens * prompt_cost):0.3f}, average: {total_prompt_tokens / len(all_responses)} tokens per response")
print(f"Total completion tokens: {total_completion_tokens}, cost: ${(total_completion_cost := total_completion_tokens * completion_cost):0.3f}, average: {total_completion_tokens / len(all_responses)} tokens per response")
print(f"Total cost: ${total_prompt_cost + total_completion_cost:0.3f}")

Average response resource usage:
{'finish_reason': 'stop',
 'logprobs': None,
 'model_name': 'gpt-4o-mini-2024-07-18',
 'system_fingerprint': 'fp_ba606877f9',
 'token_usage': {'completion_tokens': 379,
                 'prompt_tokens': 2821,
                 'total_tokens': 3200}}

Total prompt tokens: 60774, cost: $0.009, average: 3038.7 tokens per response
Total completion tokens: 7731, cost: $0.005, average: 386.55 tokens per response
Total cost: $0.014


In [None]:
patient_id = "13727871"
pprint(transcripts[patient_id]["chat_transcript"], width=120)
pprint(summaries[patient_id]["summary"], width=120)

In [98]:
# viewer: open the csv file as a pandas dataframe
df = pd.read_csv(csv_file_path)
display(df)

Unnamed: 0,transcript_number,intro_patient_present,intro_patient_present_reasoning,current_symptoms_present,current_symptoms_present_reasoning,vital_signs_present,vital_signs_present_reasoning,medications_present,medications_present_reasoning,summary_overview_present,summary_overview_present_reasoning,current_symptoms_agree,current_symptoms_agree_reasoning,orthopnea_agree,orthopnea_agree_reasoning,vital_signs_agree,vital_signs_agree_reasoning,medications_agree,medications_agree_reasoning,no_diagnose,no_diagnose_reasoning,no_normality,no_normality_reasoning,no_stability,no_stability_reasoning,observations
0,12305811,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY (Fatigue, Edema) loosely agree with TRANSCRIPT; criteria passed hence the score is 1",1,"Patient does not need pillows to prop up while lying down, and 'orthopnea' is not mentioned in 'current_symptoms'; criteria passed hence the score is 1",1,Vital signs in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,"Medications in SUMMARY (Furosemide, Spironolactone, Fish oil) agree with those in TRANSCRIPT; criteria passed hence the score is 1",1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,0,The idea of 'stable' is mentioned in SUMMARY; criteria failed hence the score is 0,"The summary could improve by avoiding the use of the term ""stable"" when discussing vital signs, as it violates the no_stability criteria. Additionally, it would be beneficial to include a more detailed account of the patient's symptoms, such as mentioning the patient's experience of being winded..."
1,14185111,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,0,Current symptoms in SUMMARY include 'ankle swelling' but omit 'orthopnea' despite the patient needing pillows; criteria failed hence the score is 0,0,"Patient needs pillows to breathe comfortably, but 'orthopnea' is not mentioned in current_symptoms; criteria failed hence the score is 0",1,Vital signs in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,0,The term 'normal' is mentioned in the context of vital signs; criteria failed hence the score is 0,1,The term 'stable' is not mentioned in the context of patient's health; criteria passed hence the score is 1,"The summary could improve by explicitly including 'orthopnea' in the current symptoms section to accurately reflect the patient's need for pillows. Additionally, avoiding phrases like ""vital signs are within normal limits"" would enhance clarity and adherence to the evaluation criteria."
2,10339317,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY mention 'chest pain', which matches the TRANSCRIPT; criteria passed hence the score is 1",1,"Patient denies needing pillows to breathe comfortably, and 'orthopnea' is not mentioned in current symptoms; criteria passed hence the score is 1",1,Vital signs in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The term 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,0,The term 'stable' is mentioned in the context of vital signs in SUMMARY; criteria failed hence the score is 0,"The summary could improve by avoiding the use of the term ""stable"" in relation to vital signs, as it violates the criteria for no_stability. Additionally, it would be beneficial to include a more detailed account of the patient's symptoms to provide a clearer picture of their condition."
3,14807966,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY include dyspnea and fatigue, which match the TRANSCRIPT; criteria passed hence the score is 1",1,"Patient denies needing to prop up with pillows, and 'orthopnea' is not mentioned in current symptoms; criteria passed hence the score is 1",1,Vital signs in SUMMARY match those reported in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those reported in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,1,The idea of 'stable' is not mentioned in SUMMARY; criteria passed hence the score is 1,"The SUMMARY is well-structured and accurately reflects the TRANSCRIPT without any critical omissions. To improve, it could include more specific details about the patient's experience with fatigue and lightheadedness to provide a clearer picture of their condition."
4,13912736,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"Patient does not need pillows to breathe comfortably; 'orthopnea' is not mentioned in current_symptoms, which is correct; criteria passed hence the score is 1",1,Vital signs in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,1,The idea of 'stable' is not mentioned in SUMMARY; criteria passed hence the score is 1,"The SUMMARY is well-structured and covers all necessary sections, providing a clear overview of the patient's condition. To improve, consider including more specific details about the patient's experience with symptoms, such as the frequency of shortness of breath, to enhance the context for fut..."
5,15338322,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,"Patient does not need pillows to breathe comfortably; 'orthopnea' is not mentioned in current_symptoms, criteria passed hence the score is 1",1,Vital signs in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The idea of 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,1,The idea of 'stable' is not mentioned in SUMMARY; criteria passed hence the score is 1,"The SUMMARY is well-structured and covers all necessary sections, providing a clear overview of the patient's condition and treatment. To improve, consider including more specific details about the patient's symptoms, such as the frequency of shortness of breath, to enhance the context for futur..."
6,13166275,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,0,"Patient needs pillows to breathe comfortably, and 'orthopnea' is not mentioned in current symptoms; criteria failed hence the score is 0",0,"Heart rate in SUMMARY is 122, but in TRANSCRIPT it's 125; criteria failed hence the score is 0",1,Medications in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The term 'normal' is not mentioned in the context of patient's health; criteria passed hence the score is 1,0,The term 'stable' is mentioned in the context of patient's health; criteria failed hence the score is 0,"The summary could improve by explicitly mentioning ""orthopnea"" in the current symptoms section to accurately reflect the patient's experience. Additionally, ensuring that vital signs are reported accurately would enhance the reliability of the summary."
7,18136989,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY mention 'Dyspnea', which loosely agrees with the patient's reported shortness of breath; criteria passed hence the score is 1",0,"Patient needs pillows to breathe comfortably while lying down, and 'orthopnea' is not mentioned in current_symptoms; criteria failed hence the score is 0",1,Vital signs in SUMMARY match those reported in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those reported in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,1,The term 'normal' is not mentioned in SUMMARY; criteria passed hence the score is 1,0,The term 'stable' is mentioned in the context of vital signs in SUMMARY; criteria failed hence the score is 0,"The summary could improve by explicitly mentioning 'orthopnea' in the current symptoms section to accurately reflect the patient's experience. Additionally, avoiding the term 'stable' in the context of vital signs would enhance compliance with the evaluation criteria."
8,15345003,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content; criteria passed hence the score is 1,1,Current symptoms in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,0,"Patient needs pillows to breathe comfortably at night; 'orthopnea' is not mentioned in current_symptoms, which is a violation; criteria failed hence the score is 0",1,Vital signs in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,0,The phrase 'within normal limits' is mentioned in the context of vital signs; criteria failed hence the score is 0,1,The term 'stable' is not mentioned in the context of patient's health; criteria passed hence the score is 1,"The summary could improve by explicitly mentioning ""orthopnea"" in the current symptoms section to accurately reflect the patient's need for pillows. Additionally, avoiding phrases like ""within normal limits"" would enhance the clarity and adherence to the evaluation criteria."
9,17707918,1,Patient name is introduced; criteria passed hence the score is 1,1,Current symptoms section is present in SUMMARY; criteria passed hence the score is 1,1,Vital signs section is present in SUMMARY; criteria passed hence the score is 1,1,Current medications section is present in SUMMARY; criteria passed hence the score is 1,1,Summary section is present and provides an overview of the content of the TRANSCRIPT; criteria passed hence the score is 1,1,"Current symptoms in SUMMARY mention left knee pain, which is consistent with the TRANSCRIPT; criteria passed hence the score is 1",1,"Patient does not mention needing to prop up with pillows; 'orthopnea' is not mentioned in current symptoms, which is acceptable; criteria passed hence the score is 1",1,Vital signs in SUMMARY match those reported in TRANSCRIPT; criteria passed hence the score is 1,1,Medications in SUMMARY match those reported in TRANSCRIPT; criteria passed hence the score is 1,1,SUMMARY does not contain any interpretation of results or diagnosis; criteria passed hence the score is 1,0,The term 'normal' is mentioned in the context of vital signs; criteria failed hence the score is 0,1,The term 'stable' is not mentioned in the context of patient's health; criteria passed hence the score is 1,"The summary could improve by avoiding phrases like ""vital signs are within normal limits"" to ensure compliance with the criteria regarding normality. Additionally, it would be beneficial to include a brief mention of the patient's heart failure symptoms to provide a more comprehensive overview."


## Idea: prompt improvement

Can we take learnings from this round of evals to improve the original prompt?

In [40]:
[col_reasoning for col_reasoning in [" ".join([str(v) for v in _series if not pd.isna(v)]) for k, _series in df.items() if k.endswith("_reasoning")] if col_reasoning] + list(df["observations"])

["SUMMARY contains 'patient_overview' and introduces the patient by name. Patient name is introduced. The SUMMARY contains a section with the key 'patient_overview' and introduces the patient by name. SUMMARY contains 'patient_overview' and introduces the patient by name. The SUMMARY contains a section with the key 'patient_overview' and introduces the patient by name. Patient name is introduced. SUMMARY contains 'patient_overview' and introduces the patient by name. Patient name is introduced. Patient name is introduced. Patient name is introduced. SUMMARY contains 'patient_overview' and introduces the patient by name. Patient name is introduced. Patient name is introduced. Patient name is introduced. Patient name is introduced. The SUMMARY contains 'patient_overview' and introduces the patient by name. The SUMMARY contains a section with 'patient_overview' and introduces the patient by name. Patient name is introduced. Patient name is introduced. SUMMARY contains 'patient_overview' a

In [50]:
original_summarization_engine_prompt = """You are a medical assistant tasked with reviewing a transcript of a conversation between a patient and their doctor. You will be provided a transcript. The doctor has asked you to write up a summary of the transcript in the format outlined below. Return your summary as a Python dictionary as follows: {{"patient_overview": "", "current_symptoms": "", "vital_signs": "", "current_medications": "", "summary: ""}}. Ensure the output is in proper dictionary format. The value for each key is a string which contains the text of the summary, including new line characters where appropriate. Add context to symptoms where appropriate, but be brief. List specific medications by name under the appropriate medication category. Do not add any information that is not present in the transcript.

"patient_overview":
    Write a one sentence summary like "[Patient Name] is experiencing [primary symptom or chief complaint]"

"current_symptoms" (Note: Separate each symptom with a new line. Determine if the patient is experiencing any of the following: Dyspnea, Paroxysmal Nocturnal Dyspnea (PND), Orthopnea, Edema, Nocturnal Cough, Chest Pain, Fatigue, Sudden Change in Mental Status.):
    List the symptoms the patient is currently experiencing

"vital_signs" (Note: Separate each vital sign with a new line. Put N/A if not reported in transcript):
    Temperature:
    Heart Rate:
    Respiratory Rate:
    Oxygen Saturation:
    Blood Pressure:
    Weight:

"current_medications" (Note: separate each medication with a new line):
    List the medications the patient is taking.

"summary":
    At a high level, summarize a few key points from the transcript. Include the symptoms that the patient confirms, and the symptoms that the patient denies. Do not list vital sign details in this section.
"""

# individual learnings: df["observations"]
# original prompt: summarization_engine_prompt

improvement_prompt_text = f"""You are tasked with improving a summarization engine's prompt, which generates summaries of doctor-patient dialogues. You will be given a list of learnings generated from an automated evaluation of the engine's summaries. Your task is to provide a revised prompt that addresses the learnings. Return your revised prompt as a string.

ORIGINAL PROMPT:
```
{original_summarization_engine_prompt}
```
"""
improvement_prompt = SystemMessage(content=improvement_prompt_text)

learnings_message = HumanMessage(
    content=(
        "LEARNINGS:\n" + "\n".join(
            [
                col_reasoning for col_reasoning in [
                    " ".join([str(v) for v in _series if not pd.isna(v)])
                    for k, _series in df.items() if k.endswith("_reasoning")
                ]
                if col_reasoning
            ]
            + list(df["observations"])
        )
    )
)

additional_instructions = HumanMessage("""Additionally, apply JSON best practices to keep the outputs processable by downstream systems.
Before generating the new prompt, summarize a "KEY GOALS" section for the prompt improvement for what you're about to do, pulling in specific examples from the learnings.
Then, write a "REVISED PROMPT" section making changes to ORIGINAL PROMPT. Be specific on what the new prompt should do by referring to the learnings.
After generating the new prompt, please summarize the key changes you made to the prompt under a "KEY CHANGES" section in the response.""")

# Get the response
response = model.invoke((improvement_prompt + learnings_message + additional_instructions).format_messages())

In [51]:
print(response.content)

### KEY GOALS
1. **Ensure Comprehensive Symptom Reporting**: The summary must include all relevant symptoms mentioned in the transcript, particularly critical symptoms like 'orthopnea' and 'chest pain', which were frequently omitted in previous summaries. For example, the summary should explicitly state the patient's need for pillows to breathe comfortably, as this is a significant indicator of their condition.

2. **Accurate Vital Signs Representation**: The summary should accurately reflect the vital signs reported in the transcript, including blood pressure and weight, which were often missing or incorrect. For instance, if the transcript states a heart rate of 75 beats per minute, the summary should reflect this without discrepancies.

3. **Avoid Interpretations or Diagnoses**: The summary must refrain from including any language that implies a diagnosis or clinical assessment, such as stating that symptoms are "well managed" or "stable." Instead, it should focus solely on the pati

In [52]:
response.response_metadata

{'token_usage': {'completion_tokens': 825,
  'prompt_tokens': 3969,
  'total_tokens': 4794},
 'model_name': 'gpt-4o-mini-2024-07-18',
 'system_fingerprint': 'fp_611b667b19',
 'finish_reason': 'stop',
 'logprobs': None}