# LLM as a judge for Summarization Engine - version 2.0

Improvements to tackle in this version:
- produce a real evaluation (not just labelling everything as 1/good)
- have the judge give its reasoning whenever it marks a criterion as bad

In [81]:
import json
from pprint import pprint

import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from dotenv import load_dotenv

# Load API keys from .env file
load_dotenv()

True

In [93]:
# Version tag of the patient transcript / summary files to evaluate.
# It is interpolated into the data file names built later in the notebook.
transcripts_version = 1.0

# Judge model. Alternative model names are kept here for reference:
#   "gpt-3.5-turbo-0125"  (release date: 2024-01-25)
#   "gpt-3.5-turbo-1106"  (release date: 2023-11-06)
model_name = "gpt-4o-2024-05-13"  # release date: 2024-05-13

# Chat model instance shared by every cell that calls the LLM; temperature is pinned to 0.0.
model = ChatOpenAI(
    model_name=model_name,
    temperature=0.0,
)

## System Prompt

In [64]:
# Building blocks of the judge prompt: criteria, output-format instructions, message templates.

# Shared wording for the "is this section present?" criteria; {topic} is filled in per criterion.
presence_description = "Does the SUMMARY contain the presence of the patient's {topic}? It doesn't have to agree or match with TRANSCRIPT."

# Criterion name (used as the CSV column name) -> question the judge answers with 1 or 0.
# Insertion order matters: it is the order in which the criteria are listed in the prompt.
judge_criteria = {
    # --- content of the summary ---
    "intro_patient": "Does the SUMMARY introduce patient by name?",
    # presence checks: the topic text is the criterion name with underscores turned into spaces
    **{
        criterion: presence_description.format(topic=criterion.replace("_", " "))
        for criterion in ("current_symptoms", "vital_signs", "medications")
    },
    "summary_overview": "Does the SUMMARY include an overview of the content of the TRANSCRIPT",

    # --- accuracy of the summary against the transcript ---
    "symptoms_agree": "Do the symptoms in the SUMMARY and TRANSCRIPT agree?",
    "vital_signs_agree": "Do the vital signs in the SUMMARY and TRANSCRIPT agree?",
    "meds_agree": "Do the meds in the SUMMARY and TRANSCRIPT agree?",

    # --- quality of the summary ---
    # currently disabled:
    # "formatting": "Is the formatting of the SUMMARY consistent and appropriate?",
    # "language_fitting": "Is the language in the SUMMARY fitting for the audience (doctors)?",
    "no_diagnose": "The SUMMARY is free from interpretation of results (avoided words like 'stable') and is free from diagnosis. Narration of patient's words is allowed (like 'patient thinks that they have...').",
}

# Output instructions, variant 1: bare CSV row of 1/0 values.
output_csv_format = "Generate a CSV row with the appropriate 1 or 0 for each criteria in the order specified below."

# Output instructions, variant 2: one "name,value,reasoning" line per criterion plus a closing observation line.
output_reasoning_format = """In separate lines, state each criteria's value (1 or 0) and briefly explain your reasoning if it's a 0. When explaining reasoning, be very specific and please refer to texts in SUMMARY that is the offender. If it's a 1 (yes), leave the reasoning empty.
Lastly, in one last new line, please provide any short additional observations or suggestions for improvement (1 sentence), but do not repeat evaluation points previously made.
For example:
intro_patient,1,""
current_symptoms,0,"No symptoms are reported in SUMMARY"
vital_signs_agree,0,"Heart rate in SUMMARY is 130, but in TRANSCRIPT it's 131"
meds_agree,0,"Vitamins reported in SUMMARY is not in TRANSCRIPT"
write your one-sentence observation/improvement here
"""

# System message template: {output_format} is substituted later with one of the two
# variants above; the criteria list ("name, description" per line) is appended here.
system_message_summary_judge = (
    "You are evaluating a summarization engine that has generated a SUMMARY of a doctor-patient dialogue TRANSCRIPT based on a set of criteria. Your evaluation will consist of answering specific questions about the SUMMARY with 1 (Yes) and 0 (No) responses. The SUMMARY quality will depend on the TRANSCRIPT.\n"
    "{output_format}\n"
    "\n"
    "CRITERIA (CSV column names, then a description):\n"
) + "\n".join(f"{name}, {description}" for name, description in judge_criteria.items())

# Human message template carrying the two documents under evaluation.
human_message_summary_judge = (
    "\n"
    "TRANSCRIPT: {transcript}\n"
    "\n"
    "SUMMARY: {summary}\n"
)

# took out the following:
# 13. Any additional observations or suggestions for improvement?
#     - Are there any additional observations or suggestions for improvement?

In [65]:
# Preview the system prompt with the reasoning-style output instructions
# substituted for {output_format} (the same substitution the evaluation loop uses).
pprint(system_message_summary_judge.format(output_format=output_reasoning_format))

('You are evaluating a summarization engine that has generated a SUMMARY of a '
 'doctor-patient dialogue TRANSCRIPT based on a set of criteria. Your '
 'evaluation will consist of answering specific questions about the SUMMARY '
 'with 1 (Yes) and 0 (No) responses. The SUMMARY quality will depend on the '
 'TRANSCRIPT.\n'
 "In separate lines, state each criteria's value (1 or 0) and briefly explain "
 "your reasoning if it's a 0. When explaining reasoning, be very specific and "
 "please refer to texts in SUMMARY that is the offender. If it's a 1 (yes), "
 'leave the reasoning empty.\n'
 'Lastly, in one last new line, please provide any short additional '
 'observations or suggestions for improvement (1 sentence), but do not repeat '
 'evaluation points previously made.\n'
 'For example:\n'
 'intro_patient,1,""\n'
 'current_symptoms,0,"No symptoms are reported in SUMMARY"\n'
 'vital_signs_agree,0,"Heart rate in SUMMARY is 130, but in TRANSCRIPT it\'s '
 '131"\n'
 'meds_agree,0,"Vitami

## Import Transcript & Summary Files

In [66]:
# Input and output locations for this run; every file name is keyed by `transcripts_version`.
transcripts_json_file_path = f"../../data/patients/patients_{transcripts_version}_with_transcripts.json"
summaries_json_file_path = f"../../data/patients/patients_{transcripts_version}_summaries.json"

# Destination of the evaluation table (make sure it is a file, not a directory)
csv_file_path = f"../../data/evaluations/summaries_{transcripts_version}_evaluation_2.csv"

# Read both JSON files explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text on some systems.
with open(transcripts_json_file_path, 'r', encoding="utf-8") as json_file:
    transcripts = json.load(json_file)

with open(summaries_json_file_path, 'r', encoding="utf-8") as json_file:
    summaries = json.load(json_file)

# # -----
# # for testing only; comment when not needed -- only try to generate one summary
# sample_patient_idx = 3
# transcripts = {k: v for k, v in transcripts.items() if k == list(transcripts.keys())[sample_patient_idx]}
# summaries = {k: v for k, v in summaries.items() if k == list(transcripts.keys())[0]}
# sole_patient_id = list(summaries.keys())[0]

# # tinker -- corrupt the summary to test the evaluator
# summaries[sole_patient_id]["summary"]["Current Medications"] = "Furosemide and Spironolactone"
# summaries[sole_patient_id]["summary"]["Current Symptoms"] += "\nNose bleeding"
# summaries[sole_patient_id]["summary"]["Vital Signs"] = 'Temperature: 98.0 degrees\nHeart Rate: 63 beats per minute\nRespiratory Rate: 23 breaths per minute\nOxygen Saturation: 94.0%\nBlood Pressure: 123/56\nWeight: N/A'
# summaries[sole_patient_id]["summary"]["Summary"] += " Patient is unlikely to relapse."
# # -----

In [67]:
# Function to validate and parse the response

# Example response
#  'intro_patient,1,""\n'
#  'current_symptoms,1,""\n'
#  'symptoms_agree,0,"Nose bleeding was mentioned in the summary, but not in the transcript."\n'

# Desired output
# {"intro_patient": {"value": 1, "reasoning": ""}, "current_symptoms": {"value": 1, "reasoning": ""}, ...}

def parse_response(response_content: str, expected_fields=None):
    """Parse the judge's line-oriented answer into a dictionary.

    The expected input is one ``name,value,"reasoning"`` line per criterion,
    followed by free-text observations.

    Args:
        response_content: Raw text returned by the judge model.
        expected_fields: Number of criterion lines to expect. When ``None``
            (the default) it is ``len(judge_criteria)``, looked up at call
            time so that edits to ``judge_criteria`` take effect without
            re-running this cell.

    Returns:
        On success, ``{name: {"value": int, "reasoning": str}, ...,
        "observations": str}``. On failure, ``{"error": <message>}``: either
        ``"Invalid response count"`` when fewer than ``expected_fields``
        non-blank lines are present, or a message quoting the offending line
        when a criterion line cannot be parsed.
    """
    if expected_fields is None:
        expected_fields = len(judge_criteria)

    # splitlines() copes with both "\n" and "\r\n" line endings.
    raw_lines = response_content.splitlines()

    # Take the first `expected_fields` non-blank lines as criterion lines; blank
    # separator lines must not count, otherwise real criteria end up in the observations.
    criteria_lines = []
    observations_start = len(raw_lines)
    for index, raw_line in enumerate(raw_lines):
        if len(criteria_lines) == expected_fields:
            observations_start = index
            break
        stripped = raw_line.strip()
        if stripped:
            criteria_lines.append(stripped)

    if len(criteria_lines) != expected_fields:
        return {"error": "Invalid response count"}

    response_dict = {}
    for line in criteria_lines:
        # split on the first two commas only; the reasoning text may itself contain commas
        parts = [part.strip() for part in line.split(",", 2)]
        if len(parts) < 2:
            return {"error": f"Malformed criteria line: {line!r}"}
        try:
            value = int(parts[1])
        except ValueError:
            return {"error": f"Malformed criteria line: {line!r}"}
        # the reasoning field is optional; a missing one is treated as empty
        reasoning = parts[2].strip('"') if len(parts) == 3 else ""
        response_dict[parts[0]] = {"value": value, "reasoning": reasoning}

    # everything after the criterion lines is the observations text
    response_dict["observations"] = "\n".join(raw_lines[observations_start:]).strip()

    return response_dict

In [68]:
# Column layout of the results table: for each criterion, a value column (same name)
# followed by a reasoning column (suffix _reasoning), then the free-text observations.
column_order = ["transcript_number"]
for criteria in judge_criteria.keys():
    column_order.append(criteria)
    column_order.append(f"{criteria}_reasoning")
column_order.append("observations")

# The prompt is identical for every patient, so it is built once, outside the loop.
prompt = (
    SystemMessage(content=system_message_summary_judge.format(output_format=output_reasoning_format))
    + human_message_summary_judge
)

all_rows_series: list[pd.Series] = []
skipped_patients: dict[str, str] = {}  # patient number -> reason no row was produced

# Loop through each transcript number, invoke the model, and collect one row per patient
for patient_number in transcripts.keys():
    # A patient with a transcript but no summary cannot be judged; record and move on.
    if patient_number not in summaries:
        skipped_patients[patient_number] = "no summary found"
        continue

    transcript = transcripts[patient_number]['chat_transcript']
    summary = summaries[patient_number]['summary']

    # Get the response
    response = model.invoke(prompt.format_messages(transcript=transcript, summary=summary))
    response_dict = parse_response(response.content)

    # parse_response signals failure with an {"error": ...} dict that has no "observations" key
    if "observations" not in response_dict:
        skipped_patients[patient_number] = response_dict.get("error", "unparseable response")
        continue

    # Flatten the parsed response into one table row
    row_to_add = {
        "transcript_number": patient_number,
        **{k: v["value"] for k, v in response_dict.items() if k in judge_criteria.keys()},
        **{f"{k}_reasoning": v["reasoning"] for k, v in response_dict.items() if k in judge_criteria.keys()},
        "observations": response_dict["observations"]
    }
    all_rows_series.append(pd.Series(row_to_add))

# create dataframe; reindex (rather than plain column selection) keeps the full column
# layout even when a criterion is missing from every row or no rows were produced
df = pd.DataFrame(all_rows_series).reindex(columns=column_order)

if skipped_patients:
    print(f"Skipped {len(skipped_patients)} patient(s): {skipped_patients}")

display(df)

# Write the dataframe to a CSV file
df.to_csv(csv_file_path, index=False)
print(f"CSV file has been created at: {csv_file_path}")

Unnamed: 0,transcript_number,intro_patient,intro_patient_reasoning,current_symptoms,current_symptoms_reasoning,vital_signs,vital_signs_reasoning,medications,medications_reasoning,summary_overview,...,symptoms_agree_reasoning,vital_signs_agree,vital_signs_agree_reasoning,meds_agree,meds_agree_reasoning,formatting,formatting_reasoning,no_diagnose,no_diagnose_reasoning,observations
0,12305811,1,,1,,1,,1,,1,...,,1,,1,,1,,0,The phrase 'Kevin Morris is mainly experiencin...,The summary is comprehensive and well-structur...
1,14185111,1,,1,,1,,1,,1,...,SUMMARY mentions 'Fatigue' which is not explic...,1,,1,,1,,0,SUMMARY states 'Vital signs are within normal ...,The SUMMARY should avoid adding symptoms not e...
2,10339317,1,,1,,1,,1,,1,...,,1,,1,,1,,0,"The SUMMARY states 'Overall, his symptoms are ...",The summary should avoid any interpretation of...
3,14807966,1,,1,,1,,1,,1,...,,1,,1,,1,,1,,The summary is comprehensive and well-structur...
4,13912736,1,,1,,1,,1,,1,...,The SUMMARY does not mention lightheadedness u...,1,,1,,1,,1,,The SUMMARY should include the symptom of ligh...
5,15338322,1,,1,,1,,1,,1,...,,1,,1,,1,,0,The phrase 'The doctor advised him to continue...,The summary is comprehensive and well-structur...
6,13166275,1,,1,,1,,1,,1,...,,0,"Heart rate in SUMMARY is 122, but in TRANSCRIP...",1,,0,"Oxygen Saturation is listed as N/A in SUMMARY,...",0,The SUMMARY states 'Vital signs show a stable ...,Ensure all vital signs are accurately reported...
7,18136989,1,,1,,1,,1,,1,...,,1,,1,,0,The formatting is inconsistent; 'Current Sympt...,0,The phrase 'Vital signs are stable' implies a ...,Consider removing unnecessary newline characte...
8,15345003,1,,1,,1,,1,,1,...,,1,,0,The SUMMARY lists 'Spironolactone' as a single...,1,,0,The SUMMARY states 'Vital signs are within nor...,The summary should avoid interpretations like ...
9,17707918,1,,1,,1,,1,,1,...,,1,,1,,1,,0,The phrase 'Vital signs are within normal limi...,The summary is comprehensive and well-structur...


CSV file has been created at: ../data/evaluations/summaries_1.0_evaluation_2.csv


In [69]:
# Sanity check of the parser on a single answer: `response` is the module-level
# variable left behind by the most recent model call.
parse_response(response.content)

{'intro_patient': {'value': 1, 'reasoning': ''},
 'current_symptoms': {'value': 1, 'reasoning': ''},
 'vital_signs': {'value': 1, 'reasoning': ''},
 'medications': {'value': 1, 'reasoning': ''},
 'summary_overview': {'value': 1, 'reasoning': ''},
 'symptoms_agree': {'value': 1, 'reasoning': ''},
 'vital_signs_agree': {'value': 1, 'reasoning': ''},
 'meds_agree': {'value': 1, 'reasoning': ''},
 'formatting': {'value': 1, 'reasoning': ''},
 'no_diagnose': {'value': 0,
  'reasoning': "The phrase 'Vital signs are within normal limits' implies an interpretation of the results."},
 'observations': "The summary is comprehensive and well-structured, but it should avoid any interpretation of the patient's condition."}

In [60]:
# Resource usage of a single model call (whatever `response` currently holds),
# taken as a rough indication of the typical per-summary cost; it is not an average.
response.response_metadata

{'token_usage': {'completion_tokens': 209,
  'prompt_tokens': 2521,
  'total_tokens': 2730},
 'model_name': 'gpt-4o-2024-05-13',
 'system_fingerprint': 'fp_d33f7b429e',
 'finish_reason': 'stop',
 'logprobs': None}

In [None]:
from pprint import pprint

# Spot-check one patient: show the transcript next to its generated summary.
patient_id = "13727153"

# The ID is hard-coded, so guard against it being absent from the loaded data
# instead of failing with a KeyError.
if patient_id in transcripts and patient_id in summaries:
    pprint(transcripts[patient_id]["chat_transcript"])
    pprint(summaries[patient_id]["summary"])
else:
    available_ids = sorted(set(transcripts) & set(summaries))
    print(f"Patient {patient_id!r} is not present in both transcripts and summaries; available IDs: {available_ids}")

In [75]:
# viewer: open the csv file as a pandas dataframe

import pandas as pd

# view options -- max number of columns and column width
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 300)

# Keep patient IDs as strings: they are string keys in the JSON inputs, and letting
# read_csv infer an integer dtype would change their type (and drop any leading zeros).
df = pd.read_csv(csv_file_path, dtype={"transcript_number": str})

display(df)

Unnamed: 0,transcript_number,intro_patient,intro_patient_reasoning,current_symptoms,current_symptoms_reasoning,vital_signs,vital_signs_reasoning,medications,medications_reasoning,summary_overview,summary_overview_reasoning,symptoms_agree,symptoms_agree_reasoning,vital_signs_agree,vital_signs_agree_reasoning,meds_agree,meds_agree_reasoning,formatting,formatting_reasoning,no_diagnose,no_diagnose_reasoning,observations
0,12305811,1,,1,,1,,1,,1,,1,,1,,1,,1,,0,The phrase 'Kevin Morris is mainly experiencing tiredness and leg swelling' implies a diagnosis.,"The summary is comprehensive and well-structured, but it should avoid any language that implies a diagnosis."
1,14185111,1,,1,,1,,1,,1,,0,SUMMARY mentions 'Fatigue' which is not explicitly stated in the TRANSCRIPT.,1,,1,,1,,0,"SUMMARY states 'Vital signs are within normal limits,' which is an interpretation.",The SUMMARY should avoid adding symptoms not explicitly mentioned in the TRANSCRIPT and refrain from interpreting the patient's vital signs.
2,10339317,1,,1,,1,,1,,1,,1,,1,,1,,1,,0,"The SUMMARY states 'Overall, his symptoms are well managed,' which is an interpretation of results.",The summary should avoid any interpretation of the patient's condition and stick to reporting the facts.
3,14807966,1,,1,,1,,1,,1,,1,,1,,1,,1,,1,,"The summary is comprehensive and well-structured, but it could benefit from a brief mention of the patient's inability to provide weight."
4,13912736,1,,1,,1,,1,,1,,0,"The SUMMARY does not mention lightheadedness upon standing quickly, which is reported in the TRANSCRIPT.",1,,1,,1,,1,,The SUMMARY should include the symptom of lightheadedness upon standing quickly as reported by the patient.
5,15338322,1,,1,,1,,1,,1,,1,,1,,1,,1,,0,"The phrase 'The doctor advised him to continue his medication regimen and monitor symptoms closely' implies medical advice, which can be seen as an interpretation.","The summary is comprehensive and well-structured, but it should avoid including medical advice to maintain objectivity."
6,13166275,1,,1,,1,,1,,1,,1,,0,"Heart rate in SUMMARY is 122, but in TRANSCRIPT it's 125.",1,,0,"Oxygen Saturation is listed as N/A in SUMMARY, but it is provided in TRANSCRIPT as 94.0 percent.",0,"The SUMMARY states 'Vital signs show a stable temperature,' which is an interpretation.",Ensure all vital signs are accurately reported and avoid any interpretation of results in the SUMMARY.
7,18136989,1,,1,,1,,1,,1,,1,,1,,1,,0,The formatting is inconsistent; 'Current Symptoms' and 'Current Medications' sections have unnecessary newline characters.,0,The phrase 'Vital signs are stable' implies a diagnosis.,Consider removing unnecessary newline characters and avoiding diagnostic language to improve the summary.
8,15345003,1,,1,,1,,1,,1,,1,,1,,0,"The SUMMARY lists 'Spironolactone' as a single medication, but the TRANSCRIPT specifies 'two types of spironolactone'.",1,,0,"The SUMMARY states 'Vital signs are within normal limits', which is an interpretation.","The summary should avoid interpretations like ""Vital signs are within normal limits"" to maintain objectivity."
9,17707918,1,,1,,1,,1,,1,,1,,1,,1,,1,,0,The phrase 'Vital signs are within normal limits' implies an interpretation of results.,"The summary is comprehensive and well-structured, but it should avoid any interpretation of the patient's condition."


## Idea: prompt improvement

Can we take learnings from this round of evals to improve the original prompt?

In [94]:
# Prompt of the summarization engine under evaluation, embedded below as the
# "ORIGINAL PROMPT" that the model is asked to revise.
# NOTE(review): presumably a verbatim copy of the engine's real prompt -- confirm against its source.
# This is a plain (non-f) string and .format() is never called on it in this notebook, so the
# doubled braces {{ }} reach the model literally.
# NOTE(review): the `"summary: ""` entry in the example dictionary has a misplaced quote; it is left
# untouched here on the assumption that it mirrors the engine's prompt -- confirm.
original_summarization_engine_prompt = """You are a medical assistant tasked with reviewing a transcript of a conversation between a patient and their doctor. You will be provided a transcript. The doctor has asked you to write up a summary of the transcript in the format outlined below. Return your summary as a Python dictionary as follows: {{"patient_overview": "", "current_symptoms": "", "vital_signs": "", "current_medications": "", "summary: ""}}. Ensure the output is in proper dictionary format. The value for each key is a string which contains the text of the summary, including new line characters where appropriate. Add context to symptoms where appropriate, but be brief. List specific medications by name under the appropriate medication category. Do not add any information that is not present in the transcript.

"patient_overview":
    Write a one sentence summary like "[Patient Name] is experiencing [primary symptom or chief complaint]"

"current_symptoms" (Note: Separate each symptom with a new line. Determine if the patient is experiencing any of the following: Dyspnea, Paroxysmal Nocturnal Dyspnea (PND), Orthopnea, Edema, Nocturnal Cough, Chest Pain, Fatigue, Sudden Change in Mental Status.):
    List the symptoms the patient is currently experiencing

"vital_signs" (Note: Separate each vital sign with a new line. Put N/A if not reported in transcript):
    Temperature:
    Heart Rate:
    Respiratory Rate:
    Oxygen Saturation:
    Blood Pressure:
    Weight:

"current_medications" (Note: separate each medication with a new line):
    List the medications the patient is taking.

"summary":
    At a high level, summarize a few key points from the transcript. Include the symptoms that the patient confirms, and the symptoms that the patient denies. Do not list vital sign details in this section.
"""

# Inputs to the prompt-improvement request:
#   individual learnings: df["observations"]
#   original prompt:      original_summarization_engine_prompt

# System message: the task description plus the original prompt, fenced for clarity.
improvement_prompt_text = f"""You are tasked with improving a summarization engine's prompt, which generates summaries of doctor-patient dialogues. You will be given a list of learnings generated from an automated evaluation of the engine's summaries. Your task is to provide a revised prompt that addresses the learnings. Return your revised prompt as a string.

ORIGINAL PROMPT:
```
{original_summarization_engine_prompt}
```
"""
improvement_prompt = SystemMessage(content=improvement_prompt_text)

# One learning per line. Missing observations (NaN after a CSV round-trip) are dropped
# and the rest coerced to str, because str.join() raises TypeError on non-string items.
learnings_message = HumanMessage(
    content="LEARNINGS:\n" + "\n".join(df["observations"].dropna().astype(str))
)

additional_instructions = HumanMessage(content="""Additionally, apply JSON best practices to keep the outputs processable by downstream systems. After generating the new prompt, please summarize the key changes you made to the prompt under a "KEY CHANGES" section in the response.""")

# Get the response. The three messages are already fully rendered, so they are passed
# to the chat model directly as a list.
response = model.invoke([improvement_prompt, learnings_message, additional_instructions])

In [99]:
# Show the model's answer to the prompt-improvement request; print() (rather than a
# bare expression) renders the newlines in the text instead of showing them escaped.
print(response.content)

```
You are a medical assistant tasked with reviewing a transcript of a conversation between a patient and their doctor. You will be provided a transcript. The doctor has asked you to write up a summary of the transcript in the format outlined below. Return your summary as a JSON object as follows: {"patient_overview": "", "current_symptoms": "", "vital_signs": "", "current_medications": "", "summary": ""}. Ensure the output is in proper JSON format. The value for each key is a string which contains the text of the summary, including new line characters where appropriate. Add context to symptoms where appropriate, but be brief. List specific medications by name under the appropriate medication category. Do not add any information that is not present in the transcript. Avoid any language that implies a diagnosis or interpretation of the patient's condition. Stick to reporting the facts.

"patient_overview":
    Write a one sentence summary like "[Patient Name] is experiencing [primary s

In [100]:
# Resource usage and model details of the most recent model call held in `response`.
response.response_metadata

{'token_usage': {'completion_tokens': 579,
  'prompt_tokens': 886,
  'total_tokens': 1465},
 'model_name': 'gpt-4o-2024-05-13',
 'system_fingerprint': 'fp_d33f7b429e',
 'finish_reason': 'stop',
 'logprobs': None}