# LLM as a judge for Summarization Engine - version 1.1

Improvements targeted in this version:
- produce a real evaluation (the judge should not simply label every criterion as 1/good)
- have the judge give its reasoning whenever it marks a criterion as bad (0)

In [118]:
import json
from pprint import pprint

import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from dotenv import load_dotenv

# Load API keys from .env file
load_dotenv()

True

In [137]:
# Judge model configuration

# Dated OpenAI snapshot used as the judge. Alternatives kept for quick switching:
#   "gpt-3.5-turbo-1106"  (release date: 2023-11-06)
#   "gpt-4o-2024-05-13"   (release date: 2024-05-13)
model_name = "gpt-3.5-turbo-0125"  # release date: 2024-01-25

# Chat model instance that every evaluation call goes through
model = ChatOpenAI(model_name=model_name, temperature=0.0)

# Version tag of the patient data set; it is interpolated into the
# transcript / summary / evaluation file names further down
transcripts_version = 1.0

## System Prompt

In [138]:
# Prompt building blocks for the summary judge

# Shared wording for the criteria that only check whether a topic is present in the SUMMARY
presence_description = "Does the SUMMARY contain the presence of the patient's {topic}? It doesn't have to agree with TRANSCRIPT."
# Shared wording for the criteria that compare a topic between SUMMARY and TRANSCRIPT
_agreement_description = "Do the {topic} in the SUMMARY and TRANSCRIPT agree?"

# Criterion name (doubles as the CSV column name) -> question put to the judge.
# Insertion order is significant: it is the order in which the criteria are
# listed in the system prompt.
judge_criteria = {
    # summary content
    "intro_patient": "Does the SUMMARY introduce patient by name?",
    **{
        name: presence_description.format(topic=topic)
        for name, topic in [
            ("current_symptoms", "current symptoms"),
            ("vital_signs", "vital signs"),
            ("medications", "medications"),
        ]
    },
    "summary_overview": "Does the SUMMARY include an overview of the content of the TRANSCRIPT",

    # accuracy of summary
    **{
        name: _agreement_description.format(topic=topic)
        for name, topic in [
            ("symptoms_agree", "symptoms"),
            ("vital_signs_agree", "vital signs"),
            ("meds_agree", "meds"),
        ]
    },

    # quality of summary
    "formatting": "Is the formatting of the SUMMARY consistent and appropriate?",
    # "language_fitting": "Is the language in the SUMMARY fitting for the audience (doctors)?",
    "no_diagnose": "The SUMMARY is free from interpretation of results (avoided words like 'stable') and is free from diagnosis. Narration of patient's words is allowed (like 'patient thinks that they have...').",
}

# One "<name>, <question>" line per criterion, in dictionary order
_criteria_listing = "\n".join(f"{name}, {question}" for name, question in judge_criteria.items())

# System prompt template; {output_format} is filled in later with one of the
# output_*_format strings defined below
system_message_summary_judge = (
    "You are evaluating a summarization engine that has generated a SUMMARY of a doctor-patient dialogue TRANSCRIPT based on a set of criteria. Your evaluation will consist of answering specific questions about the SUMMARY with 1 (Yes) and 0 (No) responses. The SUMMARY quality will depend on the TRANSCRIPT.\n"
    "{output_format}\n"
    "\n"
    "Criteria (CSV column names, then a description):\n"
    + _criteria_listing
)

# Output instruction variant: a single CSV row of 1/0 values
output_csv_format = "Generate a CSV row with the appropriate 1 or 0 for each criteria in the order specified below."

# Output instruction variant: one "name,value,reasoning" line per criterion
output_reasoning_format = (
    "In separate lines, state each criteria's value (1 or 0) and briefly explain your reasoning if it's a 0. For example:\n"
    'intro_patient,1,""\n'
    'current_symptoms,0,"Dyspnea was not mentioned in the summary"\n'
)

# Human message template carrying the material to be judged
human_message_summary_judge = "\nTRANSCRIPT: {transcript}\n\nSUMMARY: {summary}\n"

# Removed from the criteria list:
# 13. Any additional observations or suggestions for improvement?
#     - Are there any additional observations or suggestions for improvement?

In [139]:
# Show the assembled system prompt; {output_format} is still an unfilled placeholder at this point
pprint(system_message_summary_judge)

('You are evaluating a summarization engine that has generated a SUMMARY of a '
 'doctor-patient dialogue TRANSCRIPT based on a set of criteria. Your '
 'evaluation will consist of answering specific questions about the SUMMARY '
 'with 1 (Yes) and 0 (No) responses. The SUMMARY quality will depend on the '
 'TRANSCRIPT.\n'
 '{output_format}\n'
 '\n'
 'Criteria (CSV column names, then a description):\n'
 'intro_patient, Does the SUMMARY introduce patient by name?\n'
 "current_symptoms, Does the SUMMARY contain the presence of the patient's "
 "current symptoms? It doesn't have to agree with TRANSCRIPT.\n"
 "vital_signs, Does the SUMMARY contain the presence of the patient's vital "
 "signs? It doesn't have to agree with TRANSCRIPT.\n"
 "medications, Does the SUMMARY contain the presence of the patient's "
 "medications? It doesn't have to agree with TRANSCRIPT.\n"
 'summary_overview, Does the SUMMARY include an overview of the content of the '
 'TRANSCRIPT\n'
 'symptoms_agree, Do the sy

## Import Transcript & Summary Files

In [140]:
# Specify the path to your JSON file
transcripts_json_file_path = f"../data/patients/patients_{transcripts_version}_with_transcripts.json"

# Open and read the JSON file.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters.
with open(transcripts_json_file_path, 'r', encoding="utf-8") as json_file:
    transcripts = json.load(json_file)

# Specify the path to your summaries JSON file
summaries_json_file_path = f"../data/patients/patients_{transcripts_version}_summaries.json"

# Open and read the JSON file (same explicit encoding as above)
with open(summaries_json_file_path, 'r', encoding="utf-8") as json_file:
    summaries = json.load(json_file)

# Specify the CSV file path (make sure it is a file, not a directory)
csv_file_path = f"../data/evaluations/summaries_{transcripts_version}_evaluation_2.csv"

# -----
# for testing only; comment when not needed -- only try to generate one summary
sample_patient_idx = 1

# Narrow both dictionaries down to the single patient at position sample_patient_idx
selected_patient_id = list(transcripts)[sample_patient_idx]
transcripts = {selected_patient_id: transcripts[selected_patient_id]}
summaries = {
    patient_id: entry
    for patient_id, entry in summaries.items()
    if patient_id == selected_patient_id
}
sole_patient_id = list(summaries)[0]

# tinker -- corrupt the summary to test the evaluator
tampered_summary = summaries[sole_patient_id]["summary"]  # alias; edits below modify the loaded summary in place
tampered_summary["Current Medications"] = "Furosemide and Spironolactone"
tampered_summary["Current Symptoms"] += "\nNose bleeding"
tampered_summary["Vital Signs"] = "\n".join([
    'Temperature: 98.0 degrees',
    'Heart Rate: 63 beats per minute',
    'Respiratory Rate: 23 breaths per minute',
    'Oxygen Saturation: 94.0%',
    'Blood Pressure: 123/56',
    'Weight: N/A',
])
tampered_summary["Summary"] += " Patient is unlikely to relapse."
# -----

In [141]:
# Function to validate and parse the response

# Example response
#  'intro_patient,1,""\n'
#  'current_symptoms,1,""\n'
#  'symptoms_agree,0,"Nose bleeding was mentioned in the summary, but not in the transcript."\n'

# Desired output
# {"intro_patient": {"value": 1, "reasoning": ""}, "current_symptoms": {"value": 1, "reasoning": ""}, ...}

def parse_response(response_content: str, expected_fields=None) -> dict:
    """Parse the judge's line-based answer into a per-criterion dictionary.

    Each non-blank line is expected to look like ``name,value,"reasoning"``.
    Only the first two commas separate fields, so the reasoning text may
    itself contain commas.

    Args:
        response_content: Raw text returned by the model.
        expected_fields: Number of criterion lines the response must contain.
            When None (the default), ``len(judge_criteria)`` is looked up at
            call time, so re-running the criteria cell is picked up without
            redefining this function.

    Returns:
        ``{name: {"value": int, "reasoning": str}, ...}`` on success, or
        ``{"error": "<message>"}`` when the number of lines is wrong or a line
        cannot be parsed.
    """
    if expected_fields is None:
        expected_fields = len(judge_criteria)

    # Drop blank lines BEFORE counting: a trailing newline or an empty line
    # between entries must not be reported as a wrong number of answers.
    lines = [line.strip() for line in (response_content or "").splitlines() if line.strip()]
    if len(lines) != expected_fields:
        return {"error": "Invalid response count"}

    response_dict = {}
    for line in lines:
        # split by first two commas, but keep the rest of the string
        parts = line.split(",", 2)
        if len(parts) < 2:
            return {"error": f"Invalid response line: {line!r}"}

        name = parts[0].strip()
        try:
            value = int(parts[1])
        except ValueError:
            return {"error": f"Invalid value in response line: {line!r}"}

        # The reasoning is optional; tolerate spaces around the surrounding quotes
        reasoning = parts[2].strip().strip('"') if len(parts) > 2 else ""
        response_dict[name] = {"value": value, "reasoning": reasoning}
    return response_dict

In [142]:
# Column layout of the output table: the patient id first, then for each
# criterion a value column (same name) followed by a reasoning column
# (suffix _reasoning)
column_order = ["transcript_number"]
for criteria in judge_criteria.keys():
    column_order.append(criteria)
    column_order.append(f"{criteria}_reasoning")

# The prompt template is the same for every patient (only the {transcript} and
# {summary} placeholders change), so build it once instead of per iteration
prompt = (
    SystemMessage(content=system_message_summary_judge.format(output_format=output_reasoning_format))
    + human_message_summary_judge
)

all_rows_series: list[pd.Series] = []
# Loop through each transcript number, invoke the model, and collect the results
for patient_number in transcripts.keys():
    # A transcript without a matching summary cannot be judged; skip it rather
    # than fail with a KeyError on the summaries lookup
    if patient_number not in summaries:
        print(f"Skipping patient {patient_number}: no summary found")
        continue

    transcript = transcripts[patient_number]['chat_transcript']
    summary = summaries[patient_number]['summary']

    # Get the response
    response = model.invoke(prompt.format_messages(transcript=transcript, summary=summary))
    try:
        response_dict = parse_response(response.content)
    except (IndexError, ValueError) as exc:
        # One malformed model answer must not abort the whole evaluation run
        response_dict = {"error": f"Unparseable response: {exc}"}

    row_to_add = {"transcript_number": patient_number}
    if "error" in response_dict:
        # Keep a row holding only the patient id so the failed evaluation
        # stays visible (as empty cells) in the resulting table and CSV
        print(f"Could not parse response for patient {patient_number}: {response_dict['error']}")
    else:
        row_to_add.update({k: v["value"] for k, v in response_dict.items()})
        row_to_add.update({f"{k}_reasoning": v["reasoning"] for k, v in response_dict.items()})
    all_rows_series.append(pd.Series(row_to_add))

# create dataframe; reindex (instead of plain column selection) fills criteria
# the model did not return with NaN rather than raising a KeyError, and it
# also copes with an empty result list
df = pd.DataFrame(all_rows_series).reindex(columns=column_order)

display(df)

# Write the dataframe to a CSV file
df.to_csv(csv_file_path, index=False)
print(f"CSV file has been created at: {csv_file_path}")

Unnamed: 0,transcript_number,intro_patient,intro_patient_reasoning,current_symptoms,current_symptoms_reasoning,vital_signs,vital_signs_reasoning,medications,medications_reasoning,summary_overview,...,symptoms_agree,symptoms_agree_reasoning,vital_signs_agree,vital_signs_agree_reasoning,meds_agree,meds_agree_reasoning,formatting,formatting_reasoning,no_diagnose,no_diagnose_reasoning
0,14185111,1,,0,Nose bleeding was not mentioned in the summary,1,,1,,1,...,0,The summary mentions fatigue which was not pre...,0,The heart rate in the summary is 63 beats per ...,0,The summary mentions Furosemide and Spironolac...,1,,1,


CSV file has been created at: ../data/evaluations/summaries_1.0_evaluation_2.csv


In [145]:
# what's an average response resource usage?
# `response` is whatever the evaluation loop assigned last, so this shows the
# metadata (token usage, model name, finish reason) of the final model call only.
response.response_metadata

{'token_usage': {'completion_tokens': 148,
  'prompt_tokens': 2177,
  'total_tokens': 2325},
 'model_name': 'gpt-3.5-turbo-0125',
 'system_fingerprint': None,
 'finish_reason': 'stop',
 'logprobs': None}

In [None]:
# NOTE(review): duplicates the `pprint` import already made in the first cell.
from pprint import pprint

# Print the transcript and the summary of the single sampled patient so the
# judge's verdicts can be checked by hand.
# NOTE(review): `sole_patient_id` is only defined by the testing-only block in
# the file-loading cell; this cell fails if that block is commented out.
pprint(transcripts[sole_patient_id]["chat_transcript"])
pprint(summaries[sole_patient_id]["summary"])

In [144]:
# viewer: open the csv file as a pandas dataframe

# NOTE(review): duplicates the `pandas` import already made in the first cell.
import pandas as pd

# Re-read the evaluation results from disk; this rebinds `df`, replacing the
# in-memory frame built by the evaluation cell with the round-tripped CSV.
df = pd.read_csv(csv_file_path)

display(df)

Unnamed: 0,transcript_number,intro_patient,intro_patient_reasoning,current_symptoms,current_symptoms_reasoning,vital_signs,vital_signs_reasoning,medications,medications_reasoning,summary_overview,...,symptoms_agree,symptoms_agree_reasoning,vital_signs_agree,vital_signs_agree_reasoning,meds_agree,meds_agree_reasoning,formatting,formatting_reasoning,no_diagnose,no_diagnose_reasoning
0,14185111,1,,0,Nose bleeding was not mentioned in the summary,1,,1,,1,...,0,The summary mentions fatigue which was not pre...,0,The heart rate in the summary is 63 beats per ...,0,The summary mentions Furosemide and Spironolac...,1,,1,
