### Objective:

This file implements an evaluation harness that automatically scores an LLM's output across multiple evaluation criteria.
*   Automate the scoring mechanism so it can be re-run as the LLM system evolves
*   Evaluation criteria include accuracy, relevance, completeness, and tone
*   Each criterion is scored on a scale of 1-5

In [48]:
!pip install openai



In [49]:
from dataclasses import dataclass
import openai
import json

In [50]:
from google.colab import userdata
# Read the secret named "OPENAI_API_KEY" through google.colab's userdata helper and
# strip surrounding whitespace. evaluate_response passes this value to the OpenAI client.
openai_api_key = userdata.get("OPENAI_API_KEY").strip()

In [58]:
# Fixture data for the harness: a source document (treated as ground truth by the
# evaluator), a question about it, and three hypothetical LLM responses of varying quality.
SOURCE_DOCUMENT = """
Policy Number: CYB-2024-10234.
This cyber liability policy covers data breaches, ransomware attacks,
and business interruption losses up to $5 million per occurrence.
The retroactive date is January 1, 2022.
Exclusions include: war, nuclear events, and intentional acts by the insured.
Annual premium: $48,000. Renewal date: December 31, 2024.
"""

# The user question every candidate answer below is responding to.
QUESTION = "What does this cyber policy cover and what are its exclusions?"

# A good answer (what we hope the LLM produces): restates the coverage and the
# exclusions exactly as they appear in SOURCE_DOCUMENT.
GOOD_ANSWER = """
This cyber liability policy covers data breaches, ransomware attacks,
and business interruption losses up to $5 million per occurrence.
It excludes war, nuclear events, and intentional acts by the insured.
"""

# A bad answer: vague and incomplete -- it names no specific coverage, limit, or exclusion.
BAD_ANSWER = """
The policy covers various cyber risks and has some exclusions
related to certain events. Coverage is available for incidents.
"""

# A hallucinated answer: states facts that are not in SOURCE_DOCUMENT (a $10 million
# limit, hardware damage, employee theft, acts of God, pandemics).
HALLUCINATED_ANSWER = """
This cyber policy covers data breaches and ransomware attacks up to $10 million.
It also covers physical damage to hardware and employee theft.
Exclusions include acts of God and pandemics.
"""

In [69]:
from dataclasses import dataclass
import openai
import json

@dataclass
class EvaluationClass:
  """Structured container for the scores of one evaluated LLM response.

  The four criterion fields hold the evaluator's 1-5 ratings. ``overall`` is
  their arithmetic mean as computed by ``evaluate_response`` (rounded to two
  decimals), so it is a float and may be fractional (e.g. 2.5).
  """
  accuracy: int             # 1-5 how correct LLM's response is with respect to source document
  relevance: int            # 1-5 is the response relevant to the user's query
  completeness: int         # 1-5 does the response contain all the required information
  tone: int                 # 1-5 is the response language professional
  overall: float            # mean of the four scores above; not necessarily a whole number
  feedback: str             # one-line evaluation summary

def _extract_json_object(text: str) -> str:
  """Return the outermost ``{...}`` span of *text*, or *text* unchanged if there is none."""
  start = text.find("{")
  end = text.rfind("}")
  if start == -1 or end < start:
    return text
  return text[start:end + 1]


def _coerce_score(scores: dict, criterion: str) -> int:
  """Return ``scores[criterion]`` as an int, rejecting anything that is not a whole number in 1-5."""
  raw = scores[criterion]  # a missing criterion surfaces as KeyError
  try:
    number = float(raw)
  except (TypeError, ValueError) as err:
    raise ValueError(f"Evaluator returned a non-numeric {criterion} score: {raw!r}") from err
  if not number.is_integer() or not 1 <= number <= 5:
    raise ValueError(f"Evaluator returned an out-of-range {criterion} score: {raw!r}")
  return int(number)


def evaluate_response(question: str, source_document: str, llm_response: str) -> EvaluationClass:
  """Score an LLM response on four criteria by asking another LLM to grade it.

  The grader is prompted to rate accuracy, relevance, completeness and tone on
  a 1-5 scale against the source document and to reply with a JSON object
  (machine readable and easy to store in a database for tracking).

  Args:
    question: The user question the response is supposed to answer.
    source_document: The ground-truth text the grader must judge against.
    llm_response: The answer being evaluated.

  Returns:
    An EvaluationClass with the four integer scores, their mean rounded to
    two decimals in ``overall``, and the grader's one-line feedback.

  Raises:
    json.JSONDecodeError: If the grader's reply is empty or is not valid JSON.
    KeyError: If the reply lacks one of the expected keys.
    ValueError: If the reply is not a JSON object, or a score is not a whole
      number between 1 and 5.
  """

  # NOTE: literal braces in the JSON example are doubled because this is an f-string.
  prompt = f""" You are an expert evaluator of LLM response insurance domain. Your job is to rate the LLM's
  response using four criteria. Only use original source document for evaluation and DO NOT use external information
  -----
  Question: {question}
  Source Document (ground truth): {source_document}
  Answer to evaluate: {llm_response}
  -----
  Score each answer from 1-5 on below four criteria:
  Accuracy (how factually correct LLM's response is with respect to source document?):
  5 - Perfectly factually correct, no errors
  3 - Partially correct or has some mistakes
  1 - Not correct or has made-up information

  Relevance (does the answer address user query?):
  5 - Very relevant, addresses the user query completely
  3 - Partially addresses the user query
  1 - Does not address the user query

  Completeness (does the answer answer all parts of user question?):
  5 - Addresses the user query completely
  3 - Covers some parts of the user query and not all
  1 - Barely answers the user query

  Tone (does the answer sound professional for insurance context?)
  5 - the tone of the answer is professional and clear
  3 - the tone is somewhat formal but can be improved
  1 - the tone is not at all professional

  Your response should ONLY follow below JSON format and DO NOT add additional information
  {{
    "accuracy": <integer score 1-5>,
    "relevance": <integer score 1-5>,
    "completeness": <integer score 1-5>,
    "tone": <integer score 1-5>,
    "feedback": "<one line sentence describing strength or weakness of the answer>"
  }}
  """

  client = openai.OpenAI(api_key=openai_api_key)
  response = client.chat.completions.create(
      model="gpt-4-turbo",
      messages=[
          {"role": "user", "content": prompt}
      ],
      temperature=0,        # 0 for evaluation since we want consistency
      # JSON mode: ask the API for a syntactically valid JSON object
      # (the prompt mentions "JSON", which this mode requires).
      response_format={"type": "json_object"},
  )

  # content may be None; an empty string makes json.loads raise a clear JSONDecodeError.
  raw_text = response.choices[0].message.content or ""
  # Tolerate code fences or stray prose around the object before parsing.
  output_scores = json.loads(_extract_json_object(raw_text))
  if not isinstance(output_scores, dict):
    raise ValueError(f"Evaluator reply is not a JSON object: {raw_text!r}")

  criteria = {
      name: _coerce_score(output_scores, name)
      for name in ("accuracy", "relevance", "completeness", "tone")
  }
  overall_score = sum(criteria.values()) / len(criteria)

  return EvaluationClass(
      **criteria,
      overall=round(overall_score, 2),
      feedback=output_scores["feedback"],
  )

In [70]:
def run_evaluation_harness(test_cases: list[dict]) -> "list[tuple[str, EvaluationClass]]":
  """Evaluate every test case, print its scores, and return the collected results.

  Args:
    test_cases: Dicts with the keys "label", "question", "source_document"
      and "llm_response". Each one is passed to ``evaluate_response``.

  Returns:
    A list of ``(label, evaluation_result)`` tuples in input order, so the
    scores can be stored or compared instead of only being printed. An
    empty input yields an empty list.

  Raises:
    KeyError: If a test case is missing one of the required keys.
  """
  print("*"*50)
  print("Evaluation Summary")
  print("*"*50)
  results = []
  for test_case in test_cases:
    label = test_case["label"]
    print("Evaluating case: ", label)
    question = test_case["question"]
    source_document = test_case["source_document"]
    llm_response = test_case["llm_response"]

    evaluation_result = evaluate_response(question, source_document, llm_response)
    results.append((label, evaluation_result))

    # print evaluation results
    print(f"Accuracy: {evaluation_result.accuracy}")
    print(f"Relevance: {evaluation_result.relevance}")
    print(f"Completeness: {evaluation_result.completeness}")
    print(f"Tone: {evaluation_result.tone}")
    print(f"Overall: {evaluation_result.overall}")
    print(f"Feedback: {evaluation_result.feedback}")

  # Previously the list was built and then discarded; hand it back to the caller.
  return results

In [71]:
if __name__ == "__main__":
  # Every case shares the same question and source document; only the label
  # and the candidate answer differ, so build the dicts from (label, answer) pairs.
  test_cases = [
      {
          "label": case_label,
          "question": QUESTION,
          "source_document": SOURCE_DOCUMENT,
          "llm_response": candidate_answer,
      }
      for case_label, candidate_answer in (
          ("Good Answer", GOOD_ANSWER),
          ("Bad Answer", BAD_ANSWER),
          ("Hallucinated Answer", HALLUCINATED_ANSWER),
      )
  ]

  run_evaluation_harness(test_cases)

**************************************************
Evaluation Summary
**************************************************
Evaluating case:  Good Answer
Accuracy: 5
Relevance: 5
Completeness: 5
Tone: 5
Overall: 5.0
Feedback: The response accurately and completely addresses the query with a professional tone.
Evaluating case:  Bad Answer
Accuracy: 3
Relevance: 3
Completeness: 2
Tone: 4
Overall: 3.0
Feedback: The response is somewhat accurate and relevant but lacks specific details and completeness.
Evaluating case:  Hallucinated Answer
Accuracy: 1
Relevance: 3
Completeness: 1
Tone: 5
Overall: 2.5
Feedback: The response includes incorrect details and omits key information from the source document.
