In [2]:
import os
import json
import re
from openai import OpenAI
import instructor
from pydantic import BaseModel, Field

# DeepSeek client built on the OpenAI SDK by overriding the base URL.
# The key is read from the DEEPSEEK_API_KEY environment variable; a missing
# variable raises KeyError when this cell runs.
deepseek_client = OpenAI(
    base_url="https://api.deepseek.com",
    api_key=os.environ["DEEPSEEK_API_KEY"],
)

# Structured result of a single assessment. process_evaluation passes this
# class as `response_model` to the instructor-wrapped client, and the final
# cell rebuilds instances from the stored 'score' / 'reasoning' values.
class Evaluation(BaseModel):
    # Integer score; `ge=0, le=10` restricts it to the inclusive range 0-10.
    score: int = Field(..., description="Evaluation score between 0-10", ge=0, le=10)
    # Required free-text justification accompanying the score.
    reasoning: str = Field(..., description="Detailed explanation of the evaluation and score justification")

# Plain OpenAI client: provide_assesment uses it for the free-text assessment.
openai_client = OpenAI()

# instructor-wrapped client: process_evaluation uses it with
# `response_model=Evaluation` to obtain a structured result.
client = instructor.from_openai(OpenAI())

In [3]:
# Load the records to be judged. Later cells read the 'question',
# 'reference_answer' and 'generated_answer' keys from every item.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open('evaluation_results.json', 'r', encoding='utf-8') as file:
    evaluation_data = json.load(file)

# for item in evaluation_data:
#     question_number = item['question_number']
#     question = item['question']
#     reference_answer = item['reference_answer']
#     generated_answer = item['generated_answer']

#     print(f"Question {question_number}:")
#     print(f"Question: {question}")
#     print(f"Reference Answer: {reference_answer}")
#     print(f"Generated Answer: {generated_answer}")
#     print("-" * 80)

In [4]:
def provide_assesment(data):
    """Ask the judge model for a free-text assessment of one RAG answer.

    Args:
        data: Material to evaluate; it is interpolated verbatim into the
            prompt. The evaluation loop passes a string containing the
            question, reference answer and generated answer.

    Returns:
        The text content of the first completion choice. If anything raises
        while building the prompt or calling the API, the exception is
        printed and the literal string
        "Error: Could not generate assesment" is returned instead.
    """
    try:
        evaluation_prompt = f"""
        You are an expert evaluator specializing in assessing RAG (Retrieval-Augmented Generation) system responses. Your role is to evaluate both the retrieval accuracy and the quality of generated answers.

        Given the following:
            {data}

        Please provide:
        1. A score between 0-10 based on the following criteria:
        - 0-3: Poor retrieval and/or incorrect information generation
        - 4-6: Partial retrieval with some relevant information but gaps or inaccuracies
        - 7-8: Good retrieval with mostly accurate information and minor inconsistencies
        - 9-10: Excellent retrieval and generation with comprehensive and correct information

        2. Detailed reasoning for the score, considering:
        - **Information Presence**: Does the response contain the key information required?
        - **Information Accuracy**: Is the retrieved information factually correct?
        - **Generation Quality**: Is the response well-structured and coherent?
        - **Hallucination Check**: Does the response introduce any incorrect or fabricated details?
        - **Context Utilization**: How effectively is the retrieved information incorporated into the response?

        Focus on whether the necessary information is included and accurate rather than requiring an exact match with a reference answer.

        Format your response as:
        Score: [number]  
        Reasoning: [your detailed RAG system evaluation]
        """

        # Single user turn; no system message is sent.
        chat_messages = [{"role": "user", "content": evaluation_prompt}]
        completion = openai_client.chat.completions.create(
            model="o1-mini",
            messages=chat_messages,
        )
        return completion.choices[0].message.content
    except Exception as error:
        # Best-effort: report the failure and hand back a sentinel string so
        # the caller's loop keeps going.
        print(error)
        return "Error: Could not generate assesment"

In [5]:
def process_evaluation(llm_response):
    """Convert a free-text assessment into a structured result.

    The assessment text is sent to the instructor-wrapped ``client`` with
    ``response_model=Evaluation``; whatever that call returns is returned
    unchanged (the evaluation loop reads ``.score`` and ``.reasoning`` from it).

    Args:
        llm_response: Assessment text, as produced by provide_assesment.

    Returns:
        The object returned by ``client.chat.completions.create``.

    Raises:
        ValueError: If the call raises ``json.JSONDecodeError``. The original
            error is attached as ``__cause__``.
        Any other exception raised by the client propagates unchanged.
    """
    try:
        SYSTEM_PROMPT = """
        Your role is to take the assesment and structure it in a format that is easy to read and understand.

        Please provide the assesment in the following format.
        """

        USER_PROMPT = f"""
        Assesment: {llm_response}
        """

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT}
            ],
            temperature = 0.2,
            response_model=Evaluation
        )
        return response
    except json.JSONDecodeError as e:
        # The exception must be bound with `as e`: the message below formats
        # it, and without the binding the handler itself failed with NameError.
        raise ValueError(f"Could not parse LLM response: {str(e)}") from e

In [None]:
# Judge every record: build the tagged input, get a free-text assessment,
# structure it, then store the score and reasoning back on the record.
for item in evaluation_data:
    eval_string = f"""
    <question>{item['question']}</question>
    <reference_answer>{item['reference_answer']}</reference_answer>
    <generated_answer>{item['generated_answer']}</generated_answer>
    """
    assesment = provide_assesment(eval_string)
    evaluation = process_evaluation(assesment)

    score, reasoning = evaluation.score, evaluation.reasoning
    print(score)
    # Keyword order keeps 'reasoning' ahead of 'score' in the record.
    item.update(reasoning=reasoning, score=score)

In [7]:
# Persist the records (including any 'reasoning' / 'score' keys added by the
# evaluation loop) as JSON indented by four spaces.
with open('evaluation.json', mode='w') as output_file:
    json.dump(obj=evaluation_data, fp=output_file, indent=4)

In [None]:
def calculate_average_score(evaluations: list[Evaluation]) -> float:
    """Return the mean ``score`` of the given evaluations, rounded to 2 places.

    Args:
        evaluations: Objects exposing a numeric ``score`` attribute.

    Returns:
        The rounded mean, or 0.0 when ``evaluations`` is empty or otherwise falsy.
    """
    if evaluations:
        score_sum = 0
        for entry in evaluations:
            score_sum += entry.score
        return round(score_sum / len(evaluations), 2)
    return 0.0

# Rebuild Evaluation objects from the 'score' / 'reasoning' stored on each
# record, then report their mean.
results = [
    Evaluation(score=item['score'], reasoning=item['reasoning'])
    for item in evaluation_data
]

average_score = calculate_average_score(results)
print(f"Average RAG System Score: {average_score}/10")