In [53]:
# Prompt sent to the judge model for every test item.
# It has three template placeholders -- {instruction}, {response} and
# {reference_answer} -- which evaluate_and_save() fills in. The doubled braces
# in the output-format line (item 4) are brace escapes, so they are not treated
# as template variables. The [SCORE_*] and [CORRECTNESS] markers requested in
# items 4-5 are the exact strings extract_evaluation_metrics() splits the
# judge's reply on, so the two must be kept in sync.
EVALUATION_PROMPT = """### Task Description:
An instruction (might include an input inside it), a response to evaluate, a reference answer that receives a score of 5, and a score rubric representing multiple evaluation criteria are provided.

1. Write specific and constructive feedback that assesses the response’s quality strictly based on the given score rubrics below. If the response is more detailed or lengthy, this is not a disadvantage unless it includes off-topic or irrelevant content.
2. After writing feedback, provide a score between 1 and 5 for each evaluation criterion.
3. After feedback and scores, provide an overall correctness score (Correct or Incorrect) if the response, in the context of a yes/no question, is correct.
4. Format your output as: "Feedback: {{feedback for each criterion}} [SCORE_FACTUALITY] {{score}} [SCORE_RELEVANCE] {{score}} [SCORE_COMPLETENESS] {{score}} [SCORE_CONFIDENCE] {{score}} [CORRECTNESS] {{Correct or Incorrect}}"
5. Please do not add any other opening, closing, or explanations. Include [SCORE_FACTUALITY], [SCORE_RELEVANCE], [SCORE_COMPLETENESS], [SCORE_CONFIDENCE], and [CORRECTNESS] in your output.

### The instruction to evaluate:
{instruction}

### Response to evaluate:
{response}

### Reference Answer (Score 5):
{reference_answer}

### Score Rubrics:
1. **Factuality**: Is the response correct, accurate, and factual based on the reference answer?
   - Score 1: Completely incorrect, inaccurate, and/or not factual.
   - Score 2: Mostly incorrect, inaccurate, and/or not factual.
   - Score 3: Somewhat correct, accurate, and/or factual.
   - Score 4: Mostly correct, accurate, and factual.
   - Score 5: Completely correct, accurate, and factual.

2. **Relevance**: Does the response stay focused on the instruction and provide relevant information without introducing unnecessary or off-topic content?
   - Score 1: Completely irrelevant to the instruction or question.
   - Score 2: Mostly irrelevant with some on-topic information.
   - Score 3: Somewhat relevant but includes some unnecessary information.
   - Score 4: Mostly relevant with little unnecessary information.
   - Score 5: Fully relevant and focused on the instruction.

3. **Completeness**: Does the response thoroughly cover all parts of the question or instruction without omitting important details?
   - Score 1: Completely incomplete, misses all key points.
   - Score 2: Misses most key points, partially complete.
   - Score 3: Addresses some key points but is incomplete in other aspects.
   - Score 4: Addresses most key points with minor omissions.
   - Score 5: Fully complete, addresses all key points directly.

4. **Confidence**: How confident is the response in providing accurate information based on the reference answer?
   - Score 1: Completely unsure or lacking confidence.
   - Score 2: Mostly unsure, indicates low confidence.
   - Score 3: Somewhat confident but lacks strong evidence.
   - Score 4: Mostly confident with some solid backing.
   - Score 5: Completely confident, well-supported by evidence.

### Feedback:"""

# Wrap the prompt as a chat template consisting of a single human message.
# NOTE(review): in the cells visible here, ChatPromptTemplate and
# HumanMessagePromptTemplate are only imported in a later cell, which also
# rebuilds this same template. On a fresh kernel this line needs that import
# to have run first -- confirm the intended cell order.
evaluation_prompt_template = ChatPromptTemplate.from_messages([HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT)])


In [65]:
import os
import re
import time

import google.generativeai as genai
import numpy as np
import pandas as pd
from datasets import Dataset
from langchain.prompts.chat import (ChatPromptTemplate, HumanMessagePromptTemplate)
from tqdm.auto import tqdm

# Configure API
# The key is read from a local text file (surrounding whitespace stripped)
# rather than being hard-coded in the notebook.
with open("gemini_api_key.txt") as f:
    gemini_api_key = f.read().strip()
genai.configure(api_key=gemini_api_key)

# Judge model: every evaluation prompt is sent to this Gemini model.
evaluator = genai.GenerativeModel(model_name='gemini-1.5-flash')

# Chat template with a single human message built from EVALUATION_PROMPT;
# evaluate_and_save() reads this module-level name when formatting prompts.
evaluation_prompt_template = ChatPromptTemplate.from_messages([HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT)])

# Function to load test set from an Excel file
def load_test_set(file_path: str, rag_flag: bool) -> list:
    """Read a model-responses workbook and return one record per row.

    Args:
        file_path: Path of the Excel file. It must contain the columns
            ``Question``, ``Right Answer``, ``Company``, ``Similarity Score``
            and the answer column selected by ``rag_flag``. ``Optimal Index``
            is optional.
        rag_flag: When True the ``RAG Answer`` column is used as the generated
            answer, otherwise the ``Direct Answer`` column.

    Returns:
        A plain list of dicts (not a ``datasets.Dataset``) with the keys
        ``question``, ``generated_answer``, ``true_answer``, ``company``,
        ``similarity_score`` and ``optimal_index``. ``optimal_index`` is
        ``None`` for every row when the workbook has no ``Optimal Index``
        column.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    result_df = pd.read_excel(file_path)

    # Both decisions are the same for every row, so make them once up front.
    answer_column = 'RAG Answer' if rag_flag else 'Direct Answer'
    has_optimal_index = "Optimal Index" in result_df.columns

    testset = []
    for _, row in result_df.iterrows():
        testset.append({
            "question": row['Question'],
            "generated_answer": row[answer_column],
            "true_answer": row['Right Answer'],
            "company": row['Company'],
            "similarity_score": row['Similarity Score'],
            "optimal_index": row['Optimal Index'] if has_optimal_index else None
        })
    return testset

# Function to extract metrics from evaluation response
def extract_evaluation_metrics(eval_response_text):
    """Parse the judge model's reply into feedback, four scores and a verdict.

    The reply is expected to follow the format requested in EVALUATION_PROMPT:
    free-text feedback followed by ``[SCORE_FACTUALITY]``, ``[SCORE_RELEVANCE]``,
    ``[SCORE_COMPLETENESS]``, ``[SCORE_CONFIDENCE]`` and ``[CORRECTNESS]``
    markers, each followed by its value.

    Parsing is tolerant of decoration around the values: the first integer
    after each score marker is used (so ``**4**``, ``4/5`` or ``4.`` all give
    4), and the verdict is reduced to ``"Correct"`` / ``"Incorrect"`` when one
    of those words appears after ``[CORRECTNESS]``.

    Args:
        eval_response_text: Raw text returned by the judge model.

    Returns:
        Dict with keys ``feedback`` (str), ``factuality_score``,
        ``relevance_score``, ``completeness_score``, ``confidence_score``
        (ints) and ``correctness`` (str; the raw trailing text if neither
        verdict word is found).

    Raises:
        ValueError: If a marker is missing or no integer follows a score
            marker.
    """
    text = eval_response_text
    score_fields = (
        ("factuality_score", "[SCORE_FACTUALITY]"),
        ("relevance_score", "[SCORE_RELEVANCE]"),
        ("completeness_score", "[SCORE_COMPLETENESS]"),
        ("confidence_score", "[SCORE_CONFIDENCE]"),
    )
    verdict_marker = "[CORRECTNESS]"
    ordered_markers = [marker for _, marker in score_fields] + [verdict_marker]

    # Fail with a message that names the problem instead of a bare IndexError.
    missing = [marker for marker in ordered_markers if marker not in text]
    if missing:
        raise ValueError(
            "Evaluation response is missing marker(s): " + ", ".join(missing)
        )

    # Feedback is everything before the first score marker.
    metrics = {"feedback": text.split(ordered_markers[0])[0].strip()}

    # Each score lives between its own marker and the next marker in the format.
    for (field, marker), next_marker in zip(score_fields, ordered_markers[1:]):
        segment = text.split(marker, 1)[1].split(next_marker, 1)[0]
        number = re.search(r"\d+", segment)
        if number is None:
            raise ValueError(
                f"No integer score found after {marker}: {segment.strip()!r}"
            )
        metrics[field] = int(number.group())

    # The prompt's format example is wrapped in quotes, so the verdict often
    # arrives with trailing quotes or punctuation; keep only the verdict word.
    verdict_text = text.split(verdict_marker, 1)[1].strip()
    verdict = re.search(r"\b(incorrect|correct)\b", verdict_text, re.IGNORECASE)
    metrics["correctness"] = verdict.group(1).capitalize() if verdict else verdict_text

    return metrics

def _parse_similarity_scores(raw_scores):
    """Convert a stored similarity-score value into a list of floats.

    Accepts the bracketed, comma-separated string form (e.g. ``"[0.5, 1.0]"``),
    a sequence of numbers, a single number, or a missing value (``None``/NaN),
    for which an empty list is returned.
    """
    if isinstance(raw_scores, str):
        tokens = raw_scores.strip().strip("[]").split(",")
        return [float(token) for token in tokens if token.strip()]
    if isinstance(raw_scores, (list, tuple, np.ndarray)):
        return [float(score) for score in raw_scores]
    if raw_scores is None or pd.isna(raw_scores):
        return []
    return [float(raw_scores)]


# Function to evaluate a test set and save results
def evaluate_and_save(testset, evaluator, test_name: str, save_path: str, rag_flag: bool,
                      sleep_seconds: float = 5):
    """Judge every record in ``testset`` and write the results to a CSV file.

    For each record the module-level ``evaluation_prompt_template`` is filled
    with the question, generated answer and reference answer, the prompt text
    is sent to ``evaluator``, and the reply is parsed with
    ``extract_evaluation_metrics``. The record dict is updated in place with
    the ``eval_*`` fields plus the mean and max of its similarity scores.

    Args:
        testset: Iterable of dicts with at least the keys ``question``,
            ``generated_answer``, ``true_answer`` and ``similarity_score``.
        evaluator: Object exposing ``generate_content(prompt)`` whose result
            has a ``.text`` attribute.
        test_name: Stem used for the output file name.
        save_path: Output directory; created if it does not exist.
        rag_flag: Selects the file-name suffix (``_RAG`` or ``_Direct``).
        sleep_seconds: Pause after each model call to stay under rate limits.
            Defaults to 5; pass 0 to disable.

    Returns:
        None. Writes ``{test_name}{suffix}_evaluation.csv`` into ``save_path``.
    """
    os.makedirs(save_path, exist_ok=True)
    evaluation_results = []
    for experiment in testset:
        messages = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"]
        )
        # Send the rendered prompt text. str(messages) would instead send the
        # Python repr of the message list (class names, escaped newlines).
        prompt_text = "\n".join(message.content for message in messages)
        eval_response = evaluator.generate_content(prompt_text)
        if sleep_seconds > 0:
            time.sleep(sleep_seconds)  # avoid hitting rate limits
        eval_metrics = extract_evaluation_metrics(eval_response.text)

        # Parse once; a missing or empty cell yields NaN instead of a crash.
        similarity_scores = _parse_similarity_scores(experiment["similarity_score"])
        experiment.update({
            "eval_factuality": eval_metrics["factuality_score"],
            "eval_relevance": eval_metrics["relevance_score"],
            "eval_completeness": eval_metrics["completeness_score"],
            "eval_confidence": eval_metrics["confidence_score"],
            "eval_correctness": eval_metrics["correctness"],
            "eval_feedback": eval_metrics["feedback"],
            "mean_similarity_score": np.mean(similarity_scores) if similarity_scores else np.nan,
            "max_similarity_score": np.max(similarity_scores) if similarity_scores else np.nan
        })
        evaluation_results.append(experiment)
    suffix = "_RAG" if rag_flag else "_Direct"
    evaluation_df = pd.DataFrame(evaluation_results)
    evaluation_df.to_csv(os.path.join(save_path, f"{test_name}{suffix}_evaluation.csv"), index=False)


In [64]:
# Score every workbook in the responses folder twice: once on its RAG answers
# and once on its direct answers. One CSV per (workbook, answer type) pair is
# written to the evaluation folder by evaluate_and_save().
model_responses_folder = "model_responses_test"
evaluation_folder = "evaluation_test"
for file_name in tqdm(os.listdir(model_responses_folder)):
    # Only Excel workbooks are evaluated; any other entry is skipped.
    if not file_name.endswith(".xlsx"):
        continue
    test_name = os.path.splitext(file_name)[0]
    file_path = os.path.join(model_responses_folder, file_name)
    for rag_flag in (True, False):
        testset = load_test_set(file_path, rag_flag=rag_flag)
        evaluate_and_save(testset, evaluator, test_name, evaluation_folder, rag_flag=rag_flag)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]