In [7]:
import google.generativeai as genai
import pandas as pd
import time
from datasets import Dataset
from tqdm.auto import tqdm
from langchain.prompts.chat import (ChatPromptTemplate, HumanMessagePromptTemplate)

# Read the Gemini API key from a local text file so it is never hard-coded
# in the notebook; surrounding whitespace/newlines are stripped.
with open("gemini_api_key.txt") as key_file:
    gemini_api_key = key_file.read().strip()

# Register the key with the SDK, then build the model used as the judge
# for all evaluations below.
genai.configure(api_key=gemini_api_key)
evaluator = genai.GenerativeModel(model_name='gemini-1.5-flash')

In [13]:
# Prompt given to the judge model for every evaluated answer.
# - {instruction}, {response} and {reference_answer} are the template variables
#   filled in by evaluate() via format_messages().
# - Doubled braces ({{...}}) are escaped so they are not treated as template
#   variables (presumably rendered as literal single braces - confirm against
#   the LangChain template format in use).
# - The bracketed markers ([SCORE_FACTUALITY] ... [CORRECTNESS]) are the exact
#   strings extract_evaluation_metrics() splits on; changing them here requires
#   changing the parser as well.
EVALUATION_PROMPT = """### Task Description:
An instruction (might include an input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing multiple evaluation criteria are given.
1. Write detailed feedback that assesses the quality of the response strictly based on the given score rubrics below. Additional information is not a disadvantage unless it negatively impacts clarity or relevance.
2. After writing feedback, provide a score that is an integer between 1 and 5 for each evaluation criterion.
3. After writing the scores, provide an overall correctness score (Correct or Incorrect) if the response, in the context of a yes/no question, is correct.
4. The output format should look as follows: "Feedback: {{write feedback for each criterion}} [SCORE_FACTUALITY] {{score for factuality}} [SCORE_RELEVANCE] {{score for relevance}} [SCORE_COMPLETENESS] {{score for completeness}} [SCORE_CLARITY] {{score for clarity}} [SCORE_CONFIDENCE] {{score for confidence}} [CORRECTNESS] {{Correct or Incorrect}}"
5. Please do not generate any other opening, closing, or explanations. Be sure to include [SCORE_FACTUALITY], [SCORE_RELEVANCE], [SCORE_COMPLETENESS], [SCORE_CLARITY], [SCORE_CONFIDENCE], and [CORRECTNESS] in your output.

### The instruction to evaluate:
{instruction}

### Response to evaluate:
{response}

### Reference Answer (Score 5):
{reference_answer}

### Score Rubrics:
1. **Factuality**: Is the response correct, accurate, and factual based on the reference answer?
   - Score 1: Completely incorrect, inaccurate, and/or not factual.
   - Score 2: Mostly incorrect, inaccurate, and/or not factual.
   - Score 3: Somewhat correct, accurate, and/or factual.
   - Score 4: Mostly correct, accurate, and factual.
   - Score 5: Completely correct, accurate, and factual.

2. **Relevance**: Does the response stay focused on the instruction and provide relevant information without introducing unnecessary or off-topic content?
   - Score 1: Completely irrelevant to the instruction or question.
   - Score 2: Mostly irrelevant with some on-topic information.
   - Score 3: Somewhat relevant but introduces some unnecessary information.
   - Score 4: Mostly relevant with little unnecessary information.
   - Score 5: Fully relevant and focused on the instruction.

3. **Completeness**: Does the response thoroughly cover all parts of the question or instruction without omitting important details, and without merely directing the reader to consult the T&C document?
   - Score 1: Completely incomplete, misses all key points.
   - Score 2: Misses most key points, partially complete.
   - Score 3: Addresses some key points but is incomplete in other aspects.
   - Score 4: Addresses most key points with minor omissions.
   - Score 5: Fully complete, addresses all key points directly without deferring to the T&C document.

4. **Clarity**: Is the response clear and easy to understand without being overly complex or ambiguous?
   - Score 1: Completely unclear, hard to understand.
   - Score 2: Mostly unclear or difficult to follow.
   - Score 3: Somewhat clear but may include ambiguous or confusing parts.
   - Score 4: Mostly clear with minor clarity issues.
   - Score 5: Completely clear and easy to understand.

5. **Confidence**: How confident is the response in providing accurate information based on the reference answer?
   - Score 1: Completely unsure or lacking confidence.
   - Score 2: Mostly unsure, indicates low confidence.
   - Score 3: Somewhat confident but lacks strong evidence.
   - Score 4: Mostly confident with some solid backing.
   - Score 5: Completely confident, well-supported by evidence.

### Feedback:"""

# Wrap the prompt as a single human message so it can be formatted per test row.
evaluation_prompt_template = ChatPromptTemplate.from_messages([HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT)])

In [12]:
def load_test_set(file_path: str, rag_flag: bool) -> Dataset:
    """
    Load responses from an Excel file and transform them into a Dataset object.

    Only the columns actually needed for the evaluation are required in the
    sheet: 'Question', 'Right Answer', and the answer column selected by
    `rag_flag`. The selected rows are displayed in the notebook before the
    Dataset is returned.

    Parameters:
    - file_path (str): The path to the Excel file containing the responses.
    - rag_flag (bool): If truthy, evaluate the 'RAG Answer' column; otherwise
      evaluate the 'Direct Answer' column.

    Returns:
    - Dataset: One record per spreadsheet row with the keys "question",
      "generated_answer" and "true_answer".

    Raises:
    - KeyError: If a required column is missing from the spreadsheet.
    """
    answer_column = 'RAG Answer' if rag_flag else 'Direct Answer'
    column_mapping = {
        'Question': "question",
        answer_column: "generated_answer",
        'Right Answer': "true_answer",
    }

    # Load the Excel file
    result_df = pd.read_excel(file_path)

    # Fail early with a message that names the missing columns
    missing_columns = [name for name in column_mapping if name not in result_df.columns]
    if missing_columns:
        raise KeyError(f"{file_path} is missing required column(s): {missing_columns}")

    # Select and rename in one vectorised step instead of looping over rows
    testset_df = (
        result_df[list(column_mapping)]
        .rename(columns=column_mapping)
        .reset_index(drop=True)
    )
    display(testset_df)

    # Create a Dataset object from the per-row records
    return Dataset.from_list(testset_df.to_dict("records"))

In [19]:
def extract_evaluation_metrics(eval_response_text):
    """
    Extract feedback, scores, and correctness from evaluation response text.

    The text is expected to contain the markers [SCORE_FACTUALITY],
    [SCORE_RELEVANCE], [SCORE_COMPLETENESS], [SCORE_CLARITY],
    [SCORE_CONFIDENCE] and [CORRECTNESS]. Each value is read from the text
    between its marker and the following one. Scores tolerate light decoration
    around the number (e.g. "**4**", "5/5", "2."), and the correctness label is
    normalised to exactly "Correct" or "Incorrect" when the section starts with
    one of those words (ignoring case and leading punctuation).

    Parameters:
    eval_response_text (str): The evaluation response text to parse.

    Returns:
    dict: A dictionary containing the feedback, scores, and correctness.

    Raises:
    ValueError: If a marker is missing or a score section contains no integer.
    """
    import re  # local import: keeps this cell runnable without touching the import cell

    text = eval_response_text
    score_markers = [
        ("factuality_score", "[SCORE_FACTUALITY]"),
        ("relevance_score", "[SCORE_RELEVANCE]"),
        ("completeness_score", "[SCORE_COMPLETENESS]"),
        ("clarity_score", "[SCORE_CLARITY]"),
        ("confidence_score", "[SCORE_CONFIDENCE]"),
    ]
    correctness_marker = "[CORRECTNESS]"

    def section(marker, next_marker):
        # Text that follows `marker`, cut at `next_marker` when one is given.
        parts = text.split(marker)
        if len(parts) < 2:
            raise ValueError(f"Marker {marker} not found in evaluation response")
        body = parts[1]
        if next_marker is not None:
            body = body.split(next_marker)[0]
        return body.strip()

    def parse_score(marker, raw):
        # A clean integer is the expected case; otherwise take the first number
        # so decorated values such as "**4**" or "5/5" still parse.
        try:
            return int(raw)
        except ValueError:
            found = re.search(r"\d+", raw)
            if found is None:
                raise ValueError(f"No integer score found after {marker}: {raw!r}")
            return int(found.group())

    # Validates that the first marker exists before the feedback is taken.
    section(score_markers[0][1], None)
    # Extract feedback (before the first score marker)
    metrics = {"feedback": text.split(score_markers[0][1])[0].strip()}

    # Each score section ends where the next marker begins.
    following_markers = [marker for _, marker in score_markers[1:]] + [correctness_marker]
    for (key, marker), next_marker in zip(score_markers, following_markers):
        metrics[key] = parse_score(marker, section(marker, next_marker))

    raw_correctness = section(correctness_marker, None)
    label = re.match(r"[^A-Za-z]*(incorrect|correct)\b", raw_correctness, re.IGNORECASE)
    # Fall back to the raw text when no recognisable label leads the section.
    metrics["correctness"] = label.group(1).capitalize() if label else raw_correctness

    return metrics

def evaluate(testset, evaluator, batch_size=5, pause_seconds=30):
    """
    Score every row of the test set with the evaluator model.

    For each row the evaluation prompt is filled in, sent to the evaluator, and
    the reply is parsed with extract_evaluation_metrics(). Rows whose request or
    parsing fails are reported with a printed message and left out of the result.

    Parameters:
    testset (iterable of dict): Rows with the keys "question",
        "generated_answer" and "true_answer".
    evaluator: Object exposing generate_content(prompt) whose result has a
        `.text` attribute.
    batch_size (int): Number of rows processed between pauses. A falsy value
        disables pausing.
    pause_seconds (int | float): Length of the pause between batches
        (presumably there to respect API rate limits).

    Returns:
    list of dict: A copy of each successfully evaluated row, extended with the
        keys eval_factuality, eval_relevance, eval_completeness, eval_clarity,
        eval_confidence, eval_correctness and eval_feedback.
    """
    evaluation = []
    for index, experiment in enumerate(tqdm(testset)):
        # Pause between batches only, so no time is wasted after the final row.
        if batch_size and index > 0 and index % batch_size == 0:
            time.sleep(pause_seconds)

        # Create the evaluation prompt with the new metrics
        messages = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        # Send the message text itself; str() on the message list would send
        # its Python repr (class wrapper, escaped newlines) to the model.
        prompt_text = "\n".join(message.content for message in messages)

        try:
            # The request is inside the try so one failed or blocked call does
            # not discard the results gathered so far.
            eval_response = evaluator.generate_content(prompt_text)
            eval_metrics = extract_evaluation_metrics(eval_response.text)
        except Exception as e:
            # Report the failure and skip this row; no default scores are recorded.
            print(f"Error processing evaluation: {e}")
            continue

        # Work on a copy so the caller's rows are never modified.
        record = dict(experiment)
        record["eval_factuality"] = eval_metrics["factuality_score"]
        record["eval_relevance"] = eval_metrics["relevance_score"]
        record["eval_completeness"] = eval_metrics["completeness_score"]
        record["eval_clarity"] = eval_metrics["clarity_score"]
        record["eval_confidence"] = eval_metrics["confidence_score"]
        record["eval_correctness"] = eval_metrics["correctness"]
        record["eval_feedback"] = eval_metrics["feedback"]
        evaluation.append(record)
    return evaluation


In [20]:
# Evaluate the direct (non-RAG) answers from the spreadsheet and collect the
# per-row scores in a DataFrame; the bare name on the last line renders it.
file_path = 'responses.xlsx'
testset = load_test_set(file_path=file_path, rag_flag=False)
evaluation = pd.DataFrame(data=evaluate(testset=testset, evaluator=evaluator))
evaluation

Unnamed: 0,question,generated_answer,true_answer
0,What is the policy on retaining driver data a...,"I do not have access to real-time information,...","After a driver's account is closed, Bolt Head..."
1,How long does Shopify keep store information ...,"Unfortunately, I do not have access to private...",Shopify retains store information for two year...
2,What is Amazon's policy regarding the storage...,"I do not have access to real-time information,...","Amazon states that they will not retain, use, ..."
3,What is the policy on returns in Uber?\n,"I do not have access to real-time information,...",The Uber Terms of Service do not explicitly m...
4,What company am I contracting with when I use...,If you are a resident of the UK and use TikTok...,"If you are resident in the United Kingdom, you..."
5,What happens to my Google One membership afte...,"I do not have access to real-time information,...",After the promotional period (Offer Period) en...
6,What is the maximum time it can take for Inst...,"Unfortunately, Instagram doesn't explicitly st...","According to Instagram's policies, it can take..."
7,What is the policy on the number of product l...,"Unfortunately, I do not have access to real-ti...",AliExpress reserves the right to place restri...
8,What are the governing terms and conditions f...,While I can provide general information about ...,If you purchase Azure services through a Micr...
9,What kind of data does Tradera collect automa...,"I do not have access to real-time information,...",Tradera automatically collects information sen...


  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,question,generated_answer,true_answer,eval_factuality,eval_relevance,eval_completeness,eval_clarity,eval_confidence,eval_correctness,eval_feedback
0,What is the policy on retaining driver data a...,"I do not have access to real-time information,...","After a driver's account is closed, Bolt Head...",2,4,2,5,3,Incorrect,Feedback: The response correctly indicates tha...
1,How long does Shopify keep store information ...,"Unfortunately, I do not have access to private...",Shopify retains store information for two year...,2,3,1,5,2,Incorrect,Feedback: The response is accurate in stating ...
2,What is Amazon's policy regarding the storage...,"I do not have access to real-time information,...","Amazon states that they will not retain, use, ...",3,3,2,4,2,Incorrect,Feedback: The response correctly points out it...
3,What is the policy on returns in Uber?\n,"I do not have access to real-time information,...",The Uber Terms of Service do not explicitly m...,3,4,3,4,3,Incorrect,Feedback: The response provides useful informa...
4,What company am I contracting with when I use...,If you are a resident of the UK and use TikTok...,"If you are resident in the United Kingdom, you...",4,4,3,5,3,Incorrect,Feedback: The response provides the correct co...
5,What happens to my Google One membership afte...,"I do not have access to real-time information,...",After the promotional period (Offer Period) en...,4,4,3,4,3,Incorrect,Feedback: The response is factually accurate i...
6,What is the maximum time it can take for Inst...,"Unfortunately, Instagram doesn't explicitly st...","According to Instagram's policies, it can take...",4,4,3,5,3,Incorrect,Feedback: The response accurately states that ...
7,What is the policy on the number of product l...,"Unfortunately, I do not have access to real-ti...",AliExpress reserves the right to place restri...,3,4,3,5,3,Incorrect,Feedback: The response accurately states that ...
8,What are the governing terms and conditions f...,While I can provide general information about ...,If you purchase Azure services through a Micr...,3,3,3,4,2,Incorrect,Feedback: The response is helpful in providing...
9,What kind of data does Tradera collect automa...,"I do not have access to real-time information,...",Tradera automatically collects information sen...,3,4,2,5,2,Incorrect,Feedback: The response is accurate in acknowle...


In [10]:
# Persist the evaluation table as "<test_name>_evaluation.csv", without the
# DataFrame index column.
test_name = 'direct_spacy_gemini_basic'
evaluation.to_csv(path_or_buf=f"{test_name}_evaluation.csv", index=False)

In [150]:
# compute accuracy of the evaluator's Correct/Incorrect verdicts
def compute_accuracy(evaluation):
    """
    Return the fraction of rows whose "eval_correctness" is "Correct".

    The label is compared after trimming whitespace and ignoring case, so
    values such as " correct " still count as correct.

    Parameters:
    evaluation (pd.DataFrame): Frame with an "eval_correctness" column.

    Returns:
    float: Number of correct rows divided by the total number of rows, or NaN
        when the frame has no rows (accuracy is undefined without data).
    """
    total = len(evaluation)
    if total == 0:
        # Avoid ZeroDivisionError when every row failed to evaluate.
        return float("nan")
    labels = evaluation["eval_correctness"].astype(str).str.strip().str.casefold()
    correct_count = int((labels == "correct").sum())
    return correct_count / total

In [170]:
print(f"Accuracy: {compute_accuracy(evaluation)}")

# Average each 1-5 rubric score over all evaluated rows.
score_columns = {
    "Factuality": "eval_factuality",
    "Relevance": "eval_relevance",
    "Completeness": "eval_completeness",
    "Clarity": "eval_clarity",
    "Confidence": "eval_confidence",
}
average_scores = evaluation[list(score_columns.values())].mean()
for label, column in score_columns.items():
    print(f"Average {label} Score: {average_scores[column]}/5")

# The recorded output of this cell ends with an overall score that the code no
# longer printed; it is the unweighted mean of the five averages above.
print(f"Overall Average Score: {average_scores.mean()}/5")


Accuracy: 0.25
Average Factuality Score: 3.25/5
Average Relevance Score: 4.25/5
Average Completeness Score: 3.5/5
Average Clarity Score: 4.5/5
Average Confidence Score: 3.25/5
Overall Average Score: 3.75/5
