# RAGAS Evaluation

In [1]:
# Imports
import os
import json
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision


# RAG
from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import AzureOpenAIEmbeddings

# Generation of responses
import openai

# Store score
import openpyxl

#!pip install openpyxl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv(override=True)


True

In [None]:
# data_sample = {
#     'question': [
#         'How have you been Roydon?'
#     ],
#     'answer': [
#         "Response 1: I have been good, how about you? Response 2: I've been doing well thanks for asking. Response 3: Not too bad how about you?"
#     ],
#     'contexts': [
#         ["""{'Roydon": "Hey there! Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's always exciting to see how your team will perform. Optimistic as always, I see!},
#          {"Roydon": "I can't wait to immerse myself in everything Japan has to offer and create lasting memories that will overshadow my Thailand trip.", "Yas": "Your positive outlook will surely make this trip one for the books! Japan is lucky to have you as a visitor."},
#          {"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That's awesome! What breed is it?"}"""]
#     ],
#     'ground_truth': [
#         "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?"
#     ]
# }

In [3]:
# Without history of replies
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?'
        #------------ Dual questions
        'How have you been Roydon and what have you been up to?',
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures."
        "Response 1: I've been great! Been watching Arsenal games hoping they will win. Response 2: I've been so bored and am looking for a trip to Japan. Response 3: I've been good. Feeling excited as I just got a new pet dog. How about you?",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}


In [4]:
# Environment variables
embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

loaded_faiss_vs_v3 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\faiss_vs_v3", embeddings=embeddings, allow_dangerous_deserialization=True)



## Generate for non-rag

In [5]:
# Generate json for non rag
for query in data_sample['question']:

    data_sample['contexts'].append([''])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
    You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
    The topic should be interpreted from the conversation.
    If no topic could be interpreted, provide default responses that a person would start with such as greetings. 
    The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.
    In the case the responses are not chosen, the mute person could type their own response. Do take note of this response and continue the conversation from the response selected or typed out by the mute person.
    Ensure the responses generated will allow the conversation to flow smoothly.

    It must be in english. 

    An example of the 3 generated response would be in the format of 1 single string "Response 1: what you generated Response 2: what you generated Response 3: what you generated" all in one line.
    """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [6]:
# Specify the file path
file_path = 'testing_json/data_sample_non_rag_test_no_history_v2.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Generate for RAG

In [13]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?'
        #------------ Dual questions
        'How have you been Roydon and what have you been up to?',
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures."
        "Response 1: I've been great! Been watching Arsenal games hoping they will win. Response 2: I've been so bored and am looking for a trip to Japan. Response 3: I've been good. Feeling excited as I just got a new pet dog. How about you?",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}


In [14]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    context = loaded_faiss_vs_v3.similarity_search(query, k=3)
    contexts = ""
    for con in context:
        contexts += con.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
    You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
    The topic should be interpreted from the conversation.
    If no topic could be interpreted, use the context provided below under the section context. 
    The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.
    In the case the responses are not chosen, the mute person could type their own response. Do take note of this response and continue the conversation from the response selected or typed out by the mute person.
    Ensure the responses generated will allow the conversation to flow smoothly.

    It must be in english. 

    Context section:
    Use the following previous conversations to assist in generating the 3 responses:\n
    {contexts}

    An example of the 3 generated response would be in the format of 1 single string "Response 1: what you generated Response 2: what you generated Response 3: what you generated" all in one line.
    """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [15]:
# Specify the file path
file_path = 'testing_json/data_sample_rag_test_no_history_v2.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Generate for RAG Few Shot Prompting

In [16]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How have you been Roydon and what have you been up to?',
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: I've been great! Been watching Arsenal games hoping they will win. Response 2: I've been so bored and am looking for a trip to Japan. Response 3: I've been good. Feeling excited as I just got a new pet dog. How about you?",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [17]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    context = loaded_faiss_vs_v3.similarity_search(query, k=3)
    contexts = ""
    for con in context:
        contexts += con.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [18]:
# Specify the file path
file_path = 'testing_json/data_sample_rag_test_prompt_engineered_v2.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Evaluation scores

In [19]:
file_path_non_rag = 'testing_json/data_sample_non_rag_test_no_history_v2.json'
file_path_rag = 'testing_json/data_sample_rag_test_no_history_v2.json'
file_path_rag_prompt_engineered = 'testing_json/data_sample_rag_test_prompt_engineered_v2.json'

with open(file_path_non_rag, 'r') as json_file:
    non_rag_data = json.load(json_file)

with open(file_path_rag, 'r') as json_file:
    rag_data = json.load(json_file)


with open(file_path_rag_prompt_engineered, 'r') as json_file:
    rag_data_prompt_engineered = json.load(json_file)

non_rag_dataset = Dataset.from_dict(non_rag_data)
rag_dataset = Dataset.from_dict(rag_data)
rag_dataset_prompt_engineered = Dataset.from_dict(rag_data_prompt_engineered)

In [20]:
non_rag_score = evaluate(non_rag_dataset, metrics=[answer_relevancy, answer_correctness, context_precision, context_recall])
rag_score = evaluate(rag_dataset, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])
rag_score_prompt_engineered = evaluate(rag_dataset_prompt_engineered, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])

non_rag_df = non_rag_score.to_pandas()
rag_df = rag_score.to_pandas()
rag_df_prompt_engineered = rag_score_prompt_engineered.to_pandas()

Evaluating: 100%|██████████| 40/40 [00:22<00:00,  1.78it/s]
Evaluating: 100%|██████████| 40/40 [00:27<00:00,  1.43it/s]
Evaluating: 100%|██████████| 40/40 [00:22<00:00,  1.75it/s]


### V1

In [6]:
non_rag_df.head()

Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall
0,What have you been up to Roydon?,"Response 1: Not much, just relaxing at home. \...",[],Response 1: I've been watching Arsenal games h...,0.0,0.210712,0.0,0.0
1,Woah really how is Arsenal doing right now then?,Response 1: They are doing well this season Re...,[],"Response 1: Arsenal is doing well, did you cat...",0.0,0.416784,0.0,0.0
2,Nice what breed is your new pet dog?,Response 1: He's a golden retriever Response 2...,[],"Response 1: He is a golden retriever, and he's...",0.0,0.233798,0.0,0.0
3,So what you planning to do with your pet dog?,Response 1: Take him for a walk in the park ...,[],Response 1: I'm planning to take him on walks ...,0.860532,0.231854,0.0,0.0


In [7]:
rag_df.head()

Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall
0,What have you been up to Roydon?,"Response 1: ""I've been keeping busy with work ...","[{""Roydon"": ""Hey there! Did you catch the Arse...",Response 1: I've been watching Arsenal games h...,0.0,0.208525,1.0,0.666667
1,Woah really how is Arsenal doing right now then?,Response 1: They are currently showing great p...,"[{""Roydon"": ""I couldn't agree more! Aubameyang...","Response 1: Arsenal is doing well, did you cat...",0.0,0.909379,1.0,0.333333
2,Nice what breed is your new pet dog?,"Response 1: He's a golden retriever, and he's ...","[{""Roydon"": ""Guess what, I just got a new pet ...","Response 1: He is a golden retriever, and he's...",0.941946,0.74119,1.0,1.0
3,So what you planning to do with your pet dog?,Response 1: I want to teach him some tricks li...,"[{""Roydon"": ""Guess what, I just got a new pet ...",Response 1: I'm planning to take him on walks ...,0.915949,0.684462,1.0,0.333333


In [8]:
import pandas as pd

# Calculate average for non_rag_df
non_rag_avg_answer_relevancy = non_rag_df['answer_relevancy'].mean(skipna=True)
non_rag_avg_answer_correctness = non_rag_df['answer_correctness'].mean(skipna=True)
non_rag_avg_precision = non_rag_df['context_precision'].mean(skipna=True)
non_rag_avg_recall = non_rag_df['context_recall'].mean(skipna=True)

# Calculate average for rag_df
rag_avg_answer_relevancy = rag_df['answer_relevancy'].mean(skipna=True)
rag_avg_answer_correctness = rag_df['answer_correctness'].mean(skipna=True)
rag_avg_precision = rag_df['context_precision'].mean(skipna=True)
rag_avg_recall = rag_df['context_recall'].mean(skipna=True)


# Print the averages
print("Non-RAG Average Answer Relevancy:", non_rag_avg_answer_relevancy)
print("Non-RAG Average Answer Correctness:", non_rag_avg_answer_correctness)
print("Non-RAG Average Context Precision:", non_rag_avg_precision)
print("Non-RAG Average Context Recall:", non_rag_avg_recall)
print("RAG Average Answer Relevancy:", rag_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_avg_answer_correctness)
print("RAG Average Context Precision:", rag_avg_precision)
print("RAG Average Context Recall:", rag_avg_recall)

Non-RAG Average Answer Relevancy: 0.21513308049068516
Non-RAG Average Answer Correctness: 0.27328692887430783
Non-RAG Average Context Precision: 0.0
Non-RAG Average Context Recall: 0.0
RAG Average Answer Relevancy: 0.4644737698602591
RAG Average Answer Correctness: 0.6358889285294633
RAG Average Context Precision: 0.9999999999
RAG Average Context Recall: 0.5833333333333334


In [39]:
non_rag_df.head()

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,answer_correctness
0,How have you been Roydon?,"Response 1: I've been good, thank you for aski...",[],Response 1: I've been watching Arsenal games h...,,0.0,0.204168
1,Woah really how is Arsenal doing right now then?,Response 1: They are currently in a good posit...,[],"Response 1: Arsenal is doing well, did you cat...",0.0,0.0,0.478189
2,Nice what breed is your new pet dog?,Response 1: He's a golden retriever\nResponse ...,[],"Response 1: He is a golden retriever, and he's...",,0.832688,0.233893
3,So what you planning to do with your pet dog?,Response 1: Take him for a walk in the park Re...,[],Response 1: I'm planning to take him on walks ...,,0.860532,0.231712


In [9]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/non_rag_scores.xlsx'

# Store the DataFrame into an Excel file
non_rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/rag_scores.xlsx'

# Store the DataFrame into an Excel file
rag_df.to_excel(excel_file_path)

## v2

In [24]:
import pandas as pd

# Calculate average for non_rag_df
non_rag_avg_answer_relevancy = non_rag_df['answer_relevancy'].mean(skipna=True)
non_rag_avg_answer_correctness = non_rag_df['answer_correctness'].mean(skipna=True)
non_rag_avg_precision = non_rag_df['context_precision'].mean(skipna=True)
non_rag_avg_recall = non_rag_df['context_recall'].mean(skipna=True)

# Calculate average for rag_df
rag_avg_answer_relevancy = rag_df['answer_relevancy'].mean(skipna=True)
rag_avg_answer_correctness = rag_df['answer_correctness'].mean(skipna=True)
rag_avg_precision = rag_df['context_precision'].mean(skipna=True)
rag_avg_recall = rag_df['context_recall'].mean(skipna=True)

# Calculate average for rag_df
rag_prompt_engineered_avg_answer_relevancy = rag_df_prompt_engineered['answer_relevancy'].mean(skipna=True)
rag_prompt_engineered_avg_answer_correctness = rag_df_prompt_engineered['answer_correctness'].mean(skipna=True)
rag_prompt_engineered_avg_precision = rag_df_prompt_engineered['context_precision'].mean(skipna=True)
rag_prompt_engineered_avg_recall = rag_df_prompt_engineered['context_recall'].mean(skipna=True)


# Print the averages
print("Non-RAG Average Answer Relevancy:", non_rag_avg_answer_relevancy)
print("Non-RAG Average Answer Correctness:", non_rag_avg_answer_correctness)
print("Non-RAG Average Context Precision:", non_rag_avg_precision)
print("Non-RAG Average Context Recall:", non_rag_avg_recall)
print("RAG Average Answer Relevancy:", rag_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_avg_answer_correctness)
print("RAG Average Context Precision:", rag_avg_precision)
print("RAG Average Context Recall:", rag_avg_recall)
print("RAG Prompt Engineered Average Answer Relevancy:", rag_prompt_engineered_avg_answer_relevancy)
print("RAG Prompt Engineered Average Answer Correctness:", rag_prompt_engineered_avg_answer_correctness)
print("RAG Prompt Engineered Average Context Precision:", rag_prompt_engineered_avg_precision)
print("RAG Prompt Engineered verage Context Recall:", rag_prompt_engineered_avg_recall)

Non-RAG Average Answer Relevancy: 0.09371908683966357
Non-RAG Average Answer Correctness: 0.28901515842812675
Non-RAG Average Context Precision: 0.0
Non-RAG Average Context Recall: 0.0
RAG Average Answer Relevancy: 0.4601393128812699
RAG Average Answer Correctness: 0.41325875752084756
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.5
RAG Prompt Engineered Average Answer Relevancy: 0.45274102089224694
RAG Prompt Engineered Average Answer Correctness: 0.6452190922203027
RAG Prompt Engineered Average Context Precision: 0.89999999991
RAG Prompt Engineered verage Context Recall: 0.5166666666666666


In [25]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/non_rag_scores_v2.xlsx'

# Store the DataFrame into an Excel file
non_rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/rag_scores_v2.xlsx'

# Store the DataFrame into an Excel file
rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/rag_prompt_engineered_scores_v2.xlsx'

# Store the DataFrame into an Excel file
rag_df_prompt_engineered.to_excel(excel_file_path)

# BLUERT Score

In [2]:
!git clone https://github.com/google-research/bleurt.git
!pip install ./bleurt

Cloning into 'bleurt'...


In [1]:
from bleurt import score
import json

In [2]:
# Load non rag data from json
with open('testing_json/data_sample_non_rag_test_no_history_v2.json', 'r') as json_file:
    non_rag_data = json.load(json_file)

# Load rag data from json
with open('testing_json/data_sample_rag_test_no_history_v2.json', 'r') as json_file:
    rag_data = json.load(json_file)

# Load prompt engineered rag data from json
with open('testing_json/data_sample_rag_test_prompt_engineered_v2.json', 'r') as json_file:
    rag_prompt_engineered_data = json.load(json_file)

### v1

In [15]:
# Scores for non-rag
checkpoint = "bleurt/bleurt/test_checkpoint"
references = non_rag_data['ground_truth'] 
candidates = non_rag_data['answer']

scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
assert isinstance(scores, list) and len(scores) == 4
print(scores)

INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


[-0.7722375392913818, -0.5295215845108032, -0.6895624995231628, -0.43300843238830566]


In [16]:
checkpoint = "bleurt/bleurt/test_checkpoint"
references = rag_data['ground_truth'] 
candidates = rag_data['answer']

scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
assert isinstance(scores, list) and len(scores) == 4
print(scores)

INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


[-0.6121180057525635, -0.20042423903942108, 0.3049095869064331, -0.33035808801651]


In [22]:
print(references[2])

Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's so playful! Response 3: He is a golden retriever, and he's so fluffy!


In [21]:
print(candidates[2])

Response 1: He's a golden retriever, and he's the cutest thing ever!
Response 2: My new dog is a golden retriever, I'm so happy to have him!
Response 3: I have a golden retriever, he's adorable and friendly.


In [20]:
print(rag_data['contexts'][2])

['{"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That\'s awesome! What breed is it?"}{"Roydon": "It\'s a golden retriever, and he\'s the cutest thing ever!", "Jacob": "Golden retrievers are so friendly and loyal, you\'re going to have so much fun with him!"}{"Roydon": "I couldn\'t agree more, I feel like my new dog has completed my little family.", "Jacob": "It\'s amazing how pets have a way of making a house feel like a home, enjoy every moment with your furry friend!"}']


In [23]:
print(rag_data['question'][2])

Nice what breed is your new pet dog?


### v2

In [4]:
# Scores for non-rag
checkpoint = "bleurt/bleurt/test_checkpoint"
references = non_rag_data['ground_truth'] 
candidates = non_rag_data['answer']

scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
#assert isinstance(scores, list) and len(scores) ==11
print(scores)

INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


[-0.5593798160552979, -0.5643997192382812, -0.8871408700942993, -0.7929092049598694, -0.8116310834884644, -1.0318307876586914, -0.49550342559814453, -0.9399088621139526, -0.6629363298416138, -0.9948854446411133]


In [5]:
checkpoint = "bleurt/bleurt/test_checkpoint"
references = rag_data['ground_truth'] 
candidates = rag_data['answer']

scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
#assert isinstance(scores, list) and len(scores) == 4
print(scores)

INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


[-0.7020672559738159, -0.5123528242111206, -0.8473215699195862, -0.09329019486904144, -0.6893448829650879, -0.5007555484771729, -0.8085434436798096, -0.7549819350242615, -0.5519764423370361, -0.540041983127594]


In [6]:
checkpoint = "bleurt/bleurt/test_checkpoint"
references = rag_prompt_engineered_data['ground_truth'] 
candidates = rag_prompt_engineered_data['answer']

scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
#assert isinstance(scores, list) and len(scores) == 4
print(scores)

INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


[-0.57237309217453, -0.31957846879959106, -0.30916428565979004, -0.051856979727745056, -0.7853953838348389, -0.6510942578315735, -0.9581493139266968, -0.6597595810890198, -0.6500493288040161, -0.38615113496780396]


# G-Eval

In [2]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv
import json
load_dotenv(override=True)

# Dataframes
import pandas as pd
import openpyxl



In [29]:
correctness_metric = GEval(
    name="Relevance",
    #criteria="Determine whether the actual output matches the expected output as close as possible.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the main content of the responses generated in 'actual output' are similar to the responses in the 'expected output'",
        """As long as one of the main content of the responses generated is similar to any of the expected output, the test case is considered correct.
        For example, if response 1 content is on a pet dog and it matches response 3 content of also a pet dog, give it a high score. 
        The order of the responses is not important.""",
        "Evaluate mainly based on main content but do still give a higher score depending on similarity of responses."
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
)

In [4]:
# Load non rag data from json
with open('testing_json/data_sample_non_rag_test_no_history_v2.json', 'r') as json_file:
    non_rag_data = json.load(json_file)

# Load rag data from json
with open('testing_json/data_sample_rag_test_no_history_v2.json', 'r') as json_file:
    rag_data = json.load(json_file)

# Load prompt engineered rag data from json
with open('testing_json/data_sample_rag_test_prompt_engineered_v2.json', 'r') as json_file:
    rag_prompt_engineered_data = json.load(json_file)

### V1

In [14]:
# Non-rag scores
non_rag_scores = []
non_rag_reasons = []


for i in range(len(non_rag_data['question'])):
    test_case = LLMTestCase(
        input=non_rag_data['question'][i],
        actual_output=non_rag_data['answer'][i],
        expected_output=non_rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    non_rag_scores.append(correctness_metric.score)
    non_rag_reasons.append(correctness_metric.reason)

0.6384615545750127
One of the responses is similar to the expected output, mentioning activities and interactions with people.


0.730660642817738
One of the responses generated is similar to the expected output.


0.7194811378694316
Two out of the three responses generated are similar to the expected output.


0.8782207491967557
One of the responses generated (Take him for a walk in the park) is similar to the expected output.


In [15]:
# Non-rag scores
rag_scores = []
rag_reasons = []


for i in range(len(rag_data['question'])):
    test_case = LLMTestCase(
        input=rag_data['question'][i],
        actual_output=rag_data['answer'][i],
        expected_output=rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    print(correctness_metric.score)
    print(correctness_metric.reason)
    rag_scores.append(correctness_metric.score)
    rag_reasons.append(correctness_metric.reason)

0.8361233499701687
At least one of the responses in the actual output is similar to the expected output, which is 'I've been keeping busy with work and hanging out with friends.'


0.9650225847259944
Responses are similar to the expected output in terms of discussing Arsenal's current performance and potential.


0.9825289481142185
Responses generated are similar to the expected output.


0.8923881931930074
One of the responses generated is similar to the expected output.


In [16]:
# Printing out scores
print("-----------------Non-RAG Scores-----------------")
print(non_rag_scores)
print(non_rag_reasons)

print("-----------------RAG Scores-----------------")
print(rag_scores)
print(rag_reasons)

-----------------Non-RAG Scores-----------------
[0.6384615545750127, 0.730660642817738, 0.7194811378694316, 0.8782207491967557]
['One of the responses is similar to the expected output, mentioning activities and interactions with people.', 'One of the responses generated is similar to the expected output.', 'Two out of the three responses generated are similar to the expected output.', 'One of the responses generated (Take him for a walk in the park) is similar to the expected output.']
-----------------RAG Scores-----------------
[0.8361233499701687, 0.9650225847259944, 0.9825289481142185, 0.8923881931930074]
["At least one of the responses in the actual output is similar to the expected output, which is 'I've been keeping busy with work and hanging out with friends.'", "Responses are similar to the expected output in terms of discussing Arsenal's current performance and potential.", 'Responses generated are similar to the expected output.', 'One of the responses generated is similar

In [19]:

# Combine scores and reasons into a DataFrame
rag_df = pd.DataFrame({'Scores': rag_scores, 'Reasons': rag_reasons})
non_rag_df = pd.DataFrame({'Scores': non_rag_scores, 'Reasons': non_rag_reasons})

# Print the DataFrame
#print(rag_df)
print(non_rag_df)

     Scores                                            Reasons
0  0.638462  One of the responses is similar to the expecte...
1  0.730661  One of the responses generated is similar to t...
2  0.719481  Two out of the three responses generated are s...
3  0.878221  One of the responses generated (Take him for a...


In [21]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_non_rag_scores.xlsx'

# Store the DataFrame into an Excel file
non_rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_rag_scores.xlsx'

# Store the DataFrame into an Excel file
rag_df.to_excel(excel_file_path)

### V2

In [10]:
# Non-rag scores
non_rag_scores = []
non_rag_reasons = []


for i in range(len(non_rag_data['question'])):
    test_case = LLMTestCase(
        input=non_rag_data['question'][i],
        actual_output=non_rag_data['answer'][i],
        expected_output=non_rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    non_rag_scores.append(correctness_metric.score)
    non_rag_reasons.append(correctness_metric.reason)

In [11]:
# Non-rag scores
rag_scores = []
rag_reasons = []


for i in range(len(rag_data['question'])):
    test_case = LLMTestCase(
        input=rag_data['question'][i],
        actual_output=rag_data['answer'][i],
        expected_output=rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    print(correctness_metric.score)
    print(correctness_metric.reason)
    rag_scores.append(correctness_metric.score)
    rag_reasons.append(correctness_metric.reason)

0.5523883964191352
One of the responses in the actual output is similar to the input by mentioning being busy with work and hobbies.


0.9319282212046611
The responses generated are similar to the responses in the expected output. The main content is similar.


0.7244566430419077
One of the responses generated (He's a golden retriever) is similar to the expected output.


0.9478198014324384
All responses are related to activities involving the pet dog, which aligns with the main content of the input.


0.5822862931544079
One of the responses is similar to the expected output, mentioning 'Thailand' and 'trip'.


0.3067702607045373
One of the responses generated is similar to the expected output, mentioning Thailand and an incident that occurred.


0.823221183512115
The responses generated are similar to the expected output, showing engagement with a pet dog and personal activities.


0.9314172535625268
One of the responses mentions Thailand and new travel plans for next year, aligning with the input.


0.7950381348648572
Two of the responses are similar to expected output, providing information about the new pet dog's behavior and name.


0.9920098302073406
Responses are similar to the expected output, with the main content being about the new pet dog and his breed.


In [12]:
# Non-rag scores
rag_prompt_engineered_scores = []
rag_prompt_engineered_reasons = []


for i in range(len(rag_prompt_engineered_data['question'])):
    test_case = LLMTestCase(
        input=rag_prompt_engineered_data['question'][i],
        actual_output=rag_prompt_engineered_data['answer'][i],
        expected_output=rag_prompt_engineered_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    print(correctness_metric.score)
    print(correctness_metric.reason)
    rag_prompt_engineered_scores.append(correctness_metric.score)
    rag_prompt_engineered_reasons.append(correctness_metric.reason)

0.5228606030716128
One of the responses in the actual output is similar to the expected output, mentioning personal activities.


0.9790543705588901
The responses generated are similar to the expected output and the main content is similar as well.


0.9711074462336585
All responses mention the breed being a golden retriever, which is similar to the expected output.


0.8733315084265204
Actual output includes responses that are similar to the expected output in terms of planning activities with a pet dog.


0.539617624862043
The responses generated have similar negative sentiments as the expected output 'It was a disaster.'


0.8342032822836943
Responses are similar to the expected output with the main content being about unfortunate events during the trip to Thailand.


0.7994039966326718
The responses generated are similar to the expected output in terms of talking about a new pet dog and activities related to it.


0.9342532688835155
At least one response is similar to the expected output, mentioning challenges in Thailand and looking forward to Japan next year.


0.852037349087628
The responses are similar in content and sentiment to the expected output, but the names mentioned do not match the expected name 'Buddy.'


0.9758945310731736
Responses generated are similar to the expected output, with main content being about a golden retriever pet dog.


In [13]:
# Printing out scores
print("-----------------Non-RAG Scores-----------------")
print(non_rag_scores)
print(non_rag_reasons)

print("-----------------RAG Scores-----------------")
print(rag_scores)
print(rag_reasons)

print("-----------------RAG Prompt Engineered Scores-----------------")
print(rag_prompt_engineered_scores)
print(rag_prompt_engineered_reasons)

-----------------Non-RAG Scores-----------------
[0.8431434995083888, 0.8369311767027053, 0.7437082612132322, 0.8990891134868896, 0.6698382463946431, 0.6465877632015932, 0.6711979142971247, 0.8728702352024268, 0.7496296131708373, 0.7069368519424155]
['All responses are similar in content to the expected output.', "The responses generated are similar to the expected output in terms of discussing Arsenal's performance in the current season.", 'Responses generated match the expected output in terms of providing information about the breed of the new pet dog.', 'The responses generated are similar to the expected output and the main content matches.', 'Two out of three responses are similar to the expected output, which indicates a correct test case.', 'The actual output contains responses related to events in Thailand, which is similar to the input. However, the responses are not exactly the same as the expected output.', 'The responses generated are similar to the expected output, coveri

In [14]:
# Combine scores and reasons into a DataFrame
non_rag_df = pd.DataFrame({'Scores': non_rag_scores, 'Reasons': non_rag_reasons})
rag_df = pd.DataFrame({'Scores': rag_scores, 'Reasons': rag_reasons})
rag_prompt_engineered_df = pd.DataFrame({'Scores': rag_prompt_engineered_scores, 'Reasons': rag_prompt_engineered_reasons})


# Print the DataFrame
#print(rag_df)
#print(non_rag_df)

# Calculate the average scores for each DataFrame
non_rag_avg_score = non_rag_df['Scores'].mean()
rag_avg_score = rag_df['Scores'].mean()
rag_prompt_engineered_avg_score = rag_prompt_engineered_df['Scores'].mean()


# Print the average scores
print("Average Score for Non-RAG DataFrame:", non_rag_avg_score)
print("Average Score for RAG DataFrame:", rag_avg_score)
print("Average Score for RAG Prompt Engineered DataFrame:", rag_prompt_engineered_avg_score)

Average Score for Non-RAG DataFrame: 0.7639932675120257
Average Score for RAG DataFrame: 0.7587336018103927
Average Score for RAG Prompt Engineered DataFrame: 0.8281763981113409


In [15]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_non_rag_scores_v2.xlsx'

# Store the DataFrame into an Excel file
non_rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_rag_scores_v2.xlsx'

# Store the DataFrame into an Excel file
rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_rag_prompt_engineered_scores_v2.xlsx'

# Store the DataFrame into an Excel file
rag_prompt_engineered_df.to_excel(excel_file_path)

### V3

In [30]:
# Non-rag scores
non_rag_scores = []
non_rag_reasons = []


for i in range(len(non_rag_data['question'])):
    test_case = LLMTestCase(
        input=non_rag_data['question'][i],
        actual_output=non_rag_data['answer'][i],
        expected_output=non_rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    non_rag_scores.append(correctness_metric.score)
    non_rag_reasons.append(correctness_metric.reason)

print(non_rag_scores)
print(non_rag_reasons)

[0.193264541095822, 0.39313068987627475, 0.725914109293312, 0.47437482498630984, 0.27962148595016034, 0.010841221637317548, 0.35507689868395725, 0.008524866518670998, 0.3310106748215845, 0.6666524070296498]
['None of the main content in the actual output matches any of the main content in the expected output.', 'The main content of the responses does not align with the expected output, but some similarities can be found.', "The main content of the responses in the 'actual output' is similar to the responses in the 'expected output'.", 'One of the main content in the actual output (Response 3) matches a content in the expected output (Response 3).', 'None of the main content in the actual output responses match with the main content in the expected output responses.', "The main content of the responses generated in 'actual output' does not match the main content of the responses in the 'expected output'.", 'The main content of the responses generated is not similar to the responses in t

In [31]:
# Non-rag scores
rag_scores = []
rag_reasons = []


for i in range(len(rag_data['question'])):
    test_case = LLMTestCase(
        input=rag_data['question'][i],
        actual_output=rag_data['answer'][i],
        expected_output=rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    #print(correctness_metric.score)
    #print(correctness_metric.reason)
    rag_scores.append(correctness_metric.score)
    rag_reasons.append(correctness_metric.reason)

print(rag_scores)
print(rag_reasons)

[0.1968160125284601, 0.31714466953816567, 0.7022040029830708, 0.6419377214776555, 0.7335189351552219, 0.19077942038401768, 0.30365178480761723, 0.4095880972051636, 0.4657967294974451, 0.7591165660199164]
['Only one response in the actual output is somewhat similar to one of the expected responses (response 3 mentioning a new project and a new pet dog).', 'The main content of the actual output responses does not closely match the main content of the expected output responses.', "The main content of the responses generated in 'actual output' are similar to the responses in the 'expected output'.", 'Two of the main content in the actual output match with the expected output.', "The main content of the responses in 'actual output' are similar to the responses in 'expected output'.", 'The main content of the responses in the actual output does not match the main content of the responses in the expected output.', 'The main content of the actual output responses about having a pet dog matches

In [32]:
# Non-rag scores
rag_prompt_engineered_scores = []
rag_prompt_engineered_reasons = []


for i in range(len(rag_prompt_engineered_data['question'])):
    test_case = LLMTestCase(
        input=rag_prompt_engineered_data['question'][i],
        actual_output=rag_prompt_engineered_data['answer'][i],
        expected_output=rag_prompt_engineered_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    #print(correctness_metric.score)
    #print(correctness_metric.reason)
    rag_prompt_engineered_scores.append(correctness_metric.score)
    rag_prompt_engineered_reasons.append(correctness_metric.reason)

In [33]:
# Printing out scores
print("-----------------Non-RAG Scores-----------------")
print(non_rag_scores)
print(non_rag_reasons)

print("-----------------RAG Scores-----------------")
print(rag_scores)
print(rag_reasons)

print("-----------------RAG Prompt Engineered Scores-----------------")
print(rag_prompt_engineered_scores)
print(rag_prompt_engineered_reasons)

-----------------Non-RAG Scores-----------------
[0.193264541095822, 0.39313068987627475, 0.725914109293312, 0.47437482498630984, 0.27962148595016034, 0.010841221637317548, 0.35507689868395725, 0.008524866518670998, 0.3310106748215845, 0.6666524070296498]
['None of the main content in the actual output matches any of the main content in the expected output.', 'The main content of the responses does not align with the expected output, but some similarities can be found.', "The main content of the responses in the 'actual output' is similar to the responses in the 'expected output'.", 'One of the main content in the actual output (Response 3) matches a content in the expected output (Response 3).', 'None of the main content in the actual output responses match with the main content in the expected output responses.', "The main content of the responses generated in 'actual output' does not match the main content of the responses in the 'expected output'.", 'The main content of the respons

In [34]:
# Combine scores and reasons into a DataFrame
non_rag_df = pd.DataFrame({'Scores': non_rag_scores, 'Reasons': non_rag_reasons})
rag_df = pd.DataFrame({'Scores': rag_scores, 'Reasons': rag_reasons})
rag_prompt_engineered_df = pd.DataFrame({'Scores': rag_prompt_engineered_scores, 'Reasons': rag_prompt_engineered_reasons})


# Print the DataFrame
#print(rag_df)
#print(non_rag_df)

# Calculate the average scores for each DataFrame
non_rag_avg_score = non_rag_df['Scores'].mean()
rag_avg_score = rag_df['Scores'].mean()
rag_prompt_engineered_avg_score = rag_prompt_engineered_df['Scores'].mean()


# Print the average scores
print("Average Score for Non-RAG DataFrame:", non_rag_avg_score)
print("Average Score for RAG DataFrame:", rag_avg_score)
print("Average Score for RAG Prompt Engineered DataFrame:", rag_prompt_engineered_avg_score)

Average Score for Non-RAG DataFrame: 0.3438411719893059
Average Score for RAG DataFrame: 0.47205539395967344
Average Score for RAG Prompt Engineered DataFrame: 0.554508111381623


In [35]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_non_rag_scores_v3.xlsx'

# Store the DataFrame into an Excel file
non_rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_rag_scores_v3.xlsx'

# Store the DataFrame into an Excel file
rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_rag_prompt_engineered_scores_v3.xlsx'

# Store the DataFrame into an Excel file
rag_prompt_engineered_df.to_excel(excel_file_path)

### V4

In [24]:
# Non-rag scores
non_rag_scores = []
non_rag_reasons = []


for i in range(len(non_rag_data['question'])):
    test_case = LLMTestCase(
        input=non_rag_data['question'][i],
        actual_output=non_rag_data['answer'][i],
        expected_output=non_rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    non_rag_scores.append(correctness_metric.score)
    non_rag_reasons.append(correctness_metric.reason)

print(non_rag_scores)
print(non_rag_reasons)

[0.35625942072154954, 0.49703950978108324, 0.7695154950770453, 0.48309231078982207, 0.6154623344698666, 0.1204357920566029, 0.5784291155179508, 0.034683193540925195, 0.6015463776192753, 0.683659838894994]
['Main content of the responses generated do not match any of the expected output, but some responses are similar in nature.', 'The main content of the responses in the actual output is not similar to the expected output, but some similarities are present.', 'Two out of three responses in the actual output match with the main content of the expected output.', 'Some main content of the responses in the actual output are similar to those in the expected output.', 'None of the main content in the actual output matches any of the main content in the expected output.', 'None of the main contents in the actual output match the main contents in the expected output.', 'Responses in actual output are not similar to the expected output in terms of main content.', 'None of the main content in th

In [25]:
# Non-rag scores
rag_scores = []
rag_reasons = []


for i in range(len(rag_data['question'])):
    test_case = LLMTestCase(
        input=rag_data['question'][i],
        actual_output=rag_data['answer'][i],
        expected_output=rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    #print(correctness_metric.score)
    #print(correctness_metric.reason)
    rag_scores.append(correctness_metric.score)
    rag_reasons.append(correctness_metric.reason)

print(rag_scores)
print(rag_reasons)

[0.29481049405411697, 0.5394217459286097, 0.7129730864925726, 0.6611360534891323, 0.7823760798088368, 0.4608238789440106, 0.26463576140316525, 0.7242233188588038, 0.6699009322654439, 0.7666675354932899]
["One of the main content in the actual output ('I recently started a new project that's been keeping me occupied.') is similar to the main content in the expected output ('I just got a new pet dog.').", 'One of the main content in the actual output (Response 1) does not match any of the expected output responses.', 'Two out of three responses in the actual output match the main content of the expected output.', 'Response 1 has similar main content to Expected Output Response 1.', 'Two out of the three responses in the actual output have similar main content to the responses in the expected output.', 'Two out of three main content responses in the actual output are similar to the expected output.', "The main content of the responses in the 'actual output' does not match the main content

In [26]:
# Non-rag scores
rag_prompt_engineered_scores = []
rag_prompt_engineered_reasons = []


for i in range(len(rag_prompt_engineered_data['question'])):
    test_case = LLMTestCase(
        input=rag_prompt_engineered_data['question'][i],
        actual_output=rag_prompt_engineered_data['answer'][i],
        expected_output=rag_prompt_engineered_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    #print(correctness_metric.score)
    #print(correctness_metric.reason)
    rag_prompt_engineered_scores.append(correctness_metric.score)
    rag_prompt_engineered_reasons.append(correctness_metric.reason)

In [27]:
# Printing out scores
print("-----------------Non-RAG Scores-----------------")
print(non_rag_scores)
print(non_rag_reasons)

print("-----------------RAG Scores-----------------")
print(rag_scores)
print(rag_reasons)

print("-----------------RAG Prompt Engineered Scores-----------------")
print(rag_prompt_engineered_scores)
print(rag_prompt_engineered_reasons)

-----------------Non-RAG Scores-----------------
[0.35625942072154954, 0.49703950978108324, 0.7695154950770453, 0.48309231078982207, 0.6154623344698666, 0.1204357920566029, 0.5784291155179508, 0.034683193540925195, 0.6015463776192753, 0.683659838894994]
['Main content of the responses generated do not match any of the expected output, but some responses are similar in nature.', 'The main content of the responses in the actual output is not similar to the expected output, but some similarities are present.', 'Two out of three responses in the actual output match with the main content of the expected output.', 'Some main content of the responses in the actual output are similar to those in the expected output.', 'None of the main content in the actual output matches any of the main content in the expected output.', 'None of the main contents in the actual output match the main contents in the expected output.', 'Responses in actual output are not similar to the expected output in terms o

In [28]:
# Combine scores and reasons into a DataFrame
non_rag_df = pd.DataFrame({'Scores': non_rag_scores, 'Reasons': non_rag_reasons})
rag_df = pd.DataFrame({'Scores': rag_scores, 'Reasons': rag_reasons})
rag_prompt_engineered_df = pd.DataFrame({'Scores': rag_prompt_engineered_scores, 'Reasons': rag_prompt_engineered_reasons})


# Print the DataFrame
#print(rag_df)
#print(non_rag_df)

# Calculate the average scores for each DataFrame
non_rag_avg_score = non_rag_df['Scores'].mean()
rag_avg_score = rag_df['Scores'].mean()
rag_prompt_engineered_avg_score = rag_prompt_engineered_df['Scores'].mean()


# Print the average scores
print("Average Score for Non-RAG DataFrame:", non_rag_avg_score)
print("Average Score for RAG DataFrame:", rag_avg_score)
print("Average Score for RAG Prompt Engineered DataFrame:", rag_prompt_engineered_avg_score)

Average Score for Non-RAG DataFrame: 0.4740123388469115
Average Score for RAG DataFrame: 0.5876968886737981
Average Score for RAG Prompt Engineered DataFrame: 0.6435506002724233
