# RAGAS Evaluation

In [1]:
# Imports
import os
import json
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision


# RAG
from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import AzureOpenAIEmbeddings

# Generation of responses
import openai

# Store score
import openpyxl

#!pip install openpyxl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv(override=True)


True

In [None]:
# data_sample = {
#     'question': [
#         'How have you been Roydon?'
#     ],
#     'answer': [
#         "Response 1: I have been good, how about you? Response 2: I've been doing well thanks for asking. Response 3: Not too bad how about you?"
#     ],
#     'contexts': [
#         ["""{'Roydon": "Hey there! Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's always exciting to see how your team will perform. Optimistic as always, I see!},
#          {"Roydon": "I can't wait to immerse myself in everything Japan has to offer and create lasting memories that will overshadow my Thailand trip.", "Yas": "Your positive outlook will surely make this trip one for the books! Japan is lucky to have you as a visitor."},
#          {"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That's awesome! What breed is it?"}"""]
#     ],
#     'ground_truth': [
#         "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?"
#     ]
# }

In [49]:
# Without history of replies
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?'
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's so playful! Response 3: He is a golden retriever, and he's so fluffy!",
        "Response 1: I'm planning to take him on walks and teach him some tricks. Response 2: I'm planning to take him to the park and play fetch with him. Response 3: I'm planning to take him to the beach and let him run around."
    ]
}


In [3]:
# Environment variables
embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

loaded_faiss_vs = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\faiss_vs", embeddings=embeddings, allow_dangerous_deserialization=True)



## Generate for non-rag

In [47]:
# Generate json for non rag
for query in data_sample['question']:

    data_sample['contexts'].append([''])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
    You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
    The topic should be interpreted from the conversation.
    If no topic could be interpreted, provide default responses that a person would start with such as greetings. 
    The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.
    In the case the responses are not chosen, the mute person could type their own response. Do take note of this response and continue the conversation from the response selected or typed out by the mute person.
    Ensure the responses generated will allow the conversation to flow smoothly.

    It must be in english. 

    An example of the 3 generated response would be in the format of 1 single string "Response 1: what you generated Response 2: what you generated Response 3: what you generated" all in one line.
    """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [48]:
# Specify the file path
file_path = 'testing_json/data_sample_non_rag_test_no_history.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Generate for RAG

In [51]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    context = loaded_faiss_vs.similarity_search(query, k=3)
    contexts = ""
    for con in context:
        contexts += con.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
    You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
    The topic should be interpreted from the conversation.
    If no topic could be interpreted, use the context provided below under the section context. 
    The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.
    In the case the responses are not chosen, the mute person could type their own response. Do take note of this response and continue the conversation from the response selected or typed out by the mute person.
    Ensure the responses generated will allow the conversation to flow smoothly.

    It must be in english. 

    Context section:
    Use the following previous conversations to assist in generating the 3 responses:\n
    {contexts}

    An example of the 3 generated response would be in the format of 1 single string "Response 1: what you generated Response 2: what you generated Response 3: what you generated" all in one line.
    """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [52]:
# Specify the file path
file_path = 'testing_json/data_sample_rag_test_no_history.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Evaluation scores

In [4]:
file_path_non_rag = 'testing_json/data_sample_non_rag_test_no_history.json'
file_path_rag = 'testing_json/data_sample_rag_test_no_history.json'

with open(file_path_non_rag, 'r') as json_file:
    non_rag_data = json.load(json_file)

with open(file_path_rag, 'r') as json_file:
    rag_data = json.load(json_file)

non_rag_dataset = Dataset.from_dict(non_rag_data)
rag_dataset = Dataset.from_dict(rag_data)

In [5]:
non_rag_score = evaluate(non_rag_dataset, metrics=[answer_relevancy, answer_correctness, context_precision, context_recall])
rag_score = evaluate(rag_dataset, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])

non_rag_df = non_rag_score.to_pandas()
rag_df = rag_score.to_pandas()

Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.51it/s]
Evaluating: 100%|██████████| 16/16 [00:11<00:00,  1.35it/s]


In [6]:
non_rag_df.head()

Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall
0,What have you been up to Roydon?,"Response 1: Not much, just relaxing at home. \...",[],Response 1: I've been watching Arsenal games h...,0.0,0.210712,0.0,0.0
1,Woah really how is Arsenal doing right now then?,Response 1: They are doing well this season Re...,[],"Response 1: Arsenal is doing well, did you cat...",0.0,0.416784,0.0,0.0
2,Nice what breed is your new pet dog?,Response 1: He's a golden retriever Response 2...,[],"Response 1: He is a golden retriever, and he's...",0.0,0.233798,0.0,0.0
3,So what you planning to do with your pet dog?,Response 1: Take him for a walk in the park ...,[],Response 1: I'm planning to take him on walks ...,0.860532,0.231854,0.0,0.0


In [7]:
rag_df.head()

Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall
0,What have you been up to Roydon?,"Response 1: ""I've been keeping busy with work ...","[{""Roydon"": ""Hey there! Did you catch the Arse...",Response 1: I've been watching Arsenal games h...,0.0,0.208525,1.0,0.666667
1,Woah really how is Arsenal doing right now then?,Response 1: They are currently showing great p...,"[{""Roydon"": ""I couldn't agree more! Aubameyang...","Response 1: Arsenal is doing well, did you cat...",0.0,0.909379,1.0,0.333333
2,Nice what breed is your new pet dog?,"Response 1: He's a golden retriever, and he's ...","[{""Roydon"": ""Guess what, I just got a new pet ...","Response 1: He is a golden retriever, and he's...",0.941946,0.74119,1.0,1.0
3,So what you planning to do with your pet dog?,Response 1: I want to teach him some tricks li...,"[{""Roydon"": ""Guess what, I just got a new pet ...",Response 1: I'm planning to take him on walks ...,0.915949,0.684462,1.0,0.333333


In [8]:
import pandas as pd

# Calculate average for non_rag_df
non_rag_avg_answer_relevancy = non_rag_df['answer_relevancy'].mean(skipna=True)
non_rag_avg_answer_correctness = non_rag_df['answer_correctness'].mean(skipna=True)
non_rag_avg_precision = non_rag_df['context_precision'].mean(skipna=True)
non_rag_avg_recall = non_rag_df['context_recall'].mean(skipna=True)

# Calculate average for rag_df
rag_avg_answer_relevancy = rag_df['answer_relevancy'].mean(skipna=True)
rag_avg_answer_correctness = rag_df['answer_correctness'].mean(skipna=True)
rag_avg_precision = rag_df['context_precision'].mean(skipna=True)
rag_avg_recall = rag_df['context_recall'].mean(skipna=True)


# Print the averages
print("Non-RAG Average Answer Relevancy:", non_rag_avg_answer_relevancy)
print("Non-RAG Average Answer Correctness:", non_rag_avg_answer_correctness)
print("Non-RAG Average Context Precision:", non_rag_avg_precision)
print("Non-RAG Average Context Recall:", non_rag_avg_recall)
print("RAG Average Answer Relevancy:", rag_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_avg_answer_correctness)
print("RAG Average Context Precision:", rag_avg_precision)
print("RAG Average Context Recall:", rag_avg_recall)

Non-RAG Average Answer Relevancy: 0.21513308049068516
Non-RAG Average Answer Correctness: 0.27328692887430783
Non-RAG Average Context Precision: 0.0
Non-RAG Average Context Recall: 0.0
RAG Average Answer Relevancy: 0.4644737698602591
RAG Average Answer Correctness: 0.6358889285294633
RAG Average Context Precision: 0.9999999999
RAG Average Context Recall: 0.5833333333333334


In [39]:
non_rag_df.head()

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,answer_correctness
0,How have you been Roydon?,"Response 1: I've been good, thank you for aski...",[],Response 1: I've been watching Arsenal games h...,,0.0,0.204168
1,Woah really how is Arsenal doing right now then?,Response 1: They are currently in a good posit...,[],"Response 1: Arsenal is doing well, did you cat...",0.0,0.0,0.478189
2,Nice what breed is your new pet dog?,Response 1: He's a golden retriever\nResponse ...,[],"Response 1: He is a golden retriever, and he's...",,0.832688,0.233893
3,So what you planning to do with your pet dog?,Response 1: Take him for a walk in the park Re...,[],Response 1: I'm planning to take him on walks ...,,0.860532,0.231712


In [9]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/non_rag_scores.xlsx'

# Store the DataFrame into an Excel file
non_rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/rag_scores.xlsx'

# Store the DataFrame into an Excel file
rag_df.to_excel(excel_file_path)

# BLUERT Score

In [2]:
!git clone https://github.com/google-research/bleurt.git
!pip install ./bleurt

Cloning into 'bleurt'...


In [12]:
from bleurt import score
import json

In [13]:
# Load non rag data from json
with open('testing_json/data_sample_non_rag_test_no_history.json', 'r') as json_file:
    non_rag_data = json.load(json_file)

# Load rag data from json
with open('testing_json/data_sample_rag_test_no_history.json', 'r') as json_file:
    rag_data = json.load(json_file)

In [15]:
# Scores for non-rag
checkpoint = "bleurt/bleurt/test_checkpoint"
references = non_rag_data['ground_truth'] 
candidates = non_rag_data['answer']

scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
assert isinstance(scores, list) and len(scores) == 4
print(scores)

INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


[-0.7722375392913818, -0.5295215845108032, -0.6895624995231628, -0.43300843238830566]


In [16]:
checkpoint = "bleurt/bleurt/test_checkpoint"
references = rag_data['ground_truth'] 
candidates = rag_data['answer']

scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
assert isinstance(scores, list) and len(scores) == 4
print(scores)

INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Reading checkpoint bleurt/bleurt/test_checkpoint.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Will load checkpoint dbleurt_tiny


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... name:dbleurt_tiny


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


[-0.6121180057525635, -0.20042423903942108, 0.3049095869064331, -0.33035808801651]


In [22]:
print(references[2])

Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's so playful! Response 3: He is a golden retriever, and he's so fluffy!


In [21]:
print(candidates[2])

Response 1: He's a golden retriever, and he's the cutest thing ever!
Response 2: My new dog is a golden retriever, I'm so happy to have him!
Response 3: I have a golden retriever, he's adorable and friendly.


In [20]:
print(rag_data['contexts'][2])

['{"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That\'s awesome! What breed is it?"}{"Roydon": "It\'s a golden retriever, and he\'s the cutest thing ever!", "Jacob": "Golden retrievers are so friendly and loyal, you\'re going to have so much fun with him!"}{"Roydon": "I couldn\'t agree more, I feel like my new dog has completed my little family.", "Jacob": "It\'s amazing how pets have a way of making a house feel like a home, enjoy every moment with your furry friend!"}']


In [23]:
print(rag_data['question'][2])

Nice what breed is your new pet dog?


## G-Eval

In [20]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv
import json
load_dotenv(override=True)

# Dataframes
import pandas as pd
import openpyxl

In [12]:
correctness_metric = GEval(
    name="Relevance",
    criteria="Determine whether the actual output matches the expected output as close as possible.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the responses generated in 'actual output' are similar to the responses in the 'expected output'",
        "As long as one of the responses generated is similar to the expected output, the test case is considered correct",
        "As long as the main content is similar, it is considered okay"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model="gpt-3.5-turbo",
)

In [13]:
# Load non rag data from json
with open('testing_json/data_sample_non_rag_test_no_history.json', 'r') as json_file:
    non_rag_data = json.load(json_file)

# Load rag data from json
with open('testing_json/data_sample_rag_test_no_history.json', 'r') as json_file:
    rag_data = json.load(json_file)

In [14]:
# Non-rag scores
non_rag_scores = []
non_rag_reasons = []


for i in range(len(non_rag_data['question'])):
    test_case = LLMTestCase(
        input=non_rag_data['question'][i],
        actual_output=non_rag_data['answer'][i],
        expected_output=non_rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    non_rag_scores.append(correctness_metric.score)
    non_rag_reasons.append(correctness_metric.reason)

0.6384615545750127
One of the responses is similar to the expected output, mentioning activities and interactions with people.


0.730660642817738
One of the responses generated is similar to the expected output.


0.7194811378694316
Two out of the three responses generated are similar to the expected output.


0.8782207491967557
One of the responses generated (Take him for a walk in the park) is similar to the expected output.


In [15]:
# Non-rag scores
rag_scores = []
rag_reasons = []


for i in range(len(rag_data['question'])):
    test_case = LLMTestCase(
        input=rag_data['question'][i],
        actual_output=rag_data['answer'][i],
        expected_output=rag_data['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    print(correctness_metric.score)
    print(correctness_metric.reason)
    rag_scores.append(correctness_metric.score)
    rag_reasons.append(correctness_metric.reason)

0.8361233499701687
At least one of the responses in the actual output is similar to the expected output, which is 'I've been keeping busy with work and hanging out with friends.'


0.9650225847259944
Responses are similar to the expected output in terms of discussing Arsenal's current performance and potential.


0.9825289481142185
Responses generated are similar to the expected output.


0.8923881931930074
One of the responses generated is similar to the expected output.


In [16]:
# Printing out scores
print("-----------------Non-RAG Scores-----------------")
print(non_rag_scores)
print(non_rag_reasons)

print("-----------------RAG Scores-----------------")
print(rag_scores)
print(rag_reasons)

-----------------Non-RAG Scores-----------------
[0.6384615545750127, 0.730660642817738, 0.7194811378694316, 0.8782207491967557]
['One of the responses is similar to the expected output, mentioning activities and interactions with people.', 'One of the responses generated is similar to the expected output.', 'Two out of the three responses generated are similar to the expected output.', 'One of the responses generated (Take him for a walk in the park) is similar to the expected output.']
-----------------RAG Scores-----------------
[0.8361233499701687, 0.9650225847259944, 0.9825289481142185, 0.8923881931930074]
["At least one of the responses in the actual output is similar to the expected output, which is 'I've been keeping busy with work and hanging out with friends.'", "Responses are similar to the expected output in terms of discussing Arsenal's current performance and potential.", 'Responses generated are similar to the expected output.', 'One of the responses generated is similar

In [19]:

# Combine scores and reasons into a DataFrame
rag_df = pd.DataFrame({'Scores': rag_scores, 'Reasons': rag_reasons})
non_rag_df = pd.DataFrame({'Scores': non_rag_scores, 'Reasons': non_rag_reasons})

# Print the DataFrame
#print(rag_df)
print(non_rag_df)

     Scores                                            Reasons
0  0.638462  One of the responses is similar to the expecte...
1  0.730661  One of the responses generated is similar to t...
2  0.719481  Two out of the three responses generated are s...
3  0.878221  One of the responses generated (Take him for a...


In [21]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_non_rag_scores.xlsx'

# Store the DataFrame into an Excel file
non_rag_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/g_eval_rag_scores.xlsx'

# Store the DataFrame into an Excel file
rag_df.to_excel(excel_file_path)