In [1]:
# Imports
import os
import json
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage

In [2]:
# Load in environment variables
load_dotenv(override=True)

True

# Hybrid Search

In [2]:
# !pip install -q langchain sentence-transformers cohere
# !pip install rank_bm25
# !pip install inflect

Collecting inflect
  Downloading inflect-7.4.0-py3-none-any.whl (34 kB)
Collecting more-itertools>=8.5.0 (from inflect)
  Downloading more_itertools-10.5.0-py3-none-any.whl (60 kB)
                                              0.0/61.0 kB ? eta -:--:--
     ------                                   10.2/61.0 kB ? eta -:--:--
     -------------------------------------- 61.0/61.0 kB 805.0 kB/s eta 0:00:00
Collecting typeguard>=4.0.1 (from inflect)
  Downloading typeguard-4.3.0-py3-none-any.whl (35 kB)
Installing collected packages: typeguard, more-itertools, inflect
Successfully installed inflect-7.4.0 more-itertools-10.5.0 typeguard-4.3.0



[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [4]:
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

61


In [63]:
for doc in documents:
    doc.metadata['file_name'] = doc.metadata['source']
    #print(doc)

In [5]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Create vector store

In [21]:
faiss_vectorstore_hugging_face_v1 = FAISS.from_documents(documents, embeddings)

faiss_vectorstore_hugging_face_v1

<langchain_community.vectorstores.faiss.FAISS at 0x1872010fe90>

In [22]:
## Saving Vector Store
faiss_vectorstore_hugging_face_v1.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v1")

## Loading and using vector store

### Hugging Face Embeddings

In [42]:
## Load Vector Store
loaded_faiss_vs_hf_v1 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v1", embeddings=embeddings, allow_dangerous_deserialization=True)

In [17]:
retriever_vectordb = loaded_faiss_vs_hf_v1.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.5, 0.5])

In [9]:
query = "What have you been up to Roydon?"
docs_rel=ensemble_retriever.get_relevant_documents(query)
docs_rel

  warn_deprecated(


[Document(metadata={'label': 'Response 3', 'source': 'travel.json', 'file_name': 'travel.json'}, page_content='{"Roydon": "I tried to, but everywhere I went, I just kept getting ripped off by the locals.", "Dory": "That must have been frustrating. Did you try any of the street food at least?"}'),
 Document(metadata={'label': 'Response 4', 'source': 'next_trip2.json', 'file_name': 'next_trip2.json'}, page_content='{"Roydon": "No, I had to borrow money from a friend to get back home.", "Xavier": "That must have been a really stressful experience."}'),
 Document(metadata={'label': 'Response 1', 'source': 'football2.json', 'file_name': 'football2.json'}, page_content='{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn\'t they?"}'),
 Document(metadata={'label': 'Response 6', 'source': 'football.json', 'file_name': 'football.json'}, page_content='{"Roydon": "Arteta has been makin

In [10]:
# Re ranking
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

compressor = CohereRerank(cohere_api_key=os.environ['COHERE_API_KEY'])
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever
)
compressed_docs = compression_retriever.get_relevant_documents(query)
compressed_docs

  warn_deprecated(


[Document(metadata={'label': 'Response 3', 'source': 'travel.json', 'file_name': 'travel.json', 'relevance_score': 0.95177364}, page_content='{"Roydon": "I tried to, but everywhere I went, I just kept getting ripped off by the locals.", "Dory": "That must have been frustrating. Did you try any of the street food at least?"}'),
 Document(metadata={'label': 'Response 1', 'source': 'football2.json', 'file_name': 'football2.json', 'relevance_score': 0.896614}, page_content='{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn\'t they?"}'),
 Document(metadata={'label': 'Response 7', 'source': 'travel.json', 'file_name': 'travel.json', 'relevance_score': 0.85645247}, page_content='{"Roydon": "Easy for you to say. You weren\'t the one stuck in a foreign country with nothing going right.", "Dory": "I know, but sometimes these things happen. You just have to try and make the best of i

### OpenAI Embedding

In [12]:
# Load and retrieve normally using semantic search
from langchain_openai.embeddings import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

loaded_faiss_vs_v3 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\faiss_vs_v3", embeddings=embeddings, allow_dangerous_deserialization=True)

In [13]:
# Normal semantic search
query = "What have you been up to Roydon?"
context = loaded_faiss_vs_v3.similarity_search(query, k=3)

for con in context:
    print("=" * 30, "\n", con.metadata)
    print(con.page_content)

 {'label': 'Response 1', 'source': 'football2.json', 'file_name': 'football2.json'}
{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn't they?"}
 {'label': 'Response 11', 'source': 'next_trip2.json', 'file_name': 'next_trip2.json'}
{"Roydon": "You too, Xavier."}
 {'label': 'Response 1', 'source': 'next_trip2.json', 'file_name': 'next_trip2.json'}
{"Roydon": "I can't believe what happened to me in Thailand.", "Xavier": "What happened?"}


In [13]:
retriever_vectordb = loaded_faiss_vs_v3.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.5, 0.5])

In [14]:
query = "What have you been up to Roydon?"
docs_rel=ensemble_retriever.get_relevant_documents(query)
docs_rel

[Document(metadata={'label': 'Response 1', 'source': 'football2.json', 'file_name': 'football2.json'}, page_content='{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn\'t they?"}'),
 Document(metadata={'label': 'Response 4', 'source': 'next_trip2.json', 'file_name': 'next_trip2.json'}, page_content='{"Roydon": "No, I had to borrow money from a friend to get back home.", "Xavier": "That must have been a really stressful experience."}'),
 Document(metadata={'label': 'Response 11', 'source': 'next_trip2.json', 'file_name': 'next_trip2.json'}, page_content='{"Roydon": "You too, Xavier."}'),
 Document(metadata={'label': 'Response 6', 'source': 'football.json', 'file_name': 'football.json'}, page_content='{"Roydon": "Arteta has been making some good decisions lately, so I have faith in him. How about Ole Gunnar Solskjaer?", "John": "Solskjaer has been improving as a manager, but 

In [15]:
# Cohere reranking
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

compressor = CohereRerank(cohere_api_key=os.environ['COHERE_API_KEY'])
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever
)
compressed_docs = compression_retriever.get_relevant_documents(query)
compressed_docs

[Document(metadata={'label': 'Response 3', 'source': 'travel.json', 'file_name': 'travel.json', 'relevance_score': 0.9518632}, page_content='{"Roydon": "I tried to, but everywhere I went, I just kept getting ripped off by the locals.", "Dory": "That must have been frustrating. Did you try any of the street food at least?"}'),
 Document(metadata={'label': 'Response 11', 'source': 'next_trip2.json', 'file_name': 'next_trip2.json', 'relevance_score': 0.9073122}, page_content='{"Roydon": "You too, Xavier."}'),
 Document(metadata={'label': 'Response 1', 'source': 'football2.json', 'file_name': 'football2.json', 'relevance_score': 0.8962514}, page_content='{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn\'t they?"}')]

# Evaluation using metrics score for few shot prompting responses

## OpenAI Embedding with ensemble

In [17]:
# Generation of responses
import openai

In [16]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [18]:
# Load and retrieve normally using semantic search
from langchain_openai.embeddings import AzureOpenAIEmbeddings

open_ai_embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

loaded_faiss_vs_v3 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\faiss_vs_v3", embeddings=open_ai_embeddings, allow_dangerous_deserialization=True)

In [19]:
# Initiate ensemble retriever
retriever_vectordb = loaded_faiss_vs_v3.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.5, 0.5])

In [22]:
query = "What have you been up to Roydon?"
docs_rel=ensemble_retriever.get_relevant_documents(query)
docs_rel_top_3 = docs_rel[:3]

contexts = ""
for context in docs_rel_top_3:
    contexts += context.page_content

print(contexts)


{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn't they?"}{"Roydon": "No, I had to borrow money from a friend to get back home.", "Xavier": "That must have been a really stressful experience."}{"Roydon": "You too, Xavier."}


In [23]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    docs_rel_top_3 = docs_rel[:3]

    contexts = ""
    for context in docs_rel_top_3:
        contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [24]:
# Specify the file path
file_path = 'testing_json/rag_few_shot_ensemble_v1.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Hugging Face Embedding with ensemble

In [25]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [11]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v1 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v1", embeddings=embeddings, allow_dangerous_deserialization=True)

In [12]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v1.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.5, 0.5])

In [28]:
query = "What have you been up to Roydon?"
docs_rel=ensemble_retriever.get_relevant_documents(query)
docs_rel_top_3 = docs_rel[:3]

contexts = ""
for context in docs_rel_top_3:
    contexts += context.page_content

print(contexts)

{"Roydon": "I tried to, but everywhere I went, I just kept getting ripped off by the locals.", "Dory": "That must have been frustrating. Did you try any of the street food at least?"}{"Roydon": "No, I had to borrow money from a friend to get back home.", "Xavier": "That must have been a really stressful experience."}{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn't they?"}


In [29]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    docs_rel_top_3 = docs_rel[:3]

    contexts = ""
    for context in docs_rel_top_3:
        contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [30]:
# Specify the file path
file_path = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v2.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Compare between original and the 2

In [33]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision

In [32]:
# RAGAS 
file_path_rag_few_shot = 'testing_json/data_sample_rag_test_prompt_engineered_v2.json'
file_path_ensemble_open_ai = 'testing_json/Improve_RAG/rag_few_shot_ensemble_v1.json'
file_path_ensemble_hf = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v2.json.'

with open(file_path_rag_few_shot, 'r') as json_file:
    rag_data_few_shot = json.load(json_file)

with open(file_path_ensemble_open_ai, 'r') as json_file:
    rag_ensemble_open_ai = json.load(json_file)

with open(file_path_ensemble_hf, 'r') as json_file:
    rag_ensemble_hf = json.load(json_file)

rag_dataset_few_shot = Dataset.from_dict(rag_data_few_shot)
rag_dataset_ensemble_open_ai = Dataset.from_dict(rag_ensemble_open_ai)
rag_dataset_ensemble_hf = Dataset.from_dict(rag_ensemble_hf)

In [34]:
rag_dataset_few_shot_score = evaluate(rag_dataset_few_shot, metrics=[answer_relevancy, answer_correctness, context_precision, context_recall])
rag_dataset_ensemble_open_ai_score = evaluate(rag_dataset_ensemble_open_ai, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])
rag_dataset_ensemble_hf_score = evaluate(rag_dataset_ensemble_hf, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])

rag_few_shot_df = rag_dataset_few_shot_score.to_pandas()
rag_ensemble_open_ai_df = rag_dataset_ensemble_open_ai_score.to_pandas()
rag_ensemble_hf_df= rag_dataset_ensemble_hf_score.to_pandas()

Evaluating: 100%|██████████| 40/40 [00:27<00:00,  1.45it/s]
Evaluating: 100%|██████████| 40/40 [00:18<00:00,  2.11it/s]
Evaluating: 100%|██████████| 40/40 [00:59<00:00,  1.50s/it]


In [35]:
import pandas as pd

# Calculate average for non_rag_df
rag_few_shot_relevancy = rag_few_shot_df['answer_relevancy'].mean(skipna=True)
rag_few_shot_answer_correctness = rag_few_shot_df['answer_correctness'].mean(skipna=True)
rag_few_shot_avg_precision = rag_few_shot_df['context_precision'].mean(skipna=True)
rag_few_shot_avg_recall = rag_few_shot_df['context_recall'].mean(skipna=True)

# Calculate average for rag_df
rag_ensemble_open_ai_avg_answer_relevancy = rag_ensemble_open_ai_df['answer_relevancy'].mean(skipna=True)
rag_ensemble_open_ai_avg_answer_correctness = rag_ensemble_open_ai_df['answer_correctness'].mean(skipna=True)
rag_ensemble_open_ai_avg_precision = rag_ensemble_open_ai_df['context_precision'].mean(skipna=True)
rag_ensemble_open_ai_avg_recall = rag_ensemble_open_ai_df['context_recall'].mean(skipna=True)

# Calculate average for rag_df
rag_ensemble_hf_avg_answer_relevancy = rag_ensemble_hf_df['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness = rag_ensemble_hf_df['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision = rag_ensemble_hf_df['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall = rag_ensemble_hf_df['context_recall'].mean(skipna=True)


# Print the averages
print("=========================Few Shot=========================")
print("Non-RAG Average Answer Relevancy:", rag_few_shot_relevancy)
print("Non-RAG Average Answer Correctness:", rag_few_shot_answer_correctness)
print("Non-RAG Average Context Precision:", rag_few_shot_avg_precision)
print("Non-RAG Average Context Recall:", rag_few_shot_avg_recall)
print("=========================Ensemble Open AI=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_open_ai_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_ensemble_open_ai_avg_answer_correctness)
print("RAG Average Context Precision:", rag_ensemble_open_ai_avg_precision)
print("RAG Average Context Recall:", rag_ensemble_open_ai_avg_recall)
print("=========================Ensemble HF=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness)
print("RAG Average Context Precision:", rag_ensemble_hf_avg_precision)
print("RAG Average Context Recall:", rag_ensemble_hf_avg_recall)

Non-RAG Average Answer Relevancy: 0.452734835146927
Non-RAG Average Answer Correctness: 0.6440070072030675
Non-RAG Average Context Precision: 0.89999999991
Non-RAG Average Context Recall: 0.475
RAG Average Answer Relevancy: 0.468133850976659
RAG Average Answer Correctness: 0.4664450979906115
RAG Average Context Precision: 0.69999999993
RAG Average Context Recall: 0.4
RAG Average Answer Relevancy: 0.3743571687636189
RAG Average Answer Correctness: 0.5205250062605699
RAG Average Context Precision: 0.69999999993
RAG Average Context Recall: 0.5


In [36]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/ensemble_open_ai_scores.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_open_ai_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/ensemble_hf_scores.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_hf_df.to_excel(excel_file_path)

In [37]:
# G-eval
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv
import json
load_dotenv(override=True)

# Dataframes
import pandas as pd
import openpyxl



In [38]:
correctness_metric = GEval(
    name="Relevance",
    #criteria="Determine whether the actual output matches the expected output as close as possible.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the main content of the responses generated in 'actual output' are similar to the responses in the 'expected output'",
        """As long as one of the main content of the responses generated is similar to any of the expected output, the test case is considered correct.
        For example, if response 1 content is on a pet dog and it matches response 3 content of also a pet dog, give it a high score. 
        The order of the responses is not important.""",
        "Evaluate mainly based on main content but do still give a higher score depending on similarity of responses."
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
)

In [39]:
# Ensemble_open_ai scores
ensemble_open_ai_scores = []
ensemble_open_ai_reasons = []


for i in range(len(rag_ensemble_open_ai['question'])):
    test_case = LLMTestCase(
        input=rag_ensemble_open_ai['question'][i],
        actual_output=rag_ensemble_open_ai['answer'][i],
        expected_output=rag_ensemble_open_ai['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_open_ai_scores.append(correctness_metric.score)
    ensemble_open_ai_reasons.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [40]:
# Ensemble_open_hf scores
ensemble_open_hf_scores = []
ensemble_open_hf_reasons = []


for i in range(len(rag_ensemble_hf['question'])):
    test_case = LLMTestCase(
        input=rag_ensemble_hf['question'][i],
        actual_output=rag_ensemble_hf['answer'][i],
        expected_output=rag_ensemble_hf['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_open_hf_scores.append(correctness_metric.score)
    ensemble_open_hf_reasons.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [47]:
# Load the scores and reasons from the specified Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/g_eval_rag_prompt_engineered_scores_v3.xlsx'

# Read the Excel file into a DataFrame
g_eval_df = pd.read_excel(excel_file_path)

rag_few_shot_scores = g_eval_df['Scores'].tolist()
rag_few_shot_reasons = g_eval_df['Reasons'].tolist()

In [52]:
# Combine scores and reasons into a DataFrame
ensemble_open_ai_df = pd.DataFrame({'Scores': ensemble_open_ai_scores, 'Reasons': ensemble_open_ai_reasons})
ensemble_hf_df = pd.DataFrame({'Scores': ensemble_open_hf_scores, 'Reasons': ensemble_open_hf_reasons})

# Calculate the average scores for each DataFrame
rag_few_shot = sum(rag_few_shot_scores) / len(rag_few_shot_scores)
ensemble_open_ai = ensemble_open_ai_df['Scores'].mean()
ensemble_hf = ensemble_hf_df['Scores'].mean()


# Print the average scores
print("Average Score for Few Shot:", rag_few_shot)
print("Average Score for Ensemble Open Ai:", ensemble_open_ai)
print("Average Score for Ensemble HF:", ensemble_hf)

Average Score for Few Shot: 0.5545081113816229
Average Score for Ensemble Open Ai: 0.602516814829001
Average Score for Ensemble HF: 0.6355191874880344


In [54]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/g_eval_ensemble_open_ai_v1.xlsx'

# Store the DataFrame into an Excel file
ensemble_open_ai_df.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/g_eval_ensemble_hf_v2.xlsx'

# Store the DataFrame into an Excel file
ensemble_hf_df.to_excel(excel_file_path)

## Ensemble HF with Reranking

In [19]:
# Generation of responses
import openai

In [20]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [21]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v1 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v1", embeddings=embeddings, allow_dangerous_deserialization=True)

In [22]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v1.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.5, 0.5])

In [23]:
# Cohere reranking
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

compressor = CohereRerank(cohere_api_key=os.environ['COHERE_API_KEY'])
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever
)


In [24]:
query = "What have you been up to Roydon?"
compressed_docs = compression_retriever.get_relevant_documents(query)

contexts = ""
for context in compressed_docs:
    contexts += context.page_content

print(contexts)

{"Roydon": "I tried to, but everywhere I went, I just kept getting ripped off by the locals.", "Dory": "That must have been frustrating. Did you try any of the street food at least?"}{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn't they?"}{"Roydon": "Easy for you to say. You weren't the one stuck in a foreign country with nothing going right.", "Dory": "I know, but sometimes these things happen. You just have to try and make the best of it."}


In [25]:
# Generate for ensemble with reranking
for query in data_sample['question']:
    # Get contexts for query
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    docs_rel_top_3 = docs_rel[:3]

    contexts = ""
    for context in docs_rel_top_3:
        contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [26]:
# Specify the file path
file_path = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_rerank_v3.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

In [29]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision

import pandas as pd

In [41]:
# RAGAS 
file_path_ensemble_rerank_hf = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_rerank_v3.json'

with open(file_path_ensemble_rerank_hf, 'r') as json_file:
    rag_ensemble_rerank_hf = json.load(json_file)

rag_dataset_ensemble_hf_rerank = Dataset.from_dict(rag_ensemble_rerank_hf)

# Load dataset score from excel for RAGAS and g-eval, ensemble hf
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/RAG_Few_Shot_Combined_Eval_HF_Ensemble_v2.xlsx'

# Read the Excel file into a DataFrame
hf_df = pd.read_excel(excel_file_path)

hf_df

Unnamed: 0.1,Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall,G-Eval_Scores,Reasons
0,0,What have you been up to Roydon?,"Response 1: I tried to, but everywhere I went,...","['{""Roydon"": ""I tried to, but everywhere I wen...",Response 1: I've been watching Arsenal games h...,0.0,0.213683,0.0,0.0,0.211592,The main content of response 3 in the actual o...
1,1,Woah really how is Arsenal doing right now then?,Response 1: Arsenal is currently in a good pos...,"['{""Roydon"": ""I couldn\'t agree more! Aubameya...","Response 1: Arsenal is doing well, did you cat...",0.0,0.477226,1.0,0.666667,0.597107,Two out of the three responses in the actual o...
2,2,Nice what breed is your new pet dog?,Response 1: It's a Golden Retriever. \nRespons...,"['{""Roydon"": ""Guess what, I just got a new pet...","Response 1: He is a golden retriever, and he's...",0.0,0.530437,0.0,0.0,0.714982,The main content of all responses in the 'actu...
3,3,So what you planning to do with your pet dog?,Response 1: I'm planning to teach him some tri...,"['{""Roydon"": ""Absolutely! I\'m planning to tea...",Response 1: I'm planning to take him on long h...,0.0,0.536566,1.0,0.666667,0.50672,Actual output response 1 is similar to expecte...
4,4,How was your trip to thailand?,"Response 1: It was a total disaster, nothing w...","['{""Roydon"": ""I can\'t believe how terrible my...",Response 1: It was a horrible experience and I...,0.99587,0.910776,1.0,1.0,0.94147,All the main content of the actual responses m...
5,5,What happened in thailand?,Response 1: My flight got cancelled and my lug...,"['{""Roydon"": ""I can\'t believe what happened t...",Response 1: I got scammed by a taxi driver and...,0.91164,0.228831,1.0,0.0,0.613643,One of the main content in the actual output (...
6,6,What channel are you planning to create for yo...,Response 1: I'm thinking of creating a channel...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: I'm planning to create a special I...,0.0,0.55507,1.0,1.0,0.746633,Two of the main contents of the responses in t...
7,7,How was your trip to thailand and any new trav...,"Response 1: Thailand was a disaster, but Japan...","['{""Roydon"": ""I can\'t wait to immerse myself ...",Response 1: It was a horrible experience. I go...,0.0,0.57782,1.0,0.666667,0.546592,None of the main content in the responses gene...
8,8,I heard you got a new pet dog how is he? What ...,"Response 1: He's doing great, thanks for askin...","['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He is so fun to be with. Im planni...,0.897573,0.733547,1.0,1.0,0.861327,The main content of the responses in the 'actu...
9,9,Hows your new pet dog? What breed is he?,Response 1: He's doing great! My new pet dog i...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He brings so much joy to my life. ...,0.938489,0.441294,0.0,0.0,0.615126,The main content of the responses in the 'actu...


In [33]:
# RAGAS Evaluation
rag_dataset_ensemble_hf_reranking_score = evaluate(rag_dataset_ensemble_hf_rerank, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])

rag_ensemble_hf_rerank_df= rag_dataset_ensemble_hf_reranking_score.to_pandas()

Evaluating: 100%|██████████| 40/40 [00:22<00:00,  1.81it/s]


In [34]:
# Calculate average for rag_hf_rerank_df
rag_ensemble_hf_avg_answer_relevancy = hf_df['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness = hf_df['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision = hf_df['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall = hf_df['context_recall'].mean(skipna=True)


# Calculate average for rag_hf_rerank_df
rag_ensemble_hf_rerank_avg_answer_relevancy = rag_ensemble_hf_rerank_df['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_rerank_avg_answer_correctness = rag_ensemble_hf_rerank_df['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_rerank_avg_precision = rag_ensemble_hf_rerank_df['context_precision'].mean(skipna=True)
rag_ensemble_hf_rerank_avg_recall = rag_ensemble_hf_rerank_df['context_recall'].mean(skipna=True)




# Print the averages
print("=========================Ensemble HF=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness)
print("RAG Average Context Precision:", rag_ensemble_hf_avg_precision)
print("RAG Average Context Recall:", rag_ensemble_hf_avg_recall)


print("=========================Ensemble HF Rerank=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_rerank_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_ensemble_hf_rerank_avg_answer_correctness)
print("RAG Average Context Precision:", rag_ensemble_hf_rerank_avg_precision)
print("RAG Average Context Recall:", rag_ensemble_hf_rerank_avg_recall)

RAG Average Answer Relevancy: 0.3743571687636189
RAG Average Answer Correctness: 0.5205250062605699
RAG Average Context Precision: 0.69999999993
RAG Average Context Recall: 0.5
RAG Average Answer Relevancy: 0.284599889611891
RAG Average Answer Correctness: 0.5720345380788269
RAG Average Context Precision: 0.69999999993
RAG Average Context Recall: 0.4666666666666666


In [35]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/ensemble_hf_rerank_scores.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_hf_rerank_df.to_excel(excel_file_path)

In [36]:
# G-eval
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv
import json
load_dotenv(override=True)

# Dataframes
import pandas as pd
import openpyxl



In [37]:
correctness_metric = GEval(
    name="Relevance",
    #criteria="Determine whether the actual output matches the expected output as close as possible.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the main content of the responses generated in 'actual output' are similar to the responses in the 'expected output'",
        """As long as one of the main content of the responses generated is similar to any of the expected output, the test case is considered correct.
        For example, if response 1 content is on a pet dog and it matches response 3 content of also a pet dog, give it a high score. 
        The order of the responses is not important.""",
        "Evaluate mainly based on main content but do still give a higher score depending on similarity of responses."
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
)

In [38]:
# Ensemble_open_hf scores
ensemble_open_hf_rerank_scores = []
ensemble_open_hf_rerank_reasons = []


for i in range(len(rag_dataset_ensemble_hf_rerank['question'])):
    test_case = LLMTestCase(
        input=rag_dataset_ensemble_hf_rerank['question'][i],
        actual_output=rag_dataset_ensemble_hf_rerank['answer'][i],
        expected_output=rag_dataset_ensemble_hf_rerank['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_open_hf_rerank_scores.append(correctness_metric.score)
    ensemble_open_hf_rerank_reasons.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [44]:
# Combine scores and reasons into a DataFrame
ensemble_hf_rerank_df = pd.DataFrame({'Scores': ensemble_open_hf_rerank_scores, 'Reasons': ensemble_open_hf_rerank_reasons})

# Calculate the average scores for each DataFrame
ensemble_hf = hf_df['G-Eval_Scores'].mean()
ensemble_hf_rerank = ensemble_hf_rerank_df['Scores'].mean()

# Print the average scores
print("Average Score for Ensemble HF:", ensemble_hf)
print("Average Score for Ensemble HF Rerank:", ensemble_hf_rerank)

Average Score for Ensemble HF: 0.6355191874880344
Average Score for Ensemble HF Rerank: 0.5903976058086038


In [45]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/g_eval_ensemble_hf_rerank_v3.xlsx'

# Store the DataFrame into an Excel file
ensemble_hf_rerank_df.to_excel(excel_file_path)

# Trying out varying parameters for ensemble hf

In [7]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v1 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v1", embeddings=embeddings, allow_dangerous_deserialization=True)

## 0.6, 0.4

In [10]:
import openai

In [19]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [20]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v1.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.6, 0.4])

In [21]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    docs_rel_top_3 = docs_rel[:3]

    contexts = ""
    for context in docs_rel_top_3:
        contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [23]:
# Specify the file path
file_path = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v3_0.6_0.4.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## 0.4, 0.6

In [25]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [26]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v1.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.4, 0.6])

In [27]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    docs_rel_top_3 = docs_rel[:3]

    contexts = ""
    for context in docs_rel_top_3:
        contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [28]:
# Specify the file path
file_path = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v4_0.4_0.6.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Comparisons for all 3

- 0.5,0.5
- 0.4, 0.6
- 0.6, 0.4

In [17]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# RAGAS 
file_path_ensemble_hf_6_4 = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v3_0.6_0.4.json'
file_path_ensemble_hf_4_6 = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v4_0.4_0.6.json'

with open(file_path_ensemble_hf_6_4, 'r') as json_file:
    rag_ensemble_hf_6_4 = json.load(json_file)

with open(file_path_ensemble_hf_4_6, 'r') as json_file:
    rag_ensemble_hf_4_6 = json.load(json_file)

rag_dataset_ensemble_hf_6_4 = Dataset.from_dict(rag_ensemble_hf_6_4)
rag_dataset_ensemble_hf_4_6 = Dataset.from_dict(rag_ensemble_hf_4_6)

In [30]:
rag_dataset_ensemble_hf_6_4_score = evaluate(rag_dataset_ensemble_hf_6_4, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])
rag_dataset_ensemble_hf_4_6_score = evaluate(rag_dataset_ensemble_hf_4_6, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])

rag_ensemble_hf_df_6_4 = rag_dataset_ensemble_hf_6_4_score.to_pandas()
rag_ensemble_hf_df_4_6 = rag_dataset_ensemble_hf_4_6_score.to_pandas()

Evaluating: 100%|██████████| 40/40 [00:21<00:00,  1.90it/s]
Evaluating: 100%|██████████| 40/40 [00:21<00:00,  1.90it/s]


In [32]:
import pandas as pd
# Load 0.5 0.5 scores

excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/RAG_Few_Shot_Combined_Eval_HF_Ensemble_v2.xlsx'

# Read the Excel file into a DataFrame
hf_df = pd.read_excel(excel_file_path)

hf_df

Unnamed: 0.1,Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall,G-Eval_Scores,Reasons
0,0,What have you been up to Roydon?,"Response 1: I tried to, but everywhere I went,...","['{""Roydon"": ""I tried to, but everywhere I wen...",Response 1: I've been watching Arsenal games h...,0.0,0.213683,0.0,0.0,0.211592,The main content of response 3 in the actual o...
1,1,Woah really how is Arsenal doing right now then?,Response 1: Arsenal is currently in a good pos...,"['{""Roydon"": ""I couldn\'t agree more! Aubameya...","Response 1: Arsenal is doing well, did you cat...",0.0,0.477226,1.0,0.666667,0.597107,Two out of the three responses in the actual o...
2,2,Nice what breed is your new pet dog?,Response 1: It's a Golden Retriever. \nRespons...,"['{""Roydon"": ""Guess what, I just got a new pet...","Response 1: He is a golden retriever, and he's...",0.0,0.530437,0.0,0.0,0.714982,The main content of all responses in the 'actu...
3,3,So what you planning to do with your pet dog?,Response 1: I'm planning to teach him some tri...,"['{""Roydon"": ""Absolutely! I\'m planning to tea...",Response 1: I'm planning to take him on long h...,0.0,0.536566,1.0,0.666667,0.50672,Actual output response 1 is similar to expecte...
4,4,How was your trip to thailand?,"Response 1: It was a total disaster, nothing w...","['{""Roydon"": ""I can\'t believe how terrible my...",Response 1: It was a horrible experience and I...,0.99587,0.910776,1.0,1.0,0.94147,All the main content of the actual responses m...
5,5,What happened in thailand?,Response 1: My flight got cancelled and my lug...,"['{""Roydon"": ""I can\'t believe what happened t...",Response 1: I got scammed by a taxi driver and...,0.91164,0.228831,1.0,0.0,0.613643,One of the main content in the actual output (...
6,6,What channel are you planning to create for yo...,Response 1: I'm thinking of creating a channel...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: I'm planning to create a special I...,0.0,0.55507,1.0,1.0,0.746633,Two of the main contents of the responses in t...
7,7,How was your trip to thailand and any new trav...,"Response 1: Thailand was a disaster, but Japan...","['{""Roydon"": ""I can\'t wait to immerse myself ...",Response 1: It was a horrible experience. I go...,0.0,0.57782,1.0,0.666667,0.546592,None of the main content in the responses gene...
8,8,I heard you got a new pet dog how is he? What ...,"Response 1: He's doing great, thanks for askin...","['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He is so fun to be with. Im planni...,0.897573,0.733547,1.0,1.0,0.861327,The main content of the responses in the 'actu...
9,9,Hows your new pet dog? What breed is he?,Response 1: He's doing great! My new pet dog i...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He brings so much joy to my life. ...,0.938489,0.441294,0.0,0.0,0.615126,The main content of the responses in the 'actu...


In [33]:
import pandas as pd

# Calculate average for 0.5, 0.5
rag_ensemble_hf_avg_answer_relevancy_5_5 = hf_df['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness_5_5 = hf_df['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision_5_5 = hf_df['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall_5_5 = hf_df['context_recall'].mean(skipna=True)

# Calculate average for 0.6, 0.4
rag_ensemble_hf_avg_answer_relevancy_6_4 = rag_ensemble_hf_df_6_4['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness_6_4 = rag_ensemble_hf_df_6_4['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision_6_4 = rag_ensemble_hf_df_6_4['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall_6_4 = rag_ensemble_hf_df_6_4['context_recall'].mean(skipna=True)

# Calculate average for 0.6, 0.4
rag_ensemble_hf_avg_answer_relevancy_4_6 = rag_ensemble_hf_df_4_6['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness_4_6 = rag_ensemble_hf_df_4_6['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision_4_6 = rag_ensemble_hf_df_4_6['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall_4_6 = rag_ensemble_hf_df_4_6['context_recall'].mean(skipna=True)



# Print the averages
print("=========================Ensemble HF 0.5, 0.5=========================")
print("Non-RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy_5_5)
print("Non-RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness_5_5)
print("Non-RAG Average Context Precision:", rag_ensemble_hf_avg_precision_5_5)
print("Non-RAG Average Context Recall:", rag_ensemble_hf_avg_recall_5_5)
print("=========================Ensemble HF 0.6, 0.4=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy_6_4)
print("RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness_6_4)
print("RAG Average Context Precision:", rag_ensemble_hf_avg_precision_6_4)
print("RAG Average Context Recall:", rag_ensemble_hf_avg_recall_6_4)
print("=========================Ensemble HF 0.4, 0.6=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy_4_6)
print("RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness_4_6)
print("RAG Average Context Precision:", rag_ensemble_hf_avg_precision_4_6)
print("RAG Average Context Recall:", rag_ensemble_hf_avg_recall_4_6)

Non-RAG Average Answer Relevancy: 0.3743571687636189
Non-RAG Average Answer Correctness: 0.5205250062605699
Non-RAG Average Context Precision: 0.69999999993
Non-RAG Average Context Recall: 0.5
RAG Average Answer Relevancy: 0.36526090687001056
RAG Average Answer Correctness: 0.5677870677191184
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.6833333333333333
RAG Average Answer Relevancy: 0.27873358142029897
RAG Average Answer Correctness: 0.5779282570940589
RAG Average Context Precision: 0.49999999995
RAG Average Context Recall: 0.26666666666666666


In [34]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/ensemble_hf_6_4_scores.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_hf_df_6_4.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/ensemble_hf_4_6_scores.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_hf_df_4_6.to_excel(excel_file_path)

In [35]:
# G-eval
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv
import json
load_dotenv(override=True)

# Dataframes
import pandas as pd
import openpyxl



In [36]:
correctness_metric = GEval(
    name="Relevance",
    #criteria="Determine whether the actual output matches the expected output as close as possible.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the main content of the responses generated in 'actual output' are similar to the responses in the 'expected output'",
        """As long as one of the main content of the responses generated is similar to any of the expected output, the test case is considered correct.
        For example, if response 1 content is on a pet dog and it matches response 3 content of also a pet dog, give it a high score. 
        The order of the responses is not important.""",
        "Evaluate mainly based on main content but do still give a higher score depending on similarity of responses."
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
)

In [37]:
# Ensemble_open_hf scores
ensemble_hf_scores_6_4 = []
ensemble_hf_reasons_6_4 = []


for i in range(len(rag_dataset_ensemble_hf_6_4['question'])):
    test_case = LLMTestCase(
        input=rag_dataset_ensemble_hf_6_4['question'][i],
        actual_output=rag_dataset_ensemble_hf_6_4['answer'][i],
        expected_output=rag_dataset_ensemble_hf_6_4['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_hf_scores_6_4.append(correctness_metric.score)
    ensemble_hf_reasons_6_4.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [38]:
# Ensemble_open_hf scores
ensemble_hf_scores_4_6 = []
ensemble_hf_reasons_4_6 = []


for i in range(len(rag_dataset_ensemble_hf_4_6['question'])):
    test_case = LLMTestCase(
        input=rag_dataset_ensemble_hf_4_6['question'][i],
        actual_output=rag_dataset_ensemble_hf_4_6['answer'][i],
        expected_output=rag_dataset_ensemble_hf_4_6['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_hf_scores_4_6.append(correctness_metric.score)
    ensemble_hf_reasons_4_6.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [39]:
# Combine scores and reasons into a DataFrame
ensemble_hf_df_6_4 = pd.DataFrame({'Scores': ensemble_hf_scores_6_4, 'Reasons': ensemble_hf_reasons_6_4})
ensemble_hf_df_4_6 = pd.DataFrame({'Scores': ensemble_hf_scores_4_6, 'Reasons': ensemble_hf_reasons_4_6})

# Calculate the average scores for each DataFrame
ensemble_hf_5_5 = hf_df['G-Eval_Scores'].mean()
ensemble_hf_6_4 = ensemble_hf_df_6_4['Scores'].mean()
ensemble_hf_4_6 = ensemble_hf_df_4_6['Scores'].mean()


# Print the average scores
print("Average Score for Ensemble HF 0.5,0.5:", ensemble_hf_5_5)
print("Average Score for Ensemble HF 0.6,0.4:", ensemble_hf_6_4)
print("Average Score for Ensemble HF 0.4,0.6:", ensemble_hf_4_6)

Average Score for Ensemble HF 0.5,0.5: 0.6355191874880344
Average Score for Ensemble HF 0.6,0.4: 0.6276375029535538
Average Score for Ensemble HF 0.4,0.6: 0.5456360238792106


In [40]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/g_eval_ensemble_hf_6_4_v2.xlsx'

# Store the DataFrame into an Excel file
ensemble_hf_df_6_4.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/g_eval_ensemble_hf_4_6_v2.xlsx'

# Store the DataFrame into an Excel file
ensemble_hf_df_4_6.to_excel(excel_file_path)

## 0.7, 0.3

In [6]:
import openai

In [7]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [13]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v1 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v1", embeddings=embeddings, allow_dangerous_deserialization=True)

In [14]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v1.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.7, 0.3])

In [15]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    docs_rel_top_3 = docs_rel[:3]

    contexts = ""
    for context in docs_rel_top_3:
        contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

  warn_deprecated(


In [16]:
# Specify the file path
file_path = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v5_0.7_0.3.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## 0.8, 0.2

In [17]:
import openai

In [18]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [19]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v1 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v1", embeddings=embeddings, allow_dangerous_deserialization=True)

In [20]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v1.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.8, 0.2])

In [21]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    docs_rel_top_3 = docs_rel[:3]

    contexts = ""
    for context in docs_rel_top_3:
        contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [22]:
# Specify the file path
file_path = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v6_0.8_0.2.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

## Comparing with 0.6,0.4

- 0.6,0.4
- 0.7,0.3
- 0.8, 0.2

In [24]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision

In [26]:
# RAGAS 
file_path_ensemble_hf_7_3 = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v5_0.7_0.3.json'
file_path_ensemble_hf_8_2 = 'testing_json/Improve_RAG/rag_few_shot_ensemble_hf_v6_0.8_0.2.json'

with open(file_path_ensemble_hf_7_3, 'r') as json_file:
    rag_ensemble_hf_7_3 = json.load(json_file)

with open(file_path_ensemble_hf_8_2, 'r') as json_file:
    rag_ensemble_hf_8_2 = json.load(json_file)

rag_dataset_ensemble_hf_7_3 = Dataset.from_dict(rag_ensemble_hf_7_3)
rag_dataset_ensemble_hf_8_2 = Dataset.from_dict(rag_ensemble_hf_8_2)

In [28]:
rag_dataset_ensemble_hf_7_3_score = evaluate(rag_dataset_ensemble_hf_7_3, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])
rag_dataset_ensemble_hf_8_2_score = evaluate(rag_dataset_ensemble_hf_8_2, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])

rag_ensemble_hf_df_7_3 = rag_dataset_ensemble_hf_7_3_score.to_pandas()
rag_ensemble_hf_df_8_2 = rag_dataset_ensemble_hf_8_2_score.to_pandas()

In [29]:
import pandas as pd
# Load 0.6 0.4 scores

excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/ensemble_hf_6_4_scores.xlsx'

# Read the Excel file into a DataFrame
ensemble_hf_6_4 = pd.read_excel(excel_file_path)

ensemble_hf_6_4

Unnamed: 0.1,Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall
0,0,What have you been up to Roydon?,"Response 1: I tried to, but everywhere I went,...","['{""Roydon"": ""I tried to, but everywhere I wen...",Response 1: I've been watching Arsenal games h...,0.0,0.214858,0.0,0.333333
1,1,Woah really how is Arsenal doing right now then?,Response 1: Arsenal is currently showing great...,"['{""Roydon"": ""I couldn\'t agree more! Aubameya...","Response 1: Arsenal is doing well, did you cat...",0.844825,0.501332,1.0,0.666667
2,2,Nice what breed is your new pet dog?,"Response 1: It's a golden retriever, and he's ...","['{""Roydon"": ""Guess what, I just got a new pet...","Response 1: He is a golden retriever, and he's...",0.941946,0.427109,1.0,1.0
3,3,So what you planning to do with your pet dog?,Response 1: I'm planning to teach him how to f...,"['{""Roydon"": ""Absolutely! I\'m planning to tea...",Response 1: I'm planning to take him on long h...,0.0,0.732617,1.0,0.666667
4,4,How was your trip to thailand?,"Response 1: It was a total disaster, nothing w...","['{""Roydon"": ""I can\'t believe how terrible my...",Response 1: It was a horrible experience and I...,0.0,0.75929,1.0,1.0
5,5,What happened in thailand?,Response 1: It was a series of unfortunate eve...,"['{""Roydon"": ""I can\'t believe what happened t...",Response 1: I got scammed by a taxi driver and...,0.0,0.213351,1.0,0.333333
6,6,What channel are you planning to create for yo...,Response 1: I'm thinking of creating a channel...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: I'm planning to create a special I...,0.0,0.791707,1.0,1.0
7,7,How was your trip to thailand and any new trav...,"Response 1: Thailand was a disaster, but I'm l...","['{""Roydon"": ""I can\'t wait to immerse myself ...",Response 1: It was a horrible experience. I go...,0.0,0.418789,1.0,0.333333
8,8,I heard you got a new pet dog how is he? What ...,Response 1: He's doing great! I named him Sunn...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He is so fun to be with. Im planni...,0.927349,0.887572,1.0,1.0
9,9,Hows your new pet dog? What breed is he?,"Response 1: He's doing great, thanks for askin...","['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He brings so much joy to my life. ...,0.938489,0.731246,1.0,0.5


In [31]:
import pandas as pd

# Calculate average for 0.6, 0.4
rag_ensemble_hf_avg_answer_relevancy_6_4 = ensemble_hf_6_4['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness_6_4 = ensemble_hf_6_4['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision_6_4 = ensemble_hf_6_4['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall_6_4 = ensemble_hf_6_4['context_recall'].mean(skipna=True)

# Calculate average for 0.7, 0.3
rag_ensemble_hf_avg_answer_relevancy_7_3 = rag_ensemble_hf_df_7_3['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness_7_3 = rag_ensemble_hf_df_7_3['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision_7_3 = rag_ensemble_hf_df_7_3['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall_7_3 = rag_ensemble_hf_df_7_3['context_recall'].mean(skipna=True)

# Calculate average for 0.8, 0.2
rag_ensemble_hf_avg_answer_relevancy_8_2 = rag_ensemble_hf_df_8_2['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness_8_2 = rag_ensemble_hf_df_8_2['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision_8_2 = rag_ensemble_hf_df_8_2['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall_8_2 = rag_ensemble_hf_df_8_2['context_recall'].mean(skipna=True)



# Print the averages
print("=========================Ensemble HF 0.6, 0.4=========================")
print("Non-RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy_6_4)
print("Non-RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness_6_4)
print("Non-RAG Average Context Precision:", rag_ensemble_hf_avg_precision_6_4)
print("Non-RAG Average Context Recall:", rag_ensemble_hf_avg_recall_6_4)
print("=========================Ensemble HF 0.7, 0.3=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy_7_3)
print("RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness_7_3)
print("RAG Average Context Precision:", rag_ensemble_hf_avg_precision_7_3)
print("RAG Average Context Recall:", rag_ensemble_hf_avg_recall_7_3)
print("=========================Ensemble HF 0.8, 0.2=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy_8_2)
print("RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness_8_2)
print("RAG Average Context Precision:", rag_ensemble_hf_avg_precision_8_2)
print("RAG Average Context Recall:", rag_ensemble_hf_avg_recall_8_2)

Non-RAG Average Answer Relevancy: 0.36526090687001056
Non-RAG Average Answer Correctness: 0.5677870677191184
Non-RAG Average Context Precision: 0.89999999991
Non-RAG Average Context Recall: 0.6833333333333333
RAG Average Answer Relevancy: 0.6425841014958626
RAG Average Answer Correctness: 0.5492757708033762
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.7166666666666666
RAG Average Answer Relevancy: 0.36909307640225575
RAG Average Answer Correctness: 0.5602780987652676
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.6833333333333333


In [32]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/ensemble_hf_7_3_scores.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_hf_df_7_3.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/ensemble_hf_8_2_scores.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_hf_df_8_2.to_excel(excel_file_path)

In [33]:
# G-eval
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv
import json
load_dotenv(override=True)

# Dataframes
import pandas as pd
import openpyxl



In [34]:
correctness_metric = GEval(
    name="Relevance",
    #criteria="Determine whether the actual output matches the expected output as close as possible.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the main content of the responses generated in 'actual output' are similar to the responses in the 'expected output'",
        """As long as one of the main content of the responses generated is similar to any of the expected output, the test case is considered correct.
        For example, if response 1 content is on a pet dog and it matches response 3 content of also a pet dog, give it a high score. 
        The order of the responses is not important.""",
        "Evaluate mainly based on main content but do still give a higher score depending on similarity of responses."
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
)

In [35]:
# Ensemble_open_hf scores
ensemble_hf_scores_7_3 = []
ensemble_hf_reasons_7_3 = []


for i in range(len(rag_dataset_ensemble_hf_7_3['question'])):
    test_case = LLMTestCase(
        input=rag_dataset_ensemble_hf_7_3['question'][i],
        actual_output=rag_dataset_ensemble_hf_7_3['answer'][i],
        expected_output=rag_dataset_ensemble_hf_7_3['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_hf_scores_7_3.append(correctness_metric.score)
    ensemble_hf_reasons_7_3.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [36]:
# Ensemble_open_hf scores
ensemble_hf_scores_8_2 = []
ensemble_hf_reasons_8_2 = []


for i in range(len(rag_dataset_ensemble_hf_8_2['question'])):
    test_case = LLMTestCase(
        input=rag_dataset_ensemble_hf_8_2['question'][i],
        actual_output=rag_dataset_ensemble_hf_8_2['answer'][i],
        expected_output=rag_dataset_ensemble_hf_8_2['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_hf_scores_8_2.append(correctness_metric.score)
    ensemble_hf_reasons_8_2.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [40]:
import pandas as pd
# Load 0.6 0.4 scores

excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/g_eval_ensemble_hf_6_4_v2.xlsx'

# Read the Excel file into a DataFrame
ensemble_hf_6_4 = pd.read_excel(excel_file_path)

ensemble_hf_6_4

Unnamed: 0.1,Unnamed: 0,Scores,Reasons
0,0,0.257821,Response 3 in the actual output is similar to ...
1,1,0.491448,None of the main content in the actual output ...
2,2,0.91503,The main content of the responses in 'actual o...
3,3,0.575091,Some main content of the responses generated a...
4,4,0.760092,The main content of the responses in the actua...
5,5,0.309339,"One of the main content in the actual output, ..."
6,6,0.645623,The main content of the responses generated in...
7,7,0.620191,The main content of the responses in the actua...
8,8,0.944703,The main content of the responses in the actua...
9,9,0.757036,The main content of the responses in the 'actu...


In [41]:
# Combine scores and reasons into a DataFrame
ensemble_hf_df_7_3 = pd.DataFrame({'Scores': ensemble_hf_scores_7_3, 'Reasons': ensemble_hf_reasons_7_3})
ensemble_hf_df_8_2 = pd.DataFrame({'Scores': ensemble_hf_scores_8_2, 'Reasons': ensemble_hf_reasons_8_2})

# Calculate the average scores for each DataFrame
ensemble_hf_6_4 = ensemble_hf_6_4['Scores'].mean()
ensemble_hf_7_3 = ensemble_hf_df_7_3['Scores'].mean()
ensemble_hf_8_2 = ensemble_hf_df_8_2['Scores'].mean()


# Print the average scores
print("Average Score for Ensemble HF 0.6,0.4:", ensemble_hf_6_4)
print("Average Score for Ensemble HF 0.7,0.3:", ensemble_hf_7_3)
print("Average Score for Ensemble HF 0.8,0.2:", ensemble_hf_8_2)

Average Score for Ensemble HF 0.6,0.4: 0.6276375029535538
Average Score for Ensemble HF 0.7,0.3: 0.6556917153416697
Average Score for Ensemble HF 0.8,0.2: 0.610807453001268


In [42]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/g_eval_ensemble_hf_7_3_v3.xlsx'

# Store the DataFrame into an Excel file
ensemble_hf_df_7_3.to_excel(excel_file_path)

# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/g_eval_ensemble_hf_8_2_v4.xlsx'

# Store the DataFrame into an Excel file
ensemble_hf_df_8_2.to_excel(excel_file_path)

# Meta filtering and Query altering

### Prepare data

In [6]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

import openai

In [7]:
# Load in environment variables
load_dotenv(override=True)

llm = AzureChatOpenAI(
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_key=os.environ['AZURE_OPENAI_APIKEY'],
    deployment_name=os.environ['AZURE_OPENAI_DEPLOYMENT_NAME'],
    model_name=os.environ['AZURE_OPENAI_MODEL_NAME'],
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
    temperature=0
)

In [52]:
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

61


In [7]:
print(documents[0])

page_content='{"Roydon": "Hey there! Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's always exciting to see how your team will perform. Optimistic as always, I see!"}' metadata={'label': 'Response 1', 'source': 'football.json'}


In [53]:
# Instruction prompt
persona = """You would be assisting in identifying topics from a snippet of conversation"""
task = """I would supply the conversation directly. Interpret the main topic of the conversation and return the main topic. Do not give multiple topics such as football/soccer. Only give one main topic."""
example = """For example if the conversation is: 
            {"Roydon": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's always exciting to see how your team will perform."}

           football

            Example 2:
            {"Roydon": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Roydon": "Hey there! How are you doing?", "John": "Hey Roydon! I'm doing great, how about you?"}

            general"""


In [43]:
# Construct message object
instruction = f"{persona} {task} {example}"
messages = [SystemMessage(content=instruction)]

doc = documents[50]

query = doc.page_content

print(f"Human : {query}\n")
usermsg = HumanMessage(content=query)
messages.append(usermsg)

response = llm.invoke(messages)

print(response.content)

# with open('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\pet2.json', 'a') as f:
#     json.dump(responses, f, indent=4)
#     f.write("\n") 

Human : {"Roydon": "Yes, I plan to create a special Instagram account just for him! I want to share our adventures and cute moments with everyone.", "Xavier": "That's a brilliant idea! I'm sure your golden retriever will become an Instagram star in no time. Do you have a name picked out for him yet?"}

pets


In [54]:
for doc in documents:
    instruction = f"{persona} {task} {example}"
    messages = [SystemMessage(content=instruction)]

    query = doc.page_content

    usermsg = HumanMessage(content=query)
    messages.append(usermsg)

    response = llm.invoke(messages)

    doc.metadata['topic'] = response.content


In [68]:
import json

# Convert the documents list to a JSON serializable format
documents_json = [
    {
        "metadata": doc.metadata,
        "page_content": doc.page_content
    }
    for doc in documents
]

# Specify the file path for the JSON file
json_file_path = 'testing_json/documents.json'

# Save the documents list into a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(documents_json, json_file) 

In [69]:
position = []
count = 0

for doc in documents:
    if(len(doc.metadata['topic'].split()) > 1):
        print(doc.metadata['topic'])
        position.append(count)
    count += 1

In [56]:
position

[30, 48, 59]

In [61]:
count = 0

for doc in documents:
    if(len(doc.metadata['topic']) == 0):
        print(count)
    count += 1

In [60]:
documents[59].metadata['topic'] = 'general'

In [33]:
for doc in documents:
    print(doc.metadata['topic'])

football
football
football
football
football
football
football
football
determination
sports
football
sports
football
football
football
football
travel
travel
travel
travel
travel
travel
travel
travel
travel
travel
travel
travel
crime
problem
finances
travel
emotions
ethics
experience
travel
general
general
pets
pets
pets
pets
pets
pets
pets
pets
pets
pets
pets
pets
pets
pets
travel
travel
travel
travel
travel
travel
travel
general
support


In [24]:
position

[30, 48, 59]

In [70]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

In [71]:
# Vector store
faiss_vectorstore_hugging_face_v3 = FAISS.from_documents(documents, embeddings)

faiss_vectorstore_hugging_face_v3

<langchain_community.vectorstores.faiss.FAISS at 0x22044968710>

In [72]:
## Saving Vector Store
faiss_vectorstore_hugging_face_v3.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v3")

## Meta Filtering Search (V1)

In [86]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v2 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v2", embeddings=embeddings, allow_dangerous_deserialization=True)

In [30]:
meta_content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Roydon": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's always exciting to see how your team will perform."}

            football

            Example 2:
            {"Roydon": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Roydon": "Hey there! How are you doing?", "John": "Hey Roydon! I'm doing great, how about you?"}

            general 
"""

In [31]:
# Generate for rag

# Learning instructions
instruction = {
    "role": "system",
    "content": meta_content,
}

doc = documents[2]

print("Query is: " + doc.page_content)

# Initialize messages
messages = []

# Add learn instruction to message array
messages.append(instruction)

user_message = {
        "role": "user",
        "content": doc.page_content
}

messages.append(user_message)

openai.api_type = 'openai'
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.organisation= os.environ["OPEN_AI_ORG"]

raw_response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages = messages,
)
response_choices = raw_response.choices[0].message.content

print(response_choices)

Query is: {"Roydon": "Haha, definitely! Looking forward to some intense matches between our teams. Who do you think will be Arsenal's key player this season?", "John": "I have a feeling Aubameyang will continue to shine for Arsenal. His goals are always a game-changer!"}
football


In [39]:
documents[7]

Document(metadata={'label': 'Response 1', 'source': 'football2.json', 'topic': 'The main topic of the conversation is football.'}, page_content='{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn\'t they?"}')

In [44]:
# Meta filtering retriever to reduce number of documents first
new_docs = []

new_docs = loaded_faiss_vs_hf_v2.similarity_search(
    "",
    k=10,
    filter={"topic": response_choices},
)

print("Length is: ", len(new_docs))
print(new_docs)

Length is:  1
[Document(metadata={'label': 'Response 5', 'source': 'football.json', 'topic': 'football'}, page_content='{"Roydon": "I\'m optimistic about the new signings, especially Ben White. I think he\'ll strengthen our defense. How about Manchester United\'s signings?", "John": "Varane and Sancho are exciting additions to our squad. Hoping they make a big impact this season. Do you think Arsenal\'s manager is the right fit for the team?"}')]


In [50]:
retriever_vectordb = loaded_faiss_vs_hf_v2.as_retriever(search_kwargs={"k": 3})

In [62]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v2.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(new_docs)
keyword_retriever.k =  3
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.7, 0.3])

In [64]:
docs_rel=ensemble_retriever.get_relevant_documents("How is your new pet dog?")
docs_rel_top_3 = docs_rel[:3]
print(docs_rel_top_3)

[Document(metadata={'label': 'Response 1', 'source': 'pet.json', 'topic': 'pets'}, page_content='{"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That\'s awesome! What breed is it?"}'), Document(metadata={'label': 'Response 6', 'source': 'pet.json', 'topic': 'pets'}, page_content='{"Roydon": "I couldn\'t agree more, I feel like my new dog has completed my little family.", "Jacob": "It\'s amazing how pets have a way of making a house feel like a home, enjoy every moment with your furry friend!"}'), Document(metadata={'label': 'Response 1', 'source': 'pet2.json', 'topic': 'pets'}, page_content='{"Roydon": "I can\'t wait to take my new golden retriever dog on long hikes in the mountains!", "Xavier": "That sounds like a great idea! Golden retrievers love the outdoors. Have you thought about teaching him any tricks?"}')]


## Meta Filtering Search (V2)

In [8]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v2 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v2", embeddings=embeddings, allow_dangerous_deserialization=True)

In [9]:
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

61


In [10]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v2.as_retriever(search_kwargs={"k": 6})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  6
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.7, 0.3])

In [11]:
query = "How is your new pet dog?"

try:
    docs_rel = ensemble_retriever.invoke(query)
    print("Retrieved documents:", docs_rel)
except KeyError as e:
    print(f"KeyError: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

# docs_rel_top_3 = docs_rel
print("Number of relevant documents:", len(docs_rel))

Retrieved documents: [Document(metadata={'label': 'Response 1', 'source': 'pet.json', 'topic': 'pets'}, page_content='{"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That\'s awesome! What breed is it?"}'), Document(metadata={'label': 'Response 5', 'source': 'pet2.json', 'topic': 'pets'}, page_content='{"Roydon": "I\'m thinking about organizing playdates with other dogs in the neighborhood. It will be a great way for him to socialize and make new friends.", "Xavier": "That\'s a fantastic idea! Socializing is important for dogs, and it will also be a great opportunity for you to meet other dog owners. Have you thought about documenting your adventures with your golden retriever?"}'), Document(metadata={'label': 'Response 6', 'source': 'pet.json', 'topic': 'pets'}, page_content='{"Roydon": "I couldn\'t agree more, I feel like my new dog has completed my little family.", "Jacob": "It\'s amazing how pets have a way of making a house feel like a home, enjoy every moment with yo

In [91]:
query = "How is your new pet dog?"
docs_rel=ensemble_retriever.get_relevant_documents(query)
docs_rel_top_3 = docs_rel
len(docs_rel)

10

In [92]:
meta_content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Roydon": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's always exciting to see how your team will perform."}

            football

            Example 2:
            {"Roydon": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Roydon": "Hey there! How are you doing?", "John": "Hey Roydon! I'm doing great, how about you?"}

            general 
"""

In [95]:
# Generate for rag

# Learning instructions
instruction = {
    "role": "system",
    "content": meta_content,
}

print("Query is: " + query)

# Initialize messages
messages = []

# Add learn instruction to message array
messages.append(instruction)

user_message = {
        "role": "user",
        "content": query
}

messages.append(user_message)

openai.api_type = 'openai'
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.organisation= os.environ["OPEN_AI_ORG"]

raw_response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages = messages,
)
topic = raw_response.choices[0].message.content

print(topic)

Query is: How is your new pet dog?
pets


In [97]:
# Filter according to the topic
filtered_docs = []

for doc in docs_rel:
    if(doc.metadata['topic'] == topic):
        filtered_docs.append(doc)

print(len(filtered_docs))

6


In [100]:
for doc in docs_rel:
    print(doc)

page_content='{"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That's awesome! What breed is it?"}' metadata={'label': 'Response 1', 'source': 'pet.json', 'topic': 'pets'}
page_content='{"Roydon": "I'm thinking about organizing playdates with other dogs in the neighborhood. It will be a great way for him to socialize and make new friends.", "Xavier": "That's a fantastic idea! Socializing is important for dogs, and it will also be a great opportunity for you to meet other dog owners. Have you thought about documenting your adventures with your golden retriever?"}' metadata={'label': 'Response 5', 'source': 'pet2.json', 'topic': 'pets'}
page_content='{"Roydon": "I couldn't agree more, I feel like my new dog has completed my little family.", "Jacob": "It's amazing how pets have a way of making a house feel like a home, enjoy every moment with your furry friend!"}' metadata={'label': 'Response 6', 'source': 'pet.json', 'topic': 'pets'}
page_content='{"Roydon": "I can't wait to

In [99]:
for docs in filtered_docs:
    print(docs)

page_content='{"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That's awesome! What breed is it?"}' metadata={'label': 'Response 1', 'source': 'pet.json', 'topic': 'pets'}
page_content='{"Roydon": "I'm thinking about organizing playdates with other dogs in the neighborhood. It will be a great way for him to socialize and make new friends.", "Xavier": "That's a fantastic idea! Socializing is important for dogs, and it will also be a great opportunity for you to meet other dog owners. Have you thought about documenting your adventures with your golden retriever?"}' metadata={'label': 'Response 5', 'source': 'pet2.json', 'topic': 'pets'}
page_content='{"Roydon": "I couldn't agree more, I feel like my new dog has completed my little family.", "Jacob": "It's amazing how pets have a way of making a house feel like a home, enjoy every moment with your furry friend!"}' metadata={'label': 'Response 6', 'source': 'pet.json', 'topic': 'pets'}
page_content='{"Roydon": "I can't wait to

## Evaluation

#### Generate mock data

In [12]:
import openai

In [13]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand and any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [14]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v3 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v3", embeddings=embeddings, allow_dangerous_deserialization=True)

In [40]:
# documents = []
# for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\'):
#     if filename.endswith(".json"):
#         with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\{filename}') as f:
#             data = json.load(f)
#             for response_label, conversation in data.items():
#                 doc_content = json.dumps(conversation)
#                 doc_metadata = {"label": response_label, "source": filename}
#                 documents.append(Document(page_content=doc_content, metadata=doc_metadata))

# print(len(documents))

61


In [16]:
# Specify the file path for the JSON file
json_file_path = 'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\documents.json'

with open(json_file_path, 'r') as json_file:
    documents_json = json.load(json_file)

# Convert the JSON serializable format back to Document objects
documents = [
    Document(page_content=doc['page_content'], metadata=doc['metadata'])
    for doc in documents_json
]

print(f"Loaded {len(documents)} documents.")

Loaded 61 documents.


In [17]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v3.as_retriever(search_kwargs={"k": 6})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  6
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.7, 0.3])

In [18]:
# Initiate meta content
meta_content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. 
Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Roydon": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's 
            always exciting to see how your team will perform."}

            football

            Example 2:
            {"Roydon": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Roydon": "Hey there! How are you doing?", "John": "Hey Roydon! I'm doing great, how about you?"}

            general 
"""

In [19]:
def getTopic(meta_content, query):
    # Learning instructions
    instruction = {
        "role": "system",
        "content": meta_content,
    }

    #print("Query is: " + query)

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]

    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    topic = raw_response.choices[0].message.content

    return topic

In [19]:
import inflect

def singularize_and_lower(topic):
    # Create an inflect engine for handling plurals
    engine = inflect.engine()
    
    # Lowercase the topic
    topic = topic.lower()
    
    # Singularize the topic (convert plurals to singular), returns false if not noun
    topic = engine.singular_noun(topic) if engine.singular_noun(topic) else topic
    
    return topic

print(singularize_and_lower("multiple topics"))


multiple topic


In [1]:
# Instead of inflect use lemmatization for noun
import nltk
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet



In [32]:
def is_valid_word(word):
    """Check if a word is valid by looking it up in WordNet."""
    return bool(wordnet.synsets(word))

In [33]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [44]:
def remove_ing(topic):
    if topic.endswith('ing'):
        return topic[:-3]
    return topic

In [45]:
def singularize_and_lower_lemmatize(topic):
    # Initialize the WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Lowercase the topic
    topic = topic.lower()

    # Lemmatize the word with correct POS tag
    pos = get_wordnet_pos(topic)
    topic_lemmataized = lemmatizer.lemmatize(topic, pos)

    # Extra check for words like crotcheting
    topic = remove_ing(topic_lemmataized)
    processed_words = [is_valid_word(word) for word in topic.split()]
    if not all(processed_words):
        return topic_lemmataized
        
    return topic

In [46]:
print("lematization results: ", singularize_and_lower_lemmatize("crotcheting"))
print("inflect results: ", singularize_and_lower("crotcheting"))

lematization results:  crotchet
inflect results:  crotcheting


In [47]:
print("lematization results: ", singularize_and_lower_lemmatize("giant animals"))
print("inflect results: ", singularize_and_lower("giant animals"))

lematization results:  giant animals
inflect results:  giant animal


In [48]:
print("lematization results: ", singularize_and_lower_lemmatize("flying"))
print("inflect results: ", singularize_and_lower("flying"))

lematization results:  fly
inflect results:  flying


In [50]:
print("lematization results: ", singularize_and_lower_lemmatize("crotchets"))
print("inflect results: ", singularize_and_lower("crotchets"))

lematization results:  crotchet
inflect results:  crotchet


In [21]:
def filter_list(docs_rel, topic):
    # Filter according to the topic
    filtered_docs = []
    final_docs = []
    general_topic = {}

    if singularize_and_lower(topic) == "general":
        for doc in docs_rel:
            if singularize_and_lower(doc.metadata['topic']) not in general_topic:
                general_topic[singularize_and_lower(doc.metadata['topic'])] = 1
                filtered_docs.append(doc)
            else:
                continue
    else:
        for doc in docs_rel:
            if(singularize_and_lower(doc.metadata['topic']) == singularize_and_lower(topic)):
                filtered_docs.append(doc)
       
    if len(filtered_docs) > 2:
        final_docs = filtered_docs[:3]
        return final_docs
    else:
        count = 3 - len(filtered_docs)
        final_docs = filtered_docs
        position = 0
        for i in range(count):
            if(position == len(docs_rel)):
                break
            if(docs_rel[position] in filtered_docs):
                i = i-1
                position += 1
                continue
            else:
                final_docs.append(docs_rel[position])# need to change so that it wont be same obtained
                position+=1 
        
        return final_docs

In [17]:
query = "What have you been up to Roydon?"
docs_rel=ensemble_retriever.get_relevant_documents(query)
topic_interpreted = getTopic(meta_content, query)
for i in (docs_rel):
    print(i)

print("Topic is: ", topic_interpreted)

  warn_deprecated(


Query is: What have you been up to Roydon?
page_content='{"Roydon": "I tried to, but everywhere I went, I just kept getting ripped off by the locals.", "Dory": "That must have been frustrating. Did you try any of the street food at least?"}' metadata={'label': 'Response 3', 'source': 'travel.json', 'topic': 'travel'}
page_content='{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn't they?"}' metadata={'label': 'Response 1', 'source': 'football2.json', 'topic': 'football'}
page_content='{"Roydon": "Easy for you to say. You weren't the one stuck in a foreign country with nothing going right.", "Dory": "I know, but sometimes these things happen. You just have to try and make the best of it."}' metadata={'label': 'Response 7', 'source': 'travel.json', 'topic': 'travel'}
page_content='{"Roydon": "I can't believe what happened to me in Thailand.", "Xavier": "What happened?"}' met

In [22]:
# Testing the filter_list function
query = "What have you been up to Roydon?"
docs_rel=ensemble_retriever.get_relevant_documents(query)
test_docs = []
test_docs.append(docs_rel[0])
test_docs.append(docs_rel[1])

final_docs = filter_list(test_docs, "general")

print(len(final_docs))
print(final_docs)

  warn_deprecated(


2
[Document(metadata={'label': 'Response 3', 'source': 'travel.json', 'topic': 'travel'}, page_content='{"Roydon": "I tried to, but everywhere I went, I just kept getting ripped off by the locals.", "Dory": "That must have been frustrating. Did you try any of the street food at least?"}'), Document(metadata={'label': 'Response 1', 'source': 'football2.json', 'topic': 'football'}, page_content='{"Roydon": "Hey there! Did you catch the Arsenal game last night? What a thrilling match!", "John": "Hey Roydon! Yes, I watched it. Arsenal played really well, didn\'t they?"}')]


In [23]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    
    topic_interpreted = getTopic(meta_content, query)
    
    final_docs = filter_list(docs_rel, topic_interpreted) # Still top 3

    contexts = ""
    for context in final_docs:
        contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [24]:
# Specify the file path (v1 is before the changed filtered list)
file_path = 'testing_json/Improve_RAG/meta_filter_v2.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

#### RAGAS And G-Eval Score

In [25]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# RAGAS 
file_path_meta_filter = 'testing_json/Improve_RAG/meta_filter_v2.json'

with open(file_path_meta_filter, 'r') as json_file:
    rag_ensemble_meta_filter = json.load(json_file)

rag_dataset_meta_filter = Dataset.from_dict(rag_ensemble_meta_filter)

In [27]:
rag_dataset_ensemble_meta_filter_score = evaluate(rag_dataset_meta_filter, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])

rag_ensemble_meta_filter_df = rag_dataset_ensemble_meta_filter_score.to_pandas()

Evaluating: 100%|██████████| 40/40 [00:19<00:00,  2.02it/s]


In [43]:
import pandas as pd
# Load 0.7 0.3 scores

excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/ensemble_hf_7_3_scores.xlsx'

# Read the Excel file into a DataFrame
ensemble_hf_7_3 = pd.read_excel(excel_file_path)

ensemble_hf_7_3

Unnamed: 0.1,Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall
0,0,What have you been up to Roydon?,"Response 1: I tried to, but everywhere I went,...","['{""Roydon"": ""I tried to, but everywhere I wen...",Response 1: I've been watching Arsenal games h...,0.0,0.214522,0.0,0.333333
1,1,Woah really how is Arsenal doing right now then?,Response 1: Arsenal is currently showing great...,"['{""Roydon"": ""I couldn\'t agree more! Aubameya...","Response 1: Arsenal is doing well, did you cat...",0.902912,0.980656,1.0,0.666667
2,2,Nice what breed is your new pet dog?,"Response 1: It's a golden retriever, and he's ...","['{""Roydon"": ""Guess what, I just got a new pet...","Response 1: He is a golden retriever, and he's...",0.956775,0.612637,1.0,1.0
3,3,So what you planning to do with your pet dog?,Response 1: I'm planning to teach him how to f...,"['{""Roydon"": ""Absolutely! I\'m planning to tea...",Response 1: I'm planning to take him on long h...,0.0,0.7682,1.0,0.666667
4,4,How was your trip to thailand?,"Response 1: It didn't go as planned, but I'm l...","['{""Roydon"": ""I can\'t believe how terrible my...",Response 1: It was a horrible experience and I...,0.0,0.656944,1.0,1.0
5,5,What happened in thailand?,Response 1: My hotel reservation got messed up...,"['{""Roydon"": ""I can\'t believe what happened t...",Response 1: I got scammed by a taxi driver and...,0.91164,0.231538,1.0,0.333333
6,6,What channel are you planning to create for yo...,Response 1: I'm thinking of creating an Instag...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: I'm planning to create a special I...,0.886203,0.220141,1.0,1.0
7,7,How was your trip to thailand and any new trav...,Response 1: Thailand was a rollercoaster of mi...,"['{""Roydon"": ""I can\'t wait to immerse myself ...",Response 1: It was a horrible experience. I go...,0.922883,0.36911,1.0,0.666667
8,8,I heard you got a new pet dog how is he? What ...,Response 1: He's doing great! I named him Sunn...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He is so fun to be with. Im planni...,0.911895,0.981487,1.0,1.0
9,9,Hows your new pet dog? What breed is he?,"Response 1: He's a golden retriever, and he's ...","['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He brings so much joy to my life. ...,0.933533,0.457522,1.0,0.5


In [49]:
import pandas as pd
# Calculate average for 0.7, 0.3
rag_ensemble_hf_avg_answer_relevancy_7_3 = ensemble_hf_7_3['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness_7_3 = ensemble_hf_7_3['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision_7_3 = ensemble_hf_7_3['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall_7_3 = ensemble_hf_7_3['context_recall'].mean(skipna=True)

# Calculate average for meta_filtered
rag_ensemble_meta_filter_avg_answer_relevancy = rag_ensemble_meta_filter_df['answer_relevancy'].mean(skipna=True)
rag_ensemble_meta_filter_avg_answer_correctness = rag_ensemble_meta_filter_df['answer_correctness'].mean(skipna=True)
rag_ensemble_meta_filter_avg_precision = rag_ensemble_meta_filter_df['context_precision'].mean(skipna=True)
rag_ensemble_meta_filter_avg_recall = rag_ensemble_meta_filter_df['context_recall'].mean(skipna=True)



# Print the averages
print("=========================Ensemble HF 0.7, 0.3=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy_7_3)
print("RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness_7_3)
print("RAG Average Context Precision:", rag_ensemble_hf_avg_precision_7_3)
print("RAG Average Context Recall:", rag_ensemble_hf_avg_recall_7_3)
print("=========================Ensemble Meta Filtered=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_meta_filter_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_ensemble_meta_filter_avg_answer_correctness)
print("RAG Average Context Precision:", rag_ensemble_meta_filter_avg_precision)
print("RAG Average Context Recall:", rag_ensemble_meta_filter_avg_recall)

RAG Average Answer Relevancy: 0.6425841014958626
RAG Average Answer Correctness: 0.5492757708033762
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.7166666666666666
RAG Average Answer Relevancy: 0.46346294017261014
RAG Average Answer Correctness: 0.48074996223308925
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.7333333333333333


In [57]:
import pandas as pd
# Version 2 (With filtered function)

# Calculate average for 0.7, 0.3
rag_ensemble_hf_avg_answer_relevancy_7_3 = ensemble_hf_7_3['answer_relevancy'].mean(skipna=True)
rag_ensemble_hf_avg_answer_correctness_7_3 = ensemble_hf_7_3['answer_correctness'].mean(skipna=True)
rag_ensemble_hf_avg_precision_7_3 = ensemble_hf_7_3['context_precision'].mean(skipna=True)
rag_ensemble_hf_avg_recall_7_3 = ensemble_hf_7_3['context_recall'].mean(skipna=True)

# Calculate average for meta_filtered
rag_ensemble_meta_filter_avg_answer_relevancy = combined['answer_relevancy'].mean(skipna=True)
rag_ensemble_meta_filter_avg_answer_correctness = combined['answer_correctness'].mean(skipna=True)
rag_ensemble_meta_filter_avg_precision = combined['context_precision'].mean(skipna=True)
rag_ensemble_meta_filter_avg_recall = combined['context_recall'].mean(skipna=True)



# Print the averages
print("=========================Ensemble HF 0.7, 0.3=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_hf_avg_answer_relevancy_7_3)
print("RAG Average Answer Correctness:", rag_ensemble_hf_avg_answer_correctness_7_3)
print("RAG Average Context Precision:", rag_ensemble_hf_avg_precision_7_3)
print("RAG Average Context Recall:", rag_ensemble_hf_avg_recall_7_3)
print("=========================Ensemble Meta Filtered=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_meta_filter_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_ensemble_meta_filter_avg_answer_correctness)
print("RAG Average Context Precision:", rag_ensemble_meta_filter_avg_precision)
print("RAG Average Context Recall:", rag_ensemble_meta_filter_avg_recall)

RAG Average Answer Relevancy: 0.6425841014958626
RAG Average Answer Correctness: 0.5492757708033762
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.7166666666666666
RAG Average Answer Relevancy: 0.6723204037179018
RAG Average Answer Correctness: 0.568161100684651
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.7666666666666666


In [30]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/ensemble_meta_filter_scores_v3.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_meta_filter_df.to_excel(excel_file_path)

In [31]:
# G-eval
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv
import json
load_dotenv(override=True)

# Dataframes
import pandas as pd
import openpyxl



In [32]:
correctness_metric = GEval(
    name="Relevance",
    #criteria="Determine whether the actual output matches the expected output as close as possible.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the main content of the responses generated in 'actual output' are similar to the responses in the 'expected output'",
        """As long as one of the main content of the responses generated is similar to any of the expected output, the test case is considered correct.
        For example, if response 1 content is on a pet dog and it matches response 3 content of also a pet dog, give it a high score. 
        The order of the responses is not important.""",
        "Evaluate mainly based on main content but do still give a higher score depending on similarity of responses."
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
)

In [33]:
# Ensemble_open_hf scores
ensemble_meta_filter_scores = []
ensemble_meta_filter_reasons = []


for i in range(len(rag_dataset_meta_filter['question'])):
    test_case = LLMTestCase(
        input=rag_dataset_meta_filter['question'][i],
        actual_output=rag_dataset_meta_filter['answer'][i],
        expected_output=rag_dataset_meta_filter['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_meta_filter_scores.append(correctness_metric.score)
    ensemble_meta_filter_reasons.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [38]:
import pandas as pd
# Load 0.6 0.4 scores

excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/g_eval_ensemble_hf_7_3_v3.xlsx' # Change to 7,3

# Read the Excel file into a DataFrame
ensemble_hf_7_3 = pd.read_excel(excel_file_path)

ensemble_hf_7_3

Unnamed: 0.1,Unnamed: 0,Scores,Reasons
0,0,0.280397,Response 1 in Actual Output does not match any...
1,1,0.535548,The main content of the responses in the 'Actu...
2,2,0.799122,The main content of all responses in 'actual o...
3,3,0.586623,Main content of responses in 'actual output' a...
4,4,0.7421,Two out of three responses in the actual outpu...
5,5,0.253476,The main content of the responses in the actua...
6,6,0.624999,One of the main content of the responses (crea...
7,7,0.585437,Two responses in the actual output match close...
8,8,0.87062,The main content of the responses in the 'actu...
9,9,0.829752,The main content of the responses in the 'actu...


In [95]:
# Combine scores and reasons into a DataFrame
ensemble_hf_df_meta_filter = pd.DataFrame({'Scores': ensemble_meta_filter_scores, 'Reasons': ensemble_meta_filter_reasons})

# Calculate the average scores for each DataFrame
ensemble_hf_7_3 = ensemble_hf_7_3['Scores'].mean()
ensemble_hf_meta_filter = ensemble_hf_df_meta_filter['Scores'].mean()


# Print the average scores
print("Average Score for Ensemble HF 0.7,0.3:", ensemble_hf_7_3)
print("Average Score for Ensemble HF Meta Filtered:", ensemble_hf_meta_filter)

Average Score for Ensemble HF 0.7,0.3: 0.6556917153416697
Average Score for Ensemble HF Meta Filtered: 0.6632734878019907


In [37]:
# VERSION 2 (With the new filter_list function)
# Combine scores and reasons into a DataFrame
ensemble_hf_df_meta_filter = pd.DataFrame({'Scores': ensemble_meta_filter_scores, 'Reasons': ensemble_meta_filter_reasons})

# Calculate the average scores for each DataFrame
ensemble_hf_7_3 = ensemble_hf_7_3['Scores'].mean()
ensemble_hf_meta_filter = ensemble_hf_df_meta_filter['Scores'].mean()


# Print the average scores
print("Average Score for Ensemble HF 0.7,0.3:", ensemble_hf_7_3)
print("Average Score for Ensemble HF Meta Filtered:", ensemble_hf_meta_filter)

Average Score for Ensemble HF 0.7,0.3: 0.6556917153416697
Average Score for Ensemble HF Meta Filtered: 0.6873390817852749


In [39]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/g_eval_ensemble_meta_filtered_v2.xlsx'

# Store the DataFrame into an Excel file
ensemble_hf_df_meta_filter.to_excel(excel_file_path)

## Integration

In [20]:
content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Roydon": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's always exciting to see how your team will perform."}

            Topic is football

            Example 2:
            {"Roydon": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            Topic is travel.

            Example 3: If no main topic can be determined such as a greeting
            {"Roydon": "Hey there! How are you doing?", "John": "Hey Roydon! I'm doing great, how about you?"}

            Topic is general. 
"""

In [21]:
# Generate for rag

# Learning instructions
instruction = {
    "role": "system",
    "content": content,
}

doc = documents[2]

print("Query is: " + doc.page_content)

# Initialize messages
messages = []

# Add learn instruction to message array
messages.append(instruction)

user_message = {
        "role": "user",
        "content": doc.page_content
}

messages.append(user_message)

openai.api_type = 'openai'
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.organisation= os.environ["OPEN_AI_ORG"]

raw_response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages = messages,
)
response_choices = raw_response.choices[0].message.content

print(response_choices)

Query is: {"Roydon": "Haha, definitely! Looking forward to some intense matches between our teams. Who do you think will be Arsenal's key player this season?", "John": "I have a feeling Aubameyang will continue to shine for Arsenal. His goals are always a game-changer!"}
Topic is football


## Query Altering

In [32]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB ? eta -:--:--
                                              0.0/1.5 MB 131.3 kB/s eta 0:00:12
                                              0.0/1.5 MB 131.3 kB/s eta 0:00:12
                                              0.0/1.5 MB 131.3 kB/s eta 0:00:12
                                              0.0/1.5 MB 131.3 kB/s eta 0:00:12
     -                                       


[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer

# Ensure you have the necessary NLTK models downloaded
#nltk.download('punkt')

In [2]:
def process_query(query):
    # Initialize the Punkt tokenizer
    tokenizer = PunktSentenceTokenizer()
    
    # Tokenize the text into sentences
    sentences = tokenizer.tokenize(query)

    # Check if the query is a simple one (contains only one sentence)
    if len(sentences) == 1:
        return sentences  # Return the single sentence wrapped in a list

    # Combine into segments
    combined = ''
    segments = []

    # Flag to check if last sentence was a question
    last_was_question = False

    # Loop through each sentence and decide whether to start a new segment
    for sentence in sentences:
        # If last sentence was a question and current isn't directly a question,
        # start a new segment
        if last_was_question and sentence.strip().endswith('?'):
            segments.append(combined.strip())
            combined = sentence + ' '
            last_was_question = False
        else:
            combined += sentence + ' '

        # Check if current sentence ends with a question mark
        if sentence.strip().endswith('?'):
            last_was_question = True

    # Append the last segment if there's any remaining text
    if combined:
        segments.append(combined.strip())

    return segments

In [3]:
query1 = "How are you Roydon? I heard you have a new pet dog. How have you been?"
query2 = "What's the weather like today?"

# Process the queries
result1 = process_query(query1)
result2 = process_query(query2)

print("Query 1 processed:", result1)
print("Query 2 processed:", result2)


Query 1 processed: ['How are you Roydon? I heard you have a new pet dog.', 'How have you been?']
Query 2 processed: ["What's the weather like today?"]


In [6]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v3 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v3", embeddings=embeddings, allow_dangerous_deserialization=True)

In [7]:
# Specify the file path for the JSON file
json_file_path = 'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\documents.json'

with open(json_file_path, 'r') as json_file:
    documents_json = json.load(json_file)

# Convert the JSON serializable format back to Document objects
documents = [
    Document(page_content=doc['page_content'], metadata=doc['metadata'])
    for doc in documents_json
]

print(f"Loaded {len(documents)} documents.")

Loaded 61 documents.


In [8]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v3.as_retriever(search_kwargs={"k": 6})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  6
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.7, 0.3])

In [88]:
query = "I heard you got a new pet dog how is he? What are you going to name him?"
contexts = ""
query_split = process_query(query)
for i in query_split:
    # Obtain top 3 filtered docs
    docs_rel=ensemble_retriever.get_relevant_documents(query)
    topic_interpreted = getTopic(meta_content, query)
    final_docs = filter_list(docs_rel, topic_interpreted) # Still top 3
    for context in final_docs:
        contexts += context.page_content

print(contexts)

{"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That's awesome! What breed is it?"}{"Roydon": "I couldn't agree more, I feel like my new dog has completed my little family.", "Jacob": "It's amazing how pets have a way of making a house feel like a home, enjoy every moment with your furry friend!"}{"Roydon": "I'm thinking of naming him Sunny, because he brings so much joy and sunshine into my life. What do you think?", "Xavier": "Sunny is a perfect name for a golden retriever! It suits his happy and cheerful personality. I can't wait to see all the fun adventures you two will have together."}{"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That's awesome! What breed is it?"}{"Roydon": "I couldn't agree more, I feel like my new dog has completed my little family.", "Jacob": "It's amazing how pets have a way of making a house feel like a home, enjoy every moment with your furry friend!"}{"Roydon": "I'm thinking of naming him Sunny, because he brings so much joy 

#### Generate mock data

In [9]:
import openai

In [10]:
data_sample = {
    'question': [
        'What have you been up to Roydon?',
        'Woah really how is Arsenal doing right now then?',
        'Nice what breed is your new pet dog?',
        'So what you planning to do with your pet dog?',
        'How was your trip to thailand?',
        'What happened in thailand?',
        'What channel are you planning to create for your new pet dog?',
        #------------ Dual questions
        'How was your trip to thailand? Any new travel plans next year?',
        'I heard you got a new pet dog how is he? What are you going to name him?',
        'Hows your new pet dog? What breed is he?'
        #------------ Complicated questions
        
    ],
    'answer': [],
    'contexts': [],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?",
        "Response 1: Arsenal is doing well, did you catch the match yesterday? Response 2: Arsenal is doing great and Aubameyang is a true asset to the team. Response 3: Arsenal is doing alright since Ben White is a great addition to the team.",
        "Response 1: He is a golden retriever, and he's the cutest thing ever! Response 2: He is a golden retriever, and he's the cutest thing ever! Response 3: He is a golden retriever, and he's the cutest thing ever!",
        "Response 1: I'm planning to take him on long hikes on the mountain. Response 2: I'm planning to take him to the beach and watch him splash in the waves. Response 3: I'm planning for play dates with other dogs.",
        "Response 1: It was a horrible experience and I would never go back. Response 2: It was a horrible experience and I would never go back. Response 3: It was a horrible experience and I would never go back.",
        "Response 1: I got scammed by a taxi driver and lost all my money. Response 2: The hotel lost my reservation and I had to sleep on the streets. Response 3: I kept getting ripped off by the locals and it was such a horrible experience.",
        "Response 1: I'm planning to create a special Instagram account just for him to share our adventures. Response 2: I'm planning to create a special Instagram account just for him to share our adventures. Response 3: I'm planning to create a special Instagram account just for him to share our adventures.",
        "Response 1: It was a horrible experience. I got scammed by a taxi driver and lost all my money. Response 2: It was a horrible experience. The hotel lost my reservation and I had to sleep on the streets. Response 3: It was a horrible experience. I kept getting ripped off by the locals.",
        "Response 1: He is so fun to be with. Im planning to name him Sunny. Response 2: He is so fun to be with. Im planning to name him Sunny. Response 3: He is so fun to be with. Im planning to name him Sunny.",
        "Response 1: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 2: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever! Response 3: He brings so much joy to my life. He is a golden retriever, and he's the cutest thing ever!",
    ]
}

In [11]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

## Load Vector Store
loaded_faiss_vs_hf_v3 = FAISS.load_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\\vector_store\\vectorstores\\hugging_face\\faiss_vs_hf_v3", embeddings=embeddings, allow_dangerous_deserialization=True)

In [12]:
# Specify the file path for the JSON file
json_file_path = 'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\documents.json'

with open(json_file_path, 'r') as json_file:
    documents_json = json.load(json_file)

# Convert the JSON serializable format back to Document objects
documents = [
    Document(page_content=doc['page_content'], metadata=doc['metadata'])
    for doc in documents_json
]

print(f"Loaded {len(documents)} documents.")

Loaded 61 documents.


In [13]:
# Initiate retriever
retriever_vectordb = loaded_faiss_vs_hf_v3.as_retriever(search_kwargs={"k": 6})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k =  6
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.7, 0.3])

In [14]:
# Initiate meta content
meta_content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. 
Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Roydon": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's 
            always exciting to see how your team will perform."}

            football

            Example 2:
            {"Roydon": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Roydon": "Hey there! How are you doing?", "John": "Hey Roydon! I'm doing great, how about you?"}

            general 
"""

In [15]:
def getTopic(meta_content, query):
    # Learning instructions
    instruction = {
        "role": "system",
        "content": meta_content,
    }

    #print("Query is: " + query)

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]

    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    topic = raw_response.choices[0].message.content

    return topic

In [16]:
import inflect

def singularize_and_lower(topic):
    # Create an inflect engine for handling plurals
    engine = inflect.engine()
    
    # Lowercase the topic
    topic = topic.lower()
    
    # Singularize the topic (convert plurals to singular), returns false if not noun
    topic = engine.singular_noun(topic) if engine.singular_noun(topic) else topic
    
    return topic

print(singularize_and_lower("pets"))


pet


In [17]:
def filter_list(docs_rel, topic):
    # Filter according to the topic
    filtered_docs = []
    final_docs = []
    general_topic = {}

    if singularize_and_lower(topic) == "general":
        for doc in docs_rel:
            if singularize_and_lower(doc.metadata['topic']) not in general_topic:
                general_topic[singularize_and_lower(doc.metadata['topic'])] = 1
                filtered_docs.append(doc)
            else:
                continue
    else:
        for doc in docs_rel:
            if(singularize_and_lower(doc.metadata['topic']) == singularize_and_lower(topic)):
                filtered_docs.append(doc)
       
    if len(filtered_docs) > 2:
        final_docs = filtered_docs[:3]
        return final_docs
    else:
        count = 3 - len(filtered_docs)
        final_docs = filtered_docs
        position = 0
        for i in range(count):
            if(position == len(docs_rel)):
                break
            if(docs_rel[position] in filtered_docs):
                i = i-1
                position += 1
                continue
            else:
                final_docs.append(docs_rel[position])# need to change so that it wont be same obtained
                position+=1 
        
        return final_docs

In [20]:
# Generate for rag
for query in data_sample['question']:
    # Get contexts for query
    contexts = ""
    query_split = process_query(query)
    for i in query_split:
        # Obtain top 3 filtered docs
        docs_rel=ensemble_retriever.get_relevant_documents(i)
        topic_interpreted = getTopic(meta_content, i)
        final_docs = filter_list(docs_rel, topic_interpreted) # Still top 3
        for context in final_docs:
            contexts += context.page_content

    data_sample['contexts'].append([contexts])

    content = f"""You are an assistant whom will faciliate the conversation between a mute and a normal person. The mute persons name is Roydon and the normal person is indicated as other person.
                        You should be generating 3 responses which the mute person could choose from and the responses generated should follow the context of the conversation. 
                        The responses should be what a person would say and should not include actions in a third person view. Your persona would be from the perspective of the mute person.

                        Snippets of conversation would be given below in the section of Context. Use the conversations to assist in the generation the 3 responses. Primarily the topic should be inferred from the question asked but if no topic can be inferred, infer the topics from the conversations given in the context. The conversations are seperated by "{{" and "}}":\n
                        Context: {contexts}

                        For example, if the context above contains "{{"Roydon": "Recently my new pet dog has been so fun!", "Jacob": "That\'s awesome! What breed is it?"}}"

                        If the user asks "What have you been up to?"

                        An example of the 3 generated response would be in the format of 1 single string "Response 1: I have been playing with my new pet dog. Response 2: Nothing much, I recently brought my new pet dog to a park. Response 3: Its been tiring lately after getting a new pet dog. """
    # Learning instructions
    instruction = {
        "role": "system",
        "content": content,
    }

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": "Other person says: " + query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]
    
    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    response_choices = raw_response.choices[0].message.content
    data_sample['answer'].append(response_choices)

In [21]:
# Specify the file path (v1 is before the changed filtered list)
file_path = 'testing_json/Improve_RAG/meta_filter_v3.json'

# Save the data_sample dictionary into a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data_sample, json_file)

#### RAGAS And G-Eval Score

In [22]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# RAGAS 
file_path_meta_filter = 'testing_json/Improve_RAG/meta_filter_v3.json'

with open(file_path_meta_filter, 'r') as json_file:
    rag_ensemble_meta_filter_query_altered = json.load(json_file)

rag_dataset_query_altered_meta_filter = Dataset.from_dict(rag_ensemble_meta_filter_query_altered)

In [29]:
rag_dataset_ensemble_meta_filter_query_altered_score = evaluate(rag_dataset_query_altered_meta_filter, metrics=[answer_relevancy, answer_correctness,context_precision, context_recall])

rag_ensemble_meta_filter_query_altered_df = rag_dataset_ensemble_meta_filter_query_altered_score.to_pandas()

Evaluating: 100%|██████████| 40/40 [00:35<00:00,  1.11it/s]


In [26]:
import pandas as pd
# Load 0.7 0.3 scores

excel_file_path = 'scorings\RAG_Prompt_Engineered\RAG Improvement\Ensemble HF\Combined_Ensemble_meta_filtered.xlsx'

# Read the Excel file into a DataFrame
meta_filtered = pd.read_excel(excel_file_path)

meta_filtered

Unnamed: 0.1,Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,answer_correctness,context_precision,context_recall,Scores,Reasons
0,0,What have you been up to Roydon?,"Response 1: I tried to, but everywhere I went,...","['{""Roydon"": ""I tried to, but everywhere I wen...",Response 1: I've been watching Arsenal games h...,0.874612,0.213545,0.0,0.333333,0.389333,Some parts of the actual output do not match a...
1,1,Woah really how is Arsenal doing right now then?,Response 1: Arsenal is currently showing great...,"['{""Roydon"": ""I couldn\'t agree more! Aubameya...","Response 1: Arsenal is doing well, did you cat...",0.754243,0.603599,1.0,0.666667,0.610755,Two out of the three responses in the actual o...
2,2,Nice what breed is your new pet dog?,"Response 1: It's a golden retriever, and he's ...","['{""Roydon"": ""Guess what, I just got a new pet...","Response 1: He is a golden retriever, and he's...",0.941946,0.738309,1.0,1.0,0.87903,The main content of the responses generated cl...
3,3,So what you planning to do with your pet dog?,Response 1: I'm planning to teach him how to f...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: I'm planning to take him on long h...,0.544969,0.608215,1.0,0.666667,0.558312,The main content of some responses in the actu...
4,4,How was your trip to thailand?,"Response 1: Thailand was a disaster, everythin...","['{""Roydon"": ""I can\'t believe how terrible my...",Response 1: It was a horrible experience and I...,0.99587,0.803058,1.0,1.0,0.873509,The main content of all actual responses match...
5,5,What happened in thailand?,Response 1: It was a series of unfortunate eve...,"['{""Roydon"": ""I can\'t believe what happened t...",Response 1: I got scammed by a taxi driver and...,0.0,0.215579,1.0,0.333333,0.589833,Only one of the main content of the responses ...
6,6,What channel are you planning to create for yo...,Response 1: I'm thinking of creating a channel...,"['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: I'm planning to create a special I...,0.0,0.558308,1.0,1.0,0.784435,The main content of the responses in the 'actu...
7,7,How was your trip to thailand? Any new travel ...,"Response 1: Thailand was a disaster, but I'm e...","['{""Roydon"": ""I can\'t wait to immerse myself ...",Response 1: It was a horrible experience. I go...,0.789362,0.723084,1.0,0.666667,0.591981,The main content of the responses in 'actual o...
8,8,I heard you got a new pet dog how is he? What ...,"Response 1: He's been great so far, very playf...","['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He is so fun to be with. Im planni...,0.888668,0.98132,1.0,1.0,0.85174,The main content of the responses in the 'actu...
9,9,Hows your new pet dog? What breed is he?,"Response 1: He is a golden retriever, and he's...","['{""Roydon"": ""Guess what, I just got a new pet...",Response 1: He brings so much joy to my life. ...,0.933533,0.236593,1.0,1.0,0.744463,The main content of the responses generated is...


In [46]:
import pandas as pd
# Load 0.7 0.3 scores

excel_file_path = 'scorings\RAG_Prompt_Engineered\RAG Improvement\Ensemble HF\Combined_Ensemble_meta_filtered_Query_Altered_v2.xlsx'

# Read the Excel file into a DataFrame
rag_ensemble_meta_filter_query_altered_df = pd.read_excel(excel_file_path)

In [47]:
import pandas as pd
# Calculate average for meta_filtered
rag_ensemble_meta_filter_avg_answer_relevancy = meta_filtered['answer_relevancy'].mean(skipna=True)
rag_ensemble_meta_filter_avg_answer_correctness = meta_filtered['answer_correctness'].mean(skipna=True)
rag_ensemble_meta_filter_avg_precision = meta_filtered['context_precision'].mean(skipna=True)
rag_ensemble_meta_filter_avg_recall = meta_filtered['context_recall'].mean(skipna=True)

# Calculate average for query altering
rag_ensemble_query_alter_avg_answer_relevancy = rag_ensemble_meta_filter_query_altered_df['answer_relevancy'].mean(skipna=True)
rag_ensemble_query_alter_avg_answer_correctness = rag_ensemble_meta_filter_query_altered_df['answer_correctness'].mean(skipna=True)
rag_ensemble_query_alter_avg_precision = rag_ensemble_meta_filter_query_altered_df['context_precision'].mean(skipna=True)
rag_ensemble_query_alter_avg_recall = rag_ensemble_meta_filter_query_altered_df['context_recall'].mean(skipna=True)

# Print the averages
print("=========================Ensemble Meta Filtered=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_meta_filter_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_ensemble_meta_filter_avg_answer_correctness)
print("RAG Average Context Precision:", rag_ensemble_meta_filter_avg_precision)
print("RAG Average Context Recall:", rag_ensemble_meta_filter_avg_recall)
print("=========================Ensemble Meta Filtered Query Altered=========================")
print("RAG Average Answer Relevancy:", rag_ensemble_query_alter_avg_answer_relevancy)
print("RAG Average Answer Correctness:", rag_ensemble_query_alter_avg_answer_correctness)
print("RAG Average Context Precision:", rag_ensemble_query_alter_avg_precision)
print("RAG Average Context Recall:", rag_ensemble_query_alter_avg_recall)

RAG Average Answer Relevancy: 0.6723204037179018
RAG Average Answer Correctness: 0.568161100684651
RAG Average Context Precision: 0.89999999991
RAG Average Context Recall: 0.7666666666666666
RAG Average Answer Relevancy: 0.6798901851761995
RAG Average Answer Correctness: 0.6569532556501141
RAG Average Context Precision: 0.9
RAG Average Context Recall: 0.7666666666666666


In [41]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/ensemble_meta_filter_scores_v3.xlsx'

# Store the DataFrame into an Excel file
rag_ensemble_meta_filter_query_altered_df.to_excel(excel_file_path)

In [34]:
# G-eval
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv
import json
load_dotenv(override=True)

# Dataframes
import pandas as pd
import openpyxl



In [35]:
correctness_metric = GEval(
    name="Relevance",
    #criteria="Determine whether the actual output matches the expected output as close as possible.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the main content of the responses generated in 'actual output' are similar to the responses in the 'expected output'",
        """As long as one of the main content of the responses generated is similar to any of the expected output, the test case is considered correct.
        For example, if response 1 content is on a pet dog and it matches response 3 content of also a pet dog, give it a high score. 
        The order of the responses is not important.""",
        "Evaluate mainly based on main content but do still give a higher score depending on similarity of responses."
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model="gpt-3.5-turbo",
)

In [36]:
# Ensemble_open_hf scores
ensemble_meta_filter_query_altered_scores = []
ensemble_meta_filter_query_altered_reasons = []


for i in range(len(rag_dataset_query_altered_meta_filter['question'])):
    test_case = LLMTestCase(
        input=rag_dataset_query_altered_meta_filter['question'][i],
        actual_output=rag_dataset_query_altered_meta_filter['answer'][i],
        expected_output=rag_dataset_query_altered_meta_filter['ground_truth'][i]
    )

    correctness_metric.measure(test_case)
    # print(correctness_metric.score)
    # print(correctness_metric.reason)
    ensemble_meta_filter_query_altered_scores.append(correctness_metric.score)
    ensemble_meta_filter_query_altered_reasons.append(correctness_metric.reason)

# print(ensemble_open_ai_scores)
# print(ensemble_open_ai_reasons)

In [49]:
# Combine scores and reasons into a DataFrame
ensemble_hf_df_meta_filter_query_altered = pd.DataFrame({'Scores': ensemble_meta_filter_query_altered_scores, 'Reasons': ensemble_meta_filter_query_altered_reasons})

# Calculate the average scores for each DataFrame
ensemble_hf_meta_filter = meta_filtered['Scores'].mean()
ensemble_hf_meta_filter_query_altered = ensemble_hf_df_meta_filter_query_altered['Scores'].mean()


# Print the average scores
print("Average Score for Ensemble HF Meta Filtered:", ensemble_hf_meta_filter)
print("Average Score for Ensemble HF Meta Filtered Query Altered:", ensemble_hf_meta_filter_query_altered)

Average Score for Ensemble HF Meta Filtered: 0.6873390817852749
Average Score for Ensemble HF Meta Filtered Query Altered: 0.7141140981676445


In [40]:
# Specify the file path for the Excel file
excel_file_path = 'scorings/RAG_Prompt_Engineered/RAG Improvement/Ensemble HF/g_eval_ensemble_meta_filtered_v3.xlsx'

# Store the DataFrame into an Excel file
ensemble_hf_df_meta_filter_query_altered.to_excel(excel_file_path)