In [2]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

# Loading Actual Data

In [3]:
import json

# Path to the prepared FAQ documents (a JSON array of records).
file_path = "/workspaces/Rag_Project_Pod/Data_prep/final_data.json"

# JSON is UTF-8 by specification; pass the encoding explicitly so the load
# does not depend on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as json_file:
    documents = json.load(json_file)

# Show the first record to check the expected keys.
documents[0]


{'Category': 'General Information',
 'Question': 'What is syndicated research?',
 'Answer': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'doc_id': '3e72e1c8'}

# Loading Ground Truth Data

In [4]:
import pandas as pd

# Ground-truth questions; the `Document` column holds the doc_id each question
# was generated from.
df_ground_truth = pd.read_csv(
    '/workspaces/Rag_Project_Pod/Data_prep/ground_truth_data.csv'
)
df_ground_truth.head()

Unnamed: 0,Question,Category,Document
0,Can you explain what syndicated research entails?,General Information,3e72e1c8
1,What type of data is included in syndicated re...,General Information,3e72e1c8
2,Who compiles the findings for syndicated resea...,General Information,3e72e1c8
3,In what industries is syndicated research comm...,General Information,3e72e1c8
4,How can syndicated research benefit multiple c...,General Information,3e72e1c8


In [5]:
# Convert the ground-truth frame into a list of per-row dicts and peek at the first one.
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth[0]

{'Question': 'Can you explain what syndicated research entails?',
 'Category': 'General Information',
 'Document': '3e72e1c8'}

In [6]:
# Lookup table from doc_id to the full document record (later duplicates win).
doc_idx = dict((entry['doc_id'], entry) for entry in documents)

In [7]:
# Sanity check: fetch the stored answer for the doc_id of the first document shown above.
doc_idx['3e72e1c8']['Answer']

'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.'

# Index data


In [8]:
from sentence_transformers import SentenceTransformer


# Embedding model used both when indexing documents and when encoding queries;
# the index mapping below declares 384-dimensional vectors for its output.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [9]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

index_name = "insights-questions"

# Text fields are stored for keyword search; the three embedding fields share
# one identical dense_vector mapping, so it is generated rather than repeated.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "Answer": {"type": "text"},
            "Category": {"type": "text"},
            "Question": {"type": "text"},
            "doc_id": {"type": "keyword"},
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

# Drop any previous index of the same name first so the cell can be re-run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'insights-questions'})

In [10]:
# Attach three embeddings to every document in place: the question alone, the
# answer alone, and the question followed by the answer.
for doc in tqdm(documents):
    question, text = doc['Question'], doc['Answer']
    qt = question + ' ' + text

    doc.update(
        question_vector=model.encode(question),
        text_vector=model.encode(text),
        question_text_vector=model.encode(qt),
    )

  0%|          | 0/260 [00:00<?, ?it/s]

In [11]:
# Send every document (now carrying the embedding fields added in the previous
# cell) to the index, one request per document.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/260 [00:00<?, ?it/s]

# Retrieval using Hybrid Search & RRF

In [12]:
def compute_rrf(rank, k=60):
    """Return the reciprocal-rank-fusion contribution of a single ranked hit.

    Args:
        rank: Position of the hit in its result list (callers pass 1 for the top hit).
        k: Smoothing constant added to the rank; larger values flatten the
            difference between top and lower ranks.

    Returns:
        The score ``1 / (k + rank)``.
    """
    denominator = k + rank
    return 1 / denominator

In [13]:
def elastic_search_hybrid_rrf(field, query, vector, k=60, top_n=5):
    """Run a kNN search and a keyword search and fuse them with reciprocal rank fusion.

    Two independent searches (10 hits each) are issued against ``index_name``.
    Every hit contributes ``compute_rrf(rank, k)`` to its document's score, with
    ranks starting at 1; documents found by both searches get the sum.

    Args:
        field: Name of the dense_vector field to run the kNN search on.
        query: Query text for the keyword (multi_match) search.
        vector: Query embedding for the kNN search.
        k: RRF smoothing constant forwarded to ``compute_rrf``.
        top_n: Number of fused results to return (defaults to 5).

    Returns:
        A list with the ``_source`` of the ``top_n`` best-scoring documents,
        ordered by descending fused score.
    """
    # KNN Query
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5
    }

    # Keyword Query
    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["Question", "Answer", "Category"],
                    "type": "best_fields",
                    "boost": 0.5
                }
            }
        }
    }

    # KNN Search
    knn_results = es_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10
        }
    )['hits']['hits']

    # Keyword Search
    keyword_results = es_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10
        }
    )['hits']['hits']

    # Accumulate RRF scores across both result lists. The hits already carry
    # each document's `_source`, so keep it here instead of issuing one extra
    # GET request per returned document afterwards.
    rrf_scores = {}
    sources = {}
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + compute_rrf(rank, k)
            sources.setdefault(doc_id, hit['_source'])

    # Sort by fused score, best first; sorted() is stable, so ties keep the
    # order in which documents were first seen (kNN hits before keyword hits).
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    return [sources[doc_id] for doc_id, _score in reranked_docs[:top_n]]


In [14]:
def question_text_hybrid_rrf(q):
    """Hybrid-search the index for a query record using the combined question+answer vectors.

    Args:
        q: Mapping with a 'Question' key holding the query text.

    Returns:
        Whatever ``elastic_search_hybrid_rrf`` returns for the
        'question_text_vector' field, the question text and its embedding.
    """
    question_text = q['Question']
    return elastic_search_hybrid_rrf(
        'question_text_vector',
        question_text,
        model.encode(question_text),
    )

In [15]:
# Smoke-test retrieval with an ad-hoc question and show the top-ranked answer.
query = {'Question': "What is the sample size?"}
search_results = question_text_hybrid_rrf(query)
search_results[0]['Answer']

'The consumer panel consists of approximately 10,000 households, representing diverse demographic and geographic segments to ensure the data is reflective of the broader population.'

# The RAG flow


In [16]:
def build_prompt(query, search_results):
    """Build the LLM prompt for a question from the retrieved FAQ documents.

    Args:
        query: The question text to answer.
        search_results: Iterable of documents, each with 'Category', 'Question'
            and 'Answer' keys; every document becomes one context entry.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    prompt_template = (
        "You're a syndicated market research provider. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "section / question / answer" entry per document, blank-line separated.
    context = "".join(
        f"section: {doc['Category']}\nquestion: {doc['Question']}\nanswer: {doc['Answer']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [17]:
# Render the prompt for the ad-hoc query from the documents retrieved above.
prompt = build_prompt(query['Question'], search_results)
prompt

"You're a syndicated market research provider. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: What is the sample size?\n\nCONTEXT: \nsection: Data Collection Methodology\nquestion: What is the sample size of the consumer panel?\nanswer: The consumer panel consists of approximately 10,000 households, representing diverse demographic and geographic segments to ensure the data is reflective of the broader population.\n\nsection: General Information\nquestion: What is the sample size for global studies?\nanswer: For global studies, the sample size typically includes over 50,000 respondents, ensuring a representative sample across different regions and demographics.\n\nsection: Data Collection Methodology\nquestion: How is household size factored into the research?\nanswer: Household size is factored into the research by segmenting data to understand how family size influences purchase behavior, 

In [18]:

import os
from getpass import getpass
from openai import OpenAI

# Never hard-code the key, and never overwrite one that is already configured:
# assigning '' here would clobber a valid OPENAI_API_KEY from the environment.
# Prompt for it (without echo) only when it is missing or empty.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# The client reads OPENAI_API_KEY from the environment.
client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the only (user) message.
        model: Name of the chat model to query.

    Returns:
        The message content of the first returned choice.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [19]:
def rag(query: dict, model='gpt-4o-mini') -> str:
    """Answer a query record end to end: retrieve, build the prompt, ask the LLM.

    Args:
        query: Mapping with a 'Question' key holding the question text.
        model: Chat model name forwarded to ``llm``.

    Returns:
        The answer produced by ``llm`` for the prompt built from the retrieved documents.
    """
    retrieved_docs = question_text_hybrid_rrf(query)
    filled_prompt = build_prompt(query['Question'], retrieved_docs)
    return llm(filled_prompt, model=model)

In [20]:
# Pick one ground-truth question to run through the full RAG pipeline.
ground_truth[6]

{'Question': 'Are quarterly summaries included with the monthly updates?',
 'Category': 'General Information',
 'Document': '7a6f8a30'}

In [21]:
# rag() only reads the 'Question' key, so a ground-truth record can be passed as is.
rag(ground_truth[6])

'Yes, quarterly summaries are available as part of the data updates, which are typically done on a monthly basis.'

In [22]:
# Reference answer of the document named in that record's 'Document' field, for comparison.
doc_idx['7a6f8a30']['Answer']

'The data is typically updated on a monthly basis, with quarterly and annual summaries available. Real-time or weekly updates may also be available depending on the subscription level.'

# LLM as a judge

# Eval using AQA only --- checking relevance of the LLM answer against the question and the original answer

In [23]:
# Judge prompt for the answer-question-answer (AQA) evaluation.
# Placeholders filled via str.format: {answer_orig}, {question}, {answer_llm}.
# The doubled braces around the JSON skeleton are str.format escapes for literal { and }.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()



In [24]:
#Load Dataframe to judge

import pandas as pd

# Previously generated answers to be judged (one row per question).
# NOTE: this rebinds `file_path`, which earlier pointed at the documents JSON.
file_path = '/workspaces/Rag_Project_Pod/Evaluation/LLM Evaluation/llm_data/results-gpt4o-cosine.csv'
df_gpt4o = pd.read_csv(file_path)

# Preview the first rows to check the columns the judge prompts rely on.
df_gpt4o.head()



Unnamed: 0,answer_llm,answer_orig,document,question,category,cosine
0,Syndicated research is a type of market resear...,Syndicated research is a type of market resear...,3e72e1c8,Can you explain what syndicated research entails?,General Information,0.998721
1,Syndicated research includes data and findings...,Syndicated research is a type of market resear...,3e72e1c8,What type of data is included in syndicated re...,General Information,0.786594
2,The findings for syndicated research are compi...,Syndicated research is a type of market resear...,3e72e1c8,Who compiles the findings for syndicated resea...,General Information,0.792197
3,Syndicated research is commonly used in indust...,Syndicated research is a type of market resear...,3e72e1c8,In what industries is syndicated research comm...,General Information,0.834927
4,Syndicated research benefits multiple clients ...,Syndicated research is a type of market resear...,3e72e1c8,How can syndicated research benefit multiple c...,General Information,0.801818


In [25]:
# One dict per CSV row; keep the first row as a sample record for previewing the prompt.
samples = df_gpt4o.to_dict(orient='records')
record = samples[0]
record

{'answer_llm': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG (Fast-Moving Consumer Goods) categories.',
 'answer_orig': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'document': '3e72e1c8',
 'question': 'Can you explain what syndicated research entails?',
 'category': 'General Information',
 'cosine': 0.9987208}

In [26]:
# Fill the AQA template from the record; str.format ignores the record's extra keys.
prompt = prompt1_template.format(**record)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.
Generated Question: Can you explain what syndicated research entails?
Generated Answer: Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance acros

In [27]:
# Ask the judge model to grade the sample prompt.
answer = llm(prompt, model='gpt-4o-mini')

In [28]:
# Raw judge response as returned by llm() (a string, not yet parsed).
answer

'{\n  "Relevance": "RELEVANT",\n  "Explanation": "The generated answer is identical to the original answer, providing a full and accurate explanation of what syndicated research is. It addresses the question directly and thoroughly."\n}'

In [29]:
import json

evaluations = []

# Judge every sample with the AQA prompt; the raw model output is stored as a
# string and parsed in a later cell. This issues one LLM request per sample.
# NOTE: `record` and `prompt` stay bound to the last sample after the loop, so
# later cells that read them see that sample.
for record in tqdm(samples):
    prompt = prompt1_template.format(**record)
    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluations.append(evaluation)

  0%|          | 0/1300 [00:00<?, ?it/s]

In [31]:
json_evaluations = []

# Strict parse of every raw judge response. json.loads raises JSONDecodeError on
# the first malformed response (see the recorded error below this cell); the
# next cell rebuilds `json_evaluations` with a trailing-comma repair instead.
for i, str_eval in enumerate(evaluations):
    json_eval = json.loads(str_eval)
    json_evaluations.append(json_eval)

JSONDecodeError: Expecting property name enclosed in double quotes: line 4 column 1 (char 323)

In [33]:
import json
import re

json_evaluations = []

for i, str_eval in enumerate(evaluations):
    try:
        # Parse the response untouched first so valid JSON is never rewritten:
        # the repair regex below would also strip a ",}" or ",]" sequence that
        # happens to occur inside an Explanation string.
        json_eval = json.loads(str_eval)
    except json.JSONDecodeError:
        # Fallback: remove trailing commas before closing braces or brackets, then retry.
        str_eval = re.sub(r',\s*([\]}])', r'\1', str_eval)
        try:
            json_eval = json.loads(str_eval)
        except json.JSONDecodeError as e:
            # Still unparseable: report and skip. NOTE: a skipped response shifts
            # the positions of all later rows relative to `samples`.
            print(f"Error decoding JSON at index {i}: {e}")
            print(f"Problematic string: {str_eval}")
            continue
    json_evaluations.append(json_eval)


In [34]:
# Tabulate the parsed AQA judgements and count how often each label occurs.
df_evaluations = pd.DataFrame(json_evaluations)
df_evaluations['Relevance'].value_counts()


Relevance
RELEVANT           1059
PARTLY_RELEVANT     237
NON_RELEVANT          4
Name: count, dtype: int64

In [36]:
# Inspect the AQA judgements labelled NON_RELEVANT.
df_evaluations[df_evaluations.Relevance == 'NON_RELEVANT']


Unnamed: 0,Relevance,Explanation
93,NON_RELEVANT,The generated answer fails to address the ques...
234,NON_RELEVANT,The generated answer addresses how frequently ...
644,NON_RELEVANT,The generated answer addresses a different que...
983,NON_RELEVANT,The generated answer does not address the core...


In [35]:
# Persist the parsed AQA judgements; index=False leaves the row index out of the file.
df_evaluations.to_csv('/workspaces/Rag_Project_Pod/Evaluation/LLM Evaluation/llm_data/evaluations-aqa.csv', index=False)


# Eval using QA only --- checking relevance of the LLM answer against the question

In [37]:
# Judge prompt for the question-answer (QA) evaluation: no reference answer is given.
# Placeholders filled via str.format: {question}, {answer_llm}.
# The doubled braces around the JSON skeleton are str.format escapes for literal { and }.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [38]:
# Preview the QA prompt on one record. Select the record explicitly instead of
# relying on whatever `record` was left bound to by an earlier loop (hidden
# state). A full pass over `samples` leaves the last sample bound, so choosing
# samples[-1] keeps the preview unchanged.
record = samples[-1]
prompt = prompt2_template.format(**record)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: Can I convert reports into PDF for easy distribution?
Generated Answer: Yes, you can convert reports into PDF for easy distribution by selecting the export option in the portal's report viewer. This will allow you to create print-ready documents for sharing insights with stakeholders or integrating them into your presentations.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [39]:
# Judge the single previewed prompt and show the raw response.
evaluation = llm(prompt, model='gpt-4o-mini')
print(evaluation)

{
  "Relevance": "RELEVANT",
  "Explanation": "The generated answer directly addresses the question by confirming that reports can be converted into PDF for distribution and provides a method for doing so, specifically mentioning the export option in the portal's report viewer. This information is pertinent and useful for the user's request."
}


In [40]:
evaluations_2 = []

# Judge every sample with the QA prompt; raw responses are stored as strings
# and parsed in a later cell. This issues one LLM request per sample.
# NOTE: `record`, `prompt` and `evaluation` stay bound to the last sample's
# values after the loop.
for record in tqdm(samples):
    prompt = prompt2_template.format(**record)
    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluations_2.append(evaluation)

  0%|          | 0/1300 [00:00<?, ?it/s]

In [43]:
import json
import re

json_evaluations_2 = []

for i, str_eval in enumerate(evaluations_2):
    try:
        # Parse the response untouched first so valid JSON is never rewritten:
        # the repair regex below would also strip a ",}" or ",]" sequence that
        # happens to occur inside an Explanation string.
        json_eval = json.loads(str_eval)
    except json.JSONDecodeError:
        # Fallback: remove trailing commas before closing braces or brackets, then retry.
        str_eval = re.sub(r',\s*([\]}])', r'\1', str_eval)
        try:
            json_eval = json.loads(str_eval)
        except json.JSONDecodeError as e:
            # Still unparseable: report and skip. NOTE: a skipped response shifts
            # the positions of all later rows relative to `samples`.
            print(f"Error decoding JSON at index {i}: {e}")
            print(f"Problematic string: {str_eval}")
            continue
    json_evaluations_2.append(json_eval)


In [44]:
# Tabulate the parsed QA judgements and count how often each label occurs.
df_evaluations_2 = pd.DataFrame(json_evaluations_2)
df_evaluations_2['Relevance'].value_counts()


Relevance
RELEVANT           1233
PARTLY_RELEVANT      64
NON_RELEVANT          3
Name: count, dtype: int64

In [45]:
# Persist the parsed QA judgements; index=False leaves the row index out of the file.
df_evaluations_2.to_csv('/workspaces/Rag_Project_Pod/Evaluation/LLM Evaluation/llm_data/evaluations-qa.csv', index=False)


In [46]:
# Inspect the QA judgements labelled NON_RELEVANT.
df_evaluations_2[df_evaluations_2.Relevance == 'NON_RELEVANT']


Unnamed: 0,Relevance,Explanation
93,NON_RELEVANT,The generated answer states that there is no s...
189,NON_RELEVANT,The generated answer states that the provided ...
983,NON_RELEVANT,The generated answer does not directly address...
