In [6]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [7]:
import json

# Location of the prepared Q&A knowledge base.
file_path = "/workspaces/Rag_Project_Pod/Data_prep/final_data.json"

# Load the JSON file into a list of records (each record is a dict with
# 'Category', 'Question', 'Answer' and 'doc_id' keys, as the preview shows).
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as json_file:
    documents = json.load(json_file)

# Preview the first record to sanity-check the schema.
documents[0]


{'Category': 'General Information',
 'Question': 'What is syndicated research?',
 'Answer': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'doc_id': '3e72e1c8'}

# Evaluating RAG Offline

In [1]:
from sentence_transformers import SentenceTransformer


# Sentence-embedding model used to encode both the indexed documents and the
# evaluation questions in the cells that follow.
# NOTE(review): the index mapping in this notebook declares 384-dim vectors —
# presumably this model's output size; confirm before swapping the model.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


### Eval Vector Search

In [11]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node listening on localhost.
es_client = Elasticsearch('http://localhost:9200')

index_name = "insights-questions"

# Index definition: the raw Q&A fields plus three embedding fields that all
# share the same dense_vector mapping (384 dims, indexed, cosine similarity).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "Answer": {"type": "text"},
            "Category": {"type": "text"},
            "Question": {"type": "text"},
            "doc_id": {"type": "keyword"},
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

# Drop any previous copy of the index so re-running this cell starts clean,
# then create it with the mapping above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
# Attach three embeddings to every document: question only, answer only, and
# the question and answer joined by a single space.
for doc in tqdm(documents):
    question, text = doc['Question'], doc['Answer']
    qt = ' '.join((question, text))

    doc.update(
        question_vector=model.encode(question),
        text_vector=model.encode(text),
        question_text_vector=model.encode(qt),
    )

  0%|          | 0/260 [00:00<?, ?it/s]

In [12]:
# Send every document (including the embedding fields added to it earlier)
# to the Elasticsearch index, one request per document.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/260 [00:00<?, ?it/s]

In [13]:
def elastic_search_knn(field, vector):
    """Run a kNN search on one vector field and return the matching documents.

    Args:
        field: Name of the dense_vector field to search against.
        vector: Query embedding to compare with the stored vectors.

    Returns:
        A list with the `_source` of each hit (restricted to the 'Answer',
        'Category', 'Question' and 'doc_id' fields), in the order
        Elasticsearch returned them. The query asks for k=5 neighbours.
    """
    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
        },
        # Only the fields needed for evaluation; the stored vectors are omitted.
        "_source": ["Answer", "Category", "Question", "doc_id"],
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]


In [15]:
import pandas as pd

# Ground-truth set: each row pairs a generated question with the id of the
# document that should be retrieved for it (the 'Document' column).
df_ground_truth = pd.read_csv(r'/workspaces/Rag_Project_Pod/Data_prep/ground_truth_data.csv')
# Preview the first rows.
df_ground_truth.head()

Unnamed: 0,Question,Category,Document
0,Can you explain what syndicated research entails?,General Information,3e72e1c8
1,What type of data is included in syndicated re...,General Information,3e72e1c8
2,Who compiles the findings for syndicated resea...,General Information,3e72e1c8
3,In what industries is syndicated research comm...,General Information,3e72e1c8
4,How can syndicated research benefit multiple c...,General Information,3e72e1c8


In [17]:
# Convert the frame to a list of per-row dicts so the evaluation loop can
# read q['Question'] and q['Document'] directly.
ground_truth = df_ground_truth.to_dict(orient='records')
# Preview the first record.
ground_truth[0]

{'Question': 'Can you explain what syndicated research entails?',
 'Category': 'General Information',
 'Document': '3e72e1c8'}

# For every ground-truth question, record which of the returned hits (in rank
# order) is the expected document.
relevance_total = []

# The original call here was `elastic_search(query=...)`, a name that is not
# defined in the visible notebook, so the cell failed with NameError on a
# fresh kernel. It now uses the kNN search defined above on the
# `question_vector` field.
# NOTE(review): confirm this is the intended retrieval method for this cell.
for q in tqdm(ground_truth):
    doc_id = q['Document']
    results = elastic_search_knn('question_vector', model.encode(q['Question']))
    relevance = [d['doc_id'] == doc_id for d in results]
    relevance_total.append(relevance)

In [18]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: One list per query; each inner list holds a boolean
            per returned hit (in rank order) telling whether that hit is the
            expected document.

    Returns:
        A float in [0, 1]. Returns 0.0 when `relevance_total` is empty instead
        of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [19]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant hit across all queries.

    Args:
        relevance_total: One list per query; each inner list holds a boolean
            per returned hit (in rank order) telling whether that hit is the
            expected document.

    Returns:
        A float in [0, 1]: the average over queries of 1 / rank of the first
        relevant hit (a query with no relevant hit contributes 0). Returns 0.0
        when `relevance_total` is empty instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this a query
                # with several matching hits would score above 1.
                break

    return total_score / len(relevance_total)

In [None]:
def question_vector_knn(q):
    """Search the `question_vector` field with the embedding of q['Question'].

    Args:
        q: Ground-truth record; only its 'Question' entry is read.

    Returns:
        Whatever `elastic_search_knn` returns for that field and embedding.
    """
    return elastic_search_knn('question_vector', model.encode(q['Question']))

In [20]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth set.

    Args:
        ground_truth: Iterable of records; each record's 'Document' entry is
            the id of the document that should be retrieved.
        search_function: Callable that takes one record and returns a list of
            hits, each carrying a 'doc_id' entry.

    Returns:
        A dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    def _relevance(record):
        # Read the expected id before searching, then flag each returned hit.
        expected_id = record['Document']
        return [hit['doc_id'] == expected_id for hit in search_function(record)]

    relevance_total = [_relevance(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [21]:
# Retrieval quality when the question embedding is matched against the
# stored `question_vector` field.
evaluate(ground_truth, question_vector_knn)

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.8153846153846154, 'mrr': 0.6719871794871807}

In [27]:
def answer_vector_knn(q):
    """Search the `text_vector` field with the embedding of q['Question'].

    Args:
        q: Ground-truth record; only its 'Question' entry is read.

    Returns:
        Whatever `elastic_search_knn` returns for that field and embedding.
    """
    return elastic_search_knn('text_vector', model.encode(q['Question']))

In [28]:
# Retrieval quality when the question embedding is matched against the
# stored `text_vector` (answer) field.
evaluate(ground_truth, answer_vector_knn)

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.9, 'mrr': 0.7621410256410267}

In [31]:
def elastic_search_knn_combined(vector):
    """Rank documents by the summed cosine similarity to all three vector fields.

    Despite the name, this is not a `knn` query: it scores every document
    (`match_all`) with a `script_score` script that adds the cosine similarity
    of `vector` to 'question_vector', 'text_vector' and 'question_text_vector'.

    Args:
        vector: Query embedding to compare with the stored vectors.

    Returns:
        A list with the `_source` of the top 5 hits (restricted to the
        'Answer', 'Category', 'Question' and 'doc_id' fields), best first.
    """
    search_query = {
        "size": 5,  # Limit to 5 results
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}  # Score every document in the index
                },
                "script": {
                    # Each cosineSimilarity is in [-1, 1], so the sum of three
                    # is in [-3, 3]. Elasticsearch rejects negative
                    # script_score values, hence the +3 offset (a constant
                    # shift, so the ranking is unaffected).
                    "source": """
                        cosineSimilarity(params.query_vector, 'question_vector') + 
                        cosineSimilarity(params.query_vector, 'text_vector') + 
                        cosineSimilarity(params.query_vector, 'question_text_vector') + 
                        3
                    """,
                    "params": {
                        "query_vector": vector  # Pass the query vector
                    }
                }
            }
        },
        "_source": ["Answer", "Category", "Question", "doc_id"]  # Return the specified fields
    }

    # Execute the query against Elasticsearch
    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    # Keep only the document payload of each hit
    return [hit['_source'] for hit in es_results['hits']['hits']]


In [32]:
def vector_combined_knn(q):
    """Run the combined-similarity search with the embedding of q['Question'].

    Args:
        q: Ground-truth record; only its 'Question' entry is read.

    Returns:
        Whatever `elastic_search_knn_combined` returns for that embedding.
    """
    return elastic_search_knn_combined(model.encode(q['Question']))

# Retrieval quality when documents are ranked by the summed similarity to
# all three stored vector fields.
evaluate(ground_truth, vector_combined_knn)

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.8815384615384615, 'mrr': 0.7472564102564111}