In [1]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [2]:
import json

# Location of the prepared Q&A records (update this path for your environment)
file_path = "/workspaces/Rag_Project_Pod/Data_prep/final_data.json"

# Parse the JSON file; the result is indexed by position below, i.e. it is a
# list of records. The encoding is pinned to UTF-8 (the JSON standard) so the
# load does not depend on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as json_file:
    documents = json.load(json_file)

# Display the first record as a quick sanity check
documents[0]


{'Category': 'General Information',
 'Question': 'What is syndicated research?',
 'Answer': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'doc_id': '3e72e1c8'}

# Evaluating RAG Offline

In [3]:
from sentence_transformers import SentenceTransformer


# Sentence-embedding model used for every vector in this notebook: document
# fields are encoded with it at index time and questions at query time.
# NOTE(review): the index mapping declares 384-dim vectors — presumably this
# model's output size; confirm if the model name is ever changed.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



### Eval Hybrid Search

In [5]:
from elasticsearch import Elasticsearch

# Client for the local Elasticsearch node (no credentials are passed here)
es_client = Elasticsearch('http://localhost:9200') 

# Index definition: one shard, no replicas, three analysed text fields, a
# keyword id, and three dense-vector fields (384 dims, cosine similarity,
# indexed so they can be used for kNN search).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Answer": {"type": "text"},
            "Category": {"type": "text"},
            "Question": {"type": "text"},
            # keyword (not analysed): used for exact comparison against ground truth
            "doc_id": {"type": "keyword"},
            # embedding of the Question field only
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            # embedding of the Answer field only
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            # embedding of Question and Answer concatenated
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "insights-questions"

# Drop any existing index first so re-running this cell starts from an empty
# index; ignore_unavailable avoids an error when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'insights-questions'})

In [6]:
# Enrich every record in place with three embeddings: the question alone,
# the answer alone, and the two joined by a single space.
for doc in tqdm(documents):
    question, text = doc['Question'], doc['Answer']
    qt = ' '.join((question, text))

    doc.update(
        question_vector=model.encode(question),
        text_vector=model.encode(text),
        question_text_vector=model.encode(qt),
    )

  0%|          | 0/260 [00:00<?, ?it/s]

In [7]:
# Index the enriched records one request at a time; no explicit id is passed,
# so Elasticsearch assigns each document its own `_id`.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/260 [00:00<?, ?it/s]

In [None]:
# Hybrid Search Example

In [8]:
# Ad-hoc question used to try out a single hybrid search by hand
query = 'What is the sample size?'


In [9]:
# Embed the question with the same model that produced the document vectors
v_q = model.encode(query)


In [10]:
# Vector half of the hybrid search: 5 nearest neighbours on the answer
# embedding, weighted by 0.5 when scores are combined with the keyword half.
knn_query = {
    "field": "text_vector",
    "query_vector": v_q,
    "k": 5,
    "num_candidates": 10000,
    "boost": 0.5,
}

In [11]:
# Keyword half of the hybrid search: best-field match over the three text
# fields, with Question weighted 3x (`^3`) and the clause as a whole at 0.5.
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["Question^3", "Answer","Category"],
                "type": "best_fields",
                "boost": 0.5,
    
            }
        }
    }
}

In [12]:
# Send both halves in one request and keep the top 5 hits
response = es_client.search(
    index=index_name,
    query=keyword_query,
    knn=knn_query,
    size=5
)

In [13]:
# Answer text of the highest-ranked hit
response['hits']['hits'][0]['_source']['Answer']

'For global studies, the sample size typically includes over 50,000 respondents, ensuring a representative sample across different regions and demographics.'

In [17]:
import pandas as pd

# Ground-truth evaluation set: each row pairs a question with the id of the
# document that should be retrieved for it (column `Document`).
df_ground_truth = pd.read_csv(r'/workspaces/Rag_Project_Pod/Data_prep/ground_truth_data.csv')
df_ground_truth.head()                              

Unnamed: 0,Question,Category,Document
0,Can you explain what syndicated research entails?,General Information,3e72e1c8
1,What type of data is included in syndicated re...,General Information,3e72e1c8
2,Who compiles the findings for syndicated resea...,General Information,3e72e1c8
3,In what industries is syndicated research comm...,General Information,3e72e1c8
4,How can syndicated research benefit multiple c...,General Information,3e72e1c8


In [18]:
# Convert the frame to a list of per-row dicts, the shape the evaluation loop iterates over
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth[0]

{'Question': 'Can you explain what syndicated research entails?',
 'Category': 'General Information',
 'Document': '3e72e1c8'}

# For every ground-truth question, record one boolean per returned document:
# True where the result's doc_id equals the expected document id.
# NOTE(review): `elastic_search` is not defined anywhere in the visible part of
# this notebook, so this cell presumably fails with NameError on a fresh
# kernel — confirm, and consider removing it in favour of `evaluate(...)`.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['Document']
    results = elastic_search(query=q['Question'])
    relevance = [d['doc_id'] == doc_id for d in results]
    relevance_total.append(relevance)

In [23]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list of booleans per query, in rank order; an
            entry is True where the result at that rank is the expected
            document.

    Returns:
        Number of queries with at least one True entry divided by the number
        of queries, or 0.0 when there are no queries at all (instead of
        raising ZeroDivisionError).
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [24]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one list of booleans per query, in rank order; an
            entry is True where the result at that rank is the expected
            document.

    Returns:
        The average over queries of 1 / rank of the first True entry (rank
        starts at 1); a query with no True entry contributes 0. Returns 0.0
        when there are no queries at all.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # query with several matches would add several reciprocal
                # ranks and the score could exceed 1.
                break

    return total_score / len(relevance_total)

In [36]:
def elastic_search_hybrid(field, query, vector):
    """Run one combined kNN + keyword search and return the matching sources.

    Args:
        field: name of the dense-vector field to run the kNN part against.
        query: question text for the keyword (multi_match) part.
        vector: embedding of the question for the kNN part.

    Returns:
        A list with the `_source` of each hit, in the order Elasticsearch
        returned them, restricted to Question, Answer, Category and doc_id.
    """
    search_query = {
        # Vector half: 5 nearest neighbours, weighted 0.5
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5
        },
        # Keyword half: best-field match over the text fields, weighted 0.5
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["Question", "Answer", "Category"],
                        "type": "best_fields",
                        "boost": 0.5
                    }
                }
            }
        },
        "size": 5,
        "_source": ["Question", "Answer", "Category", "doc_id"]
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]


In [27]:
def question_hybrid(q):
    """Hybrid search for a ground-truth record using the `question_vector` field.

    Args:
        q: ground-truth record; its 'Question' text is used both as the
            keyword query and, once embedded, as the kNN query vector.

    Returns:
        Whatever `elastic_search_hybrid` returns for that question.
    """
    question_text = q['Question']
    return elastic_search_hybrid(
        'question_vector',
        question_text,
        model.encode(question_text),
    )

In [30]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth set.

    Args:
        ground_truth: iterable of records with a 'Document' key holding the
            expected document id.
        search_function: callable taking one record and returning result
            dicts that each carry a 'doc_id'.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    def relevance_for(record):
        # One boolean per returned document: does it match the expected id?
        expected_id = record['Document']
        return [found['doc_id'] == expected_id for found in search_function(record)]

    relevance_total = [relevance_for(q) for q in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [37]:
# Hybrid search with the kNN part on the question embedding
evaluate(ground_truth, question_hybrid)

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.8630769230769231, 'mrr': 0.6993205128205137}

ES question vector : {'hit_rate': 0.8153846153846154, 'mrr': 0.6719871794871807}


In [42]:
def text_hybrid(q):
    """Hybrid search for a ground-truth record using the `text_vector` field.

    Args:
        q: ground-truth record; its 'Question' text is used both as the
            keyword query and, once embedded, as the kNN query vector.

    Returns:
        Whatever `elastic_search_hybrid` returns for that question.
    """
    question_text = q['Question']
    return elastic_search_hybrid(
        'text_vector',
        question_text,
        model.encode(question_text),
    )

In [43]:
# Hybrid search with the kNN part on the answer embedding
evaluate(ground_truth, text_hybrid)

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.8684615384615385, 'mrr': 0.7041923076923081}

ES text vector {'hit_rate': 0.9, 'mrr': 0.7621410256410267}


In [44]:
def question_text_hybrid(q):
    """Hybrid search for a ground-truth record using the `question_text_vector` field.

    Args:
        q: ground-truth record; its 'Question' text is used both as the
            keyword query and, once embedded, as the kNN query vector.

    Returns:
        Whatever `elastic_search_hybrid` returns for that question.
    """
    question_text = q['Question']
    return elastic_search_hybrid(
        'question_text_vector',
        question_text,
        model.encode(question_text),
    )

# Hybrid search with the kNN part on the combined question+answer embedding
evaluate(ground_truth, question_text_hybrid)

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.8607692307692307, 'mrr': 0.6975000000000007}

ES combined vector : {'hit_rate': 0.8815384615384615, 'mrr': 0.7472564102564111}


# Implementing RRF

In [45]:
def compute_rrf(rank, k=60):
    """Reciprocal-rank-fusion contribution of a single ranked hit.

    Args:
        rank: position of the hit in its result list.
        k: smoothing constant added to the rank (default 60).

    Returns:
        1 / (k + rank).
    """
    denominator = k + rank
    return 1 / denominator

In [54]:
def elastic_search_hybrid_rrf(field, query, vector, k=60):
    """Hybrid search with client-side Reciprocal Rank Fusion (RRF).

    Runs a kNN search and a keyword search separately (top 10 each), gives
    every hit a score of `compute_rrf(rank, k)` with ranks starting at 1,
    sums the scores of documents found by both searches, and returns the five
    best documents.

    Args:
        field: name of the dense-vector field to run the kNN search against.
        query: question text for the keyword (multi_match) search.
        vector: embedding of the question for the kNN search.
        k: RRF smoothing constant passed to `compute_rrf`.

    Returns:
        A list with the full `_source` of up to five documents, best fused
        score first.
    """
    # KNN Query
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5
    }

    # Keyword Query
    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["Question", "Answer", "Category"],
                    "type": "best_fields",
                    "boost": 0.5
                }
            }
        }
    }

    # KNN Search
    knn_results = es_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10
        }
    )['hits']['hits']

    # Keyword Search
    keyword_results = es_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10
        }
    )['hits']['hits']

    # Accumulate the RRF score per Elasticsearch `_id`. The `_source` that each
    # hit already carries is kept as well, so no follow-up `get` request per
    # document is needed to build the final result.
    rrf_scores = {}
    sources = {}

    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + compute_rrf(rank, k)
            sources.setdefault(doc_id, hit['_source'])

    # Sort RRF scores in descending order (stable, so ties keep insertion order)
    reranked_docs = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)

    # Keep the top five documents by fused score
    return [sources[doc_id] for doc_id, _score in reranked_docs[:5]]


In [49]:
def question_text_hybrid_rrf(q):
    """RRF hybrid search for a ground-truth record using `question_text_vector`.

    Args:
        q: ground-truth record; its 'Question' text is used both as the
            keyword query and, once embedded, as the kNN query vector.

    Returns:
        Whatever `elastic_search_hybrid_rrf` returns for that question.
    """
    question_text = q['Question']
    return elastic_search_hybrid_rrf(
        'question_text_vector',
        question_text,
        model.encode(question_text),
    )



In [55]:
# Client-side RRF fusion on the combined question+answer embedding
evaluate(ground_truth, question_text_hybrid_rrf)

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.9161538461538462, 'mrr': 0.7835256410256419}

# Paid version: reranking with RRF

To use the Reciprocal rank fusion (RRF) score we need to pull the docker image with a more recent version of Elasticsearch:

docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.9.0

def elastic_search_hybrid_rrf(field, query, vector, course=None):
    """Hybrid search fused server-side through Elasticsearch's `rank: rrf`.

    The kNN and keyword clauses are sent in one request and Elasticsearch
    combines their rankings itself. The surrounding notes label this the paid
    variant and call for the Elasticsearch 8.9.0 image.

    The queried and returned fields follow this notebook's index mapping
    (Question, Answer, Category, doc_id), so the results can be scored by
    `evaluate`, which reads `doc_id` from every result.

    NOTE: this function shares its name with the client-side RRF function
    defined earlier in the notebook; running this cell rebinds that name.
    `course` is optional so the three-argument call made by
    `question_text_hybrid_rrf` keeps working after the rebinding.

    Args:
        field: name of the dense-vector field to run the kNN clause against.
        query: question text for the keyword (multi_match) clause.
        vector: embedding of the question for the kNN clause.
        course: optional value for a term filter on a `course` field, applied
            to both clauses. The mapping created in this notebook declares no
            such field, so leave it as None for this index.

    Returns:
        A list with the `_source` of each hit, in the order Elasticsearch
        returned them.
    """
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["Question", "Answer", "Category"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            }
        }
    }

    source_fields = ["Question", "Answer", "Category", "doc_id"]

    # Apply the course restriction only when a course was actually requested
    if course is not None:
        course_filter = {"term": {"course": course}}
        knn_query["filter"] = course_filter
        keyword_query["bool"]["filter"] = course_filter
        source_fields.append("course")

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": 5,
        "rank": {
            "rrf": {}
        },
        "_source": source_fields
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]