# Retriever Evaluation for Q&A Systems
This notebook introduces several approaches to the retrieval component of a Retrieval-Augmented Generation (RAG) pipeline, then evaluates and compares them.

1. Data Preparation
2. Baseline Retriever:
   - Sparse Retriever
   - Dense Retriever
   - Ensemble: Sparse + Dense Retriever (baseline)
3. Advanced Retriever:
   - Ensemble + Reranking

In [2]:
!pip install -qU nltk bs4 torch faiss-cpu langchain-community langchain_huggingface transformers rank_bm25 rank-llm==0.12.8

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.2/180.2 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.7/137.7 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.9/256.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [72]:
import os
import json
import pickle
from typing import List
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import numpy as np
import pandas as pd

import torch
import faiss
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

from langchain_core.documents import Document
from langchain_community.document_compressors.rankllm_rerank import RankLLMRerank
from langchain_community.docstore import InMemoryDocstore
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.stores import InMemoryStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Utility functions for loading data and printing metrics

In [73]:
def extract_text_and_links(element):
    """Return the text of ``element`` together with the hyperlinks inside it.

    Args:
        element: A parsed HTML node exposing ``get_text`` and ``find_all``
            (``parse_article`` passes BeautifulSoup tags).

    Returns:
        A ``(plain_text, text_urls)`` tuple. ``plain_text`` is obtained with a
        single-space separator and stripping enabled; ``text_urls`` contains
        one ``{anchor text: href}`` dict per ``<a>`` node, in the order
        ``find_all`` yields them. An anchor without ``href`` maps to ``None``.
    """
    plain_text = element.get_text(separator=" ", strip=True)
    text_urls = [
        {anchor.get_text(): anchor.get('href')}
        for anchor in element.find_all('a')
    ]
    return plain_text, text_urls


def parse_article(article):
    """Convert one raw knowledge-base article into a ``Document``.

    The HTML is scanned for ``div``/``h1``/``h2``/``h3`` nodes. A heading is
    remembered and prepended (on its own line) to the next ``div`` whose
    ``data-component-type`` is ``text`` or ``informative``; other ``div`` nodes
    are ignored and do not consume the pending heading.

    Args:
        article: Mapping with the keys ``html_content``, ``title``,
            ``article_id`` and ``url``.

    Returns:
        A ``Document`` whose ``page_content`` is the title followed by the
        collected sections, separated by blank lines, and whose metadata
        carries the id, title, url and every link found in those sections.
    """
    heading_tags = ['h1', 'h2', 'h3']
    body_types = ['text', 'informative']

    soup = BeautifulSoup(article['html_content'], "html.parser")

    sections = [article['title']]
    collected_links = []
    pending_heading = None

    for node in soup.find_all(['div'] + heading_tags):
        if node.name in heading_tags:
            # Remember the heading; it is attached to the next body section.
            pending_heading = node.get_text(separator=" ", strip=True)
            continue

        if node.name != 'div' or node.get('data-component-type') not in body_types:
            continue

        body_text, body_links = extract_text_and_links(node)
        if pending_heading:
            sections.append(f'{pending_heading}\n{body_text}')
        else:
            sections.append(body_text)
        collected_links.extend(body_links)

        # A heading labels only the first body section that follows it.
        pending_heading = None

    article_id = article['article_id']
    return Document(
        id=article_id,
        page_content='\n\n'.join(sections),
        metadata={
            'id': article_id,
            'title': article['title'],
            'url': article['url'],
            'text_urls': collected_links
        }
    )

In [74]:
def format_metrics(metrics):
    """Flatten the nested result of ``evaluate`` into ``{"Metric@k": value}``.

    Args:
        metrics: Mapping with the per-cutoff dicts ``mrr_at_k``, ``ndcg_at_k``,
            ``precision_at_k`` and ``recall_at_k`` (each ``{k: value}``) plus
            the scalar ``map``.

    Returns:
        A flat dict ordered MRR, nDCG, Precision, Recall (one entry per
        cutoff each) followed by ``MAP``.
    """
    per_cutoff_metrics = (
        ("MRR", "mrr_at_k"),
        ("nDCG", "ndcg_at_k"),
        ("Precision", "precision_at_k"),
        ("Recall", "recall_at_k"),
    )

    formatted = {
        f"{label}@{k}": value
        for label, source_key in per_cutoff_metrics
        for k, value in metrics[source_key].items()
    }
    formatted["MAP"] = metrics["map"]

    return formatted

# Evaluation functions

In [75]:
def compute_mrr(relevant_flags):
    """Mean reciprocal rank over a batch of rankings.

    Args:
        relevant_flags: One list per query of 0/1 flags in rank order
            (1 marks a relevant result).

    Returns:
        The average of ``1 / rank`` of the first relevant result per query;
        a query with no relevant result contributes 0.

    Raises:
        ZeroDivisionError: If ``relevant_flags`` is empty.
    """
    reciprocal_total = 0
    for ranking in relevant_flags:
        if 1 in ranking:
            first_hit_rank = ranking.index(1) + 1
            reciprocal_total += 1 / first_hit_rank
    return reciprocal_total / len(relevant_flags)


def compute_ndcg(relevant_flags):
    """Mean nDCG over a batch of rankings.

    The ideal ordering is built by sorting each query's own flags, so the
    score is normalised against the best arrangement of the results that
    were actually retrieved.

    Args:
        relevant_flags: One list per query of relevance gains in rank order.

    Returns:
        ``np.mean`` of the per-query nDCG; a query whose ideal DCG is 0
        contributes 0.0.
    """
    def discounted_gain(gains):
        # Log2 discount with positions counted from 1 (hence the ``+ 2``).
        return sum((gain / np.log2(position + 2)) for position, gain in enumerate(gains))

    per_query_scores = []
    for ranking in relevant_flags:
        achieved = discounted_gain(ranking)
        best_possible = discounted_gain(sorted(ranking, reverse=True))

        if best_possible > 0:
            per_query_scores.append(achieved / best_possible)
        else:
            per_query_scores.append(0.0)

    return np.mean(per_query_scores)


def mean_average_precision(relevant_flags):
    """Mean average precision over a batch of rankings.

    Average precision is normalised by the number of relevant results present
    in the ranking itself (``sum(flags)``).

    Args:
        relevant_flags: One list per query of 0/1 flags in rank order.

    Returns:
        The mean of the per-query average precision; a query without any
        relevant result contributes 0.0.

    Raises:
        ZeroDivisionError: If ``relevant_flags`` is empty.
    """
    def average_precision(flags):
        relevant_total = sum(flags)
        if relevant_total == 0:
            return 0.0

        hits_so_far = 0
        weighted_precision = 0.0
        for rank, rel in enumerate(flags, start=1):
            hits_so_far += rel
            # Precision at this rank only counts when the result is relevant.
            weighted_precision += (hits_so_far / rank) * rel
        return weighted_precision / relevant_total

    ap_total = 0
    for ranking in relevant_flags:
        ap_total += average_precision(ranking)
    return ap_total / len(relevant_flags)


def evaluate(retriever, data, k_to_check=(5, 10)):
    """Score a retriever on a labelled Q&A set.

    For every query the retriever is asked for ``max(k_to_check)`` documents
    and the ranking is truncated to each cutoff to compute the metrics.

    Args:
        retriever: Object exposing ``retrieve(user_query=..., k=...)`` that
            returns documents carrying ``metadata['id']``.
        data: Iterable of dicts with the keys ``question`` and ``article_ids``
            (the ids of the relevant articles).
        k_to_check: Cutoffs at which the metrics are reported. Defaults to
            ``(5, 10)``.

    Returns:
        Dict with ``mrr_at_k``, ``ndcg_at_k``, ``precision_at_k`` and
        ``recall_at_k`` (each ``{k: mean value}``) and ``map``. MAP is
        computed on the smallest cutoff, which is 5 for the default cutoffs.

    Raises:
        ValueError: If ``data`` yields no queries (or ``k_to_check`` is empty).
        AssertionError: If the retriever returns a number of documents other
            than ``max(k_to_check)``.
    """
    cutoffs = list(k_to_check)
    deepest_cutoff = max(cutoffs)

    relevant_flags_at_k = {k: [] for k in cutoffs}
    precision_at_k = {k: [] for k in cutoffs}  # Precision@k
    recall_at_k = {k: [] for k in cutoffs}  # Recall@k
    num_queries = 0

    for query in tqdm(data):
        num_queries += 1
        question = query['question']
        retrieved_docs = retriever.retrieve(
            user_query=question,
            k=deepest_cutoff,
        )
        assert len(retrieved_docs) == deepest_cutoff, f'Requested {deepest_cutoff} docs retrieve but return {len(retrieved_docs)} docs'

        retrieved_ids = [doc.metadata['id'] for doc in retrieved_docs]
        relevant_ids = set(query['article_ids'])

        for k in cutoffs:
            top_k_retrieved_ids = retrieved_ids[:k]
            relevant_flags = [1 if doc_id in relevant_ids else 0 for doc_id in top_k_retrieved_ids]
            relevant_flags_at_k[k].append(relevant_flags)

            precision_at_k[k].append(sum(relevant_flags) / k)

            # A query without labelled articles cannot be recalled; count it as 0.
            recall = sum(relevant_flags) / len(relevant_ids) if relevant_ids else 0
            recall_at_k[k].append(recall)

    if num_queries == 0:
        # Fail with a clear message instead of a ZeroDivisionError in the averages.
        raise ValueError('evaluate() needs at least one query in `data`')

    # Calculate average metrics
    mrr_at_k = {k: compute_mrr(flags) for k, flags in relevant_flags_at_k.items()}
    ndcg_at_k = {k: compute_ndcg(flags) for k, flags in relevant_flags_at_k.items()}
    precision_at_k_mean = {k: sum(values) / len(values) for k, values in precision_at_k.items()}
    recall_at_k_mean = {k: sum(values) / len(values) for k, values in recall_at_k.items()}
    # Derived from the cutoffs rather than hard-coded, so custom cutoffs work.
    map_score = mean_average_precision(relevant_flags_at_k[min(cutoffs)])

    return {
        "mrr_at_k": mrr_at_k,
        "ndcg_at_k": ndcg_at_k,
        "precision_at_k": precision_at_k_mean,
        "recall_at_k": recall_at_k_mean,
        "map": map_score,
    }

# Loading the data

In [98]:
# Path of the JSON file holding the Q&A evaluation set; it is read into `qa_data`.
TEST_FILE_PATH = 'Copy of bookings_train.json'

To run this notebook on a private test set, point `TEST_FILE_PATH` (defined in the cell above) at that file.

In [99]:
# Path of the knowledge-base dump: a JSON list of raw articles that
# `parse_article` turns into documents.
KB_FILE_PATH = 'Copy of kb_bookings_en_20240614.json'

# Read both files as UTF-8 explicitly; without `encoding`, `open` falls back to
# the platform's locale encoding, which may not be UTF-8.
with open(KB_FILE_PATH, 'r', encoding='utf-8') as f:
    kb = json.load(f)
articles = [parse_article(article) for article in kb]

with open(TEST_FILE_PATH, 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

In [100]:
# Sanity check: report how many articles and evaluation questions were loaded.
print(f'articles in knowledge base: {len(articles)}')
print(f'Q&A questions for evaluation: {len(qa_data)}')

articles in knowledge base: 303
Q&A questions for evaluation: 150


#  Sparse retriever using BM25

In [78]:
# NOTE(review): only the `stopwords` corpus is read in this cell; the `wordnet`
# download looks unused by PorterStemmer — confirm before removing it.
nltk.download('wordnet')
nltk.download('stopwords')

# Module-level helpers used by `preprocess_func`.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_func(text: str) -> List[str]:
    """Tokenise ``text`` for BM25: stemmed unigrams followed by their bigrams.

    The text is lower-cased and split on whitespace, stop words are dropped,
    and the remaining tokens are stemmed. Bigrams join each pair of adjacent
    stemmed tokens with an underscore.

    Args:
        text: Raw query or document text.

    Returns:
        The stemmed tokens, followed by the underscore-joined bigrams.
    """
    stemmed = []
    for word in text.lower().split():
        if word in stop_words:
            continue
        stemmed.append(stemmer.stem(word))

    adjacent_pairs = zip(stemmed, stemmed[1:])
    bigrams = ['_'.join(pair) for pair in adjacent_pairs]

    return stemmed + bigrams


class SparseRetriever:
    """Keyword retriever backed by LangChain's ``BM25Retriever``."""

    def __init__(self, docs):
        """Index ``docs`` with BM25, tokenising them with ``preprocess_func``."""
        bm25 = BM25Retriever.from_documents(docs, preprocess_func=preprocess_func)
        self.bm25_retriever = bm25

    def retrieve(self, user_query, k) -> List[Document]:
        """Return the BM25 results for ``user_query`` with the result count set to ``k``."""
        bm25 = self.bm25_retriever
        # The result count is an attribute of the underlying retriever.
        bm25.k = k
        return bm25.invoke(user_query)

[nltk_data] Downloading package wordnet to /Users/shono/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/shono/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [79]:
# Build the BM25 index over all articles and score it on the Q&A set.
sparse_retriever = SparseRetriever(docs=articles)
sparse_metrics = evaluate(sparse_retriever, qa_data)
format_metrics(sparse_metrics)

100%|██████████| 150/150 [00:00<00:00, 1001.43it/s]


{'MRR@5': 0.3077777777777779,
 'MRR@10': 0.32105026455026475,
 'nDCG@5': 0.3390089328974232,
 'nDCG@10': 0.37024814551381524,
 'Precision@5': 0.08666666666666657,
 'Precision@10': 0.05466666666666659,
 'Recall@5': 0.3788888888888889,
 'Recall@10': 0.478888888888889,
 'MAP': 0.3077777777777779}

The sparse retrieval results show moderate performance. MRR@5 is 0.31, with nDCG@5 at 0.34, indicating decent ranking relevance in the top 5 results. Recall@5: 0.38, but Precision@5 is low: 0.087, highlighting that while relevant documents are retrieved, they make up a smaller fraction of the top results.

# Dense retriever

In [80]:
class DenseRetriever:
    """Embedding-based retriever that indexes chunks but returns whole articles.

    Documents are split into child chunks, embedded and stored in a FAISS
    inner-product index; a ``ParentDocumentRetriever`` maps chunk hits back to
    their parent documents.
    """

    # Number of progressively wider chunk searches (k, 2k, 3k, ...) tried
    # before falling back to a search over every indexed chunk.
    _WIDENING_ATTEMPTS = 4

    def __init__(self, embedding_model_name, chunk_size, chunk_overlap):
        """Create the embedding model, the chunk splitter and an empty index.

        Args:
            embedding_model_name: Hugging Face model id used both for the
                embeddings and for the tokenizer that measures chunk length.
            chunk_size: Maximum chunk length passed to the splitter.
            chunk_overlap: Overlap between consecutive chunks.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # NOTE(review): 'trust_remote_code' allows the model repository to run
        # its own code on load — confirm it is required for the chosen model.
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            multi_process=False,
            show_progress=False,
            model_kwargs={"device": device, 'trust_remote_code': True},
            encode_kwargs={"normalize_embeddings": True}
        )

        self.child_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            AutoTokenizer.from_pretrained(embedding_model_name),
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            # A "." entry used to follow "" here but could never be reached:
            # the splitter stops at the first separator that matches and ""
            # always matches. It is dropped so the chunking stays exactly as
            # before; insert "." before " " if sentence-level splits are wanted.
            separators=["\n\n", "\n", " ", ""]
        )

        # Probe the model once to learn the embedding dimension.
        index = faiss.IndexFlatIP(len(self.embedding_model.embed_query("hello world")))
        self.vector_store = FAISS(
            embedding_function=self.embedding_model,
            index=index,
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
            distance_strategy=DistanceStrategy.COSINE
        )

        self.store = InMemoryStore()
        self.retriever = ParentDocumentRetriever(
            vectorstore=self.vector_store,
            docstore=self.store,
            child_splitter=self.child_splitter,
        )

    def add_documents(self, docs: "List[Document]"):
        """Split, embed and index ``docs``, printing document and chunk counts."""
        print(f'Number of docs to add to vector store: {len(docs)}')
        self.retriever.add_documents(docs)
        print(f'Number of chunks in vector store: {len(self.vector_store.index_to_docstore_id)}')

    def _search_parents(self, user_query, chunk_count):
        """Search ``chunk_count`` chunks and return their parent documents."""
        self.retriever.search_kwargs = {'k': chunk_count}
        return self.retriever.invoke(user_query)

    def retrieve(self, user_query, k) -> "List[Document]":
        """Return up to ``k`` parent documents for ``user_query``.

        Several of the best chunks can belong to the same parent, so a search
        over ``k`` chunks may yield fewer than ``k`` parents. The chunk budget
        is therefore widened step by step and, as a last resort, every indexed
        chunk is searched. Fewer than ``k`` documents are returned only when
        the whole index cannot provide ``k`` distinct parents.
        """
        total_chunks = len(self.vector_store.index_to_docstore_id)

        for multiplier in range(1, self._WIDENING_ATTEMPTS + 1):
            chunk_count = k * multiplier
            retrieved_docs = self._search_parents(user_query, chunk_count)[:k]

            # Stop once enough parents were found, or when the search already
            # covered the whole index and widening cannot add anything.
            if len(retrieved_docs) == k or chunk_count >= total_chunks:
                return retrieved_docs

        # The bounded widening was not enough: search the whole index once.
        return self._search_parents(user_query, total_chunks)[:k]

In [81]:
# Build the dense index: chunk_size and chunk_overlap are forwarded to the
# tokenizer-based splitter created inside DenseRetriever.
dense_retriever = DenseRetriever(
    embedding_model_name="thenlper/gte-large",
    chunk_size=512,
    chunk_overlap=128,
)
dense_retriever.add_documents(articles)

Number of docs to add to vector store: 303
Number of chunks in vector store: 497


In [82]:
# Score the dense retriever on the same Q&A set as the sparse one.
dense_metrics = evaluate(dense_retriever, qa_data)
format_metrics(dense_metrics)

100%|██████████| 150/150 [00:28<00:00,  5.18it/s]


{'MRR@5': 0.42488888888888904,
 'MRR@10': 0.4364603174603176,
 'nDCG@5': 0.47614369464922124,
 'nDCG@10': 0.5020967192472046,
 'Precision@5': 0.1333333333333331,
 'Precision@10': 0.07933333333333317,
 'Recall@5': 0.5744444444444444,
 'Recall@10': 0.6744444444444445,
 'MAP': 0.4223333333333335}


The dense retrieval results outperform the sparse approach, with MRR@5 at 0.42 and nDCG@5 at 0.48, indicating stronger ranking relevance. Recall is notably higher (Recall@5: 0.57), though precision remains modest (Precision@5: 0.133), reflecting improved retrieval of relevant documents in the top results.

# Now let's ensemble the sparse and dense retrievers to form our baseline retriever

In [83]:
class EnsembleRetriever:
    """Hybrid retriever that fuses dense and sparse rankings.

    Each document scores ``weight / (rank + rrf_c)`` in every list it appears
    in (rank counted from 1, a weighted reciprocal-rank fusion), and the
    summed score orders the final result.
    """

    def __init__(self, dense_retriever, sparse_retriever, dense_w, sparse_w, rrf_c=5):
        """Store the two retrievers and their fusion weights.

        Args:
            dense_retriever: Retriever exposing ``retrieve(user_query, k)``.
            sparse_retriever: Retriever exposing ``retrieve(user_query, k)``.
            dense_w: Weight applied to the dense ranking.
            sparse_w: Weight applied to the sparse ranking.
            rrf_c: Smoothing constant added to the rank in the fusion score;
                larger values flatten the difference between ranks.
                Defaults to 5.
        """
        self.dense_retriever = dense_retriever
        self.sparse_retriever = sparse_retriever
        self.weights = [dense_w, sparse_w]
        self.rrf_c = rrf_c

    def retrieve(self, user_query, k) -> "List[Document]":
        """Return the ``k`` best documents after fusing both rankings.

        Both retrievers are asked for ``k`` documents. Documents are
        identified by ``metadata['id']``; when both lists contain the same id,
        the dense copy is the one returned. Ties keep first-seen order (dense
        results before sparse ones).
        """
        dense_docs = self.dense_retriever.retrieve(user_query, k)
        sparse_docs = self.sparse_retriever.retrieve(user_query, k)

        rrf_score = defaultdict(float)
        first_seen = {}  # id -> first document seen with that id (insertion ordered)
        for doc_list, weight in zip([dense_docs, sparse_docs], self.weights):
            for rank, doc in enumerate(doc_list, start=1):
                doc_id = doc.metadata['id']
                rrf_score[doc_id] += weight / (rank + self.rrf_c)
                first_seen.setdefault(doc_id, doc)

        sorted_docs = sorted(
            first_seen.values(),
            reverse=True,
            key=lambda doc: rrf_score[doc.metadata['id']]
        )
        return sorted_docs[:k]

## Let's search for the best weighting between them

In [84]:
# Sweep the dense weight from 0.0 to 1.0 in steps of 0.1; the sparse weight is
# the complement. Both are rounded to one decimal so that the weights and the
# `results` keys are exact values such as (0.3, 0.7) rather than float noise
# like (0.30000000000000004, 0.7); later cells look up the (1.0, 0.0) column.
results = {}
values = [round(float(w), 1) for w in np.linspace(0.0, 1.0, 11)]
for dense_w in values:
    sparse_w = round(1 - dense_w, 1)
    ensemble_retriever = EnsembleRetriever(dense_retriever, sparse_retriever, dense_w=dense_w, sparse_w=sparse_w)
    metrics = evaluate(ensemble_retriever, qa_data)
    results[(dense_w, sparse_w)] = metrics

100%|██████████| 150/150 [00:28<00:00,  5.21it/s]
100%|██████████| 150/150 [00:28<00:00,  5.23it/s]
100%|██████████| 150/150 [00:28<00:00,  5.32it/s]
100%|██████████| 150/150 [00:29<00:00,  5.16it/s]
100%|██████████| 150/150 [00:29<00:00,  5.03it/s]
100%|██████████| 150/150 [00:30<00:00,  4.96it/s]
100%|██████████| 150/150 [00:29<00:00,  5.16it/s]
100%|██████████| 150/150 [00:28<00:00,  5.29it/s]
100%|██████████| 150/150 [00:28<00:00,  5.32it/s]
100%|██████████| 150/150 [00:28<00:00,  5.23it/s]
100%|██████████| 150/150 [00:28<00:00,  5.29it/s]


In [90]:
# One column per (dense_w, sparse_w) pair, one row per formatted metric.
data = {k: format_metrics(v) for k, v in results.items()}
df = pd.DataFrame(data)

def highlight_based_on_y(row):
    """Style one metric row: mark cells that beat the dense-only column green.

    Args:
        row: Series of one metric, indexed by ``(dense_w, sparse_w)`` pairs.

    Returns:
        One CSS string per cell, in row order: ``'background-color: green'``
        where the value is strictly greater than the dense-only value, ``''``
        otherwise (the dense-only cell itself is never highlighted).

    Raises:
        KeyError: If a non-empty row has no column with a dense weight of 1.
    """
    if len(row) == 0:
        return []

    # The weights come from floating-point arithmetic, so locate the
    # dense-only column with a tolerance instead of exact equality.
    dense_only_keys = [key for key in row.index if np.isclose(key[0], 1.0)]
    if not dense_only_keys:
        raise KeyError('no column with dense weight 1.0 to compare against')

    # Looked up once; it is the same reference value for every cell of the row.
    baseline = row[dense_only_keys[0]]

    return [
        'background-color: green' if key not in dense_only_keys and value > baseline else ''
        for key, value in row.items()
    ]

# Apply the function row-wise, so each metric is compared across the weight
# pairs; the styled frame is the cell's displayed output.
styled_df = df.style.apply(highlight_based_on_y, axis=1)

styled_df

Unnamed: 0_level_0,0.000000,0.100000,0.200000,0.300000,0.400000,0.500000,0.600000,0.700000,0.800000,0.900000,1.000000
Unnamed: 0_level_1,1.000000,0.900000,0.800000,0.700000,0.600000,0.500000,0.400000,0.300000,0.200000,0.100000,0.000000
MRR@5,0.307778,0.309222,0.339222,0.354444,0.390889,0.429667,0.447444,0.436889,0.434667,0.424667,0.424889
MRR@10,0.32105,0.324077,0.34905,0.365786,0.408365,0.447651,0.463222,0.45045,0.444672,0.436643,0.43646
nDCG@5,0.339009,0.340174,0.369861,0.387273,0.424307,0.464349,0.487096,0.483246,0.48587,0.476025,0.476144
nDCG@10,0.370248,0.37306,0.39261,0.413969,0.464808,0.502905,0.519739,0.511386,0.507633,0.502153,0.502097
Precision@5,0.086667,0.086667,0.094667,0.098667,0.106667,0.12,0.129333,0.133333,0.137333,0.133333,0.133333
Precision@10,0.054667,0.054667,0.054667,0.059333,0.070667,0.076667,0.08,0.080667,0.079333,0.079333,0.079333
Recall@5,0.378889,0.378889,0.412222,0.432222,0.465556,0.517778,0.554444,0.574444,0.594444,0.574444,0.574444
Recall@10,0.478889,0.478889,0.478889,0.515556,0.607778,0.657778,0.681111,0.687778,0.674444,0.674444,0.674444
MAP,0.307778,0.309222,0.337556,0.353333,0.389778,0.427111,0.443778,0.431556,0.430444,0.422111,0.422333



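As the table shows, the hybrid retriever with weights (dense 0.6, sparse 0.4) improves the ranking metrics (MRR, nDCG and MAP) by roughly 1–3 percentage points over the dense-only retriever, while Precision@5 and Recall@5 dip slightly. Incorporating the sparse component incurs almost no additional cost, so this configuration will be our baseline retriever.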

In [91]:
# Baseline: the hybrid retriever with the weights selected from the sweep above.
ensemble_retriever = EnsembleRetriever(dense_retriever, sparse_retriever, dense_w=0.6, sparse_w=0.4)
ensemble_metrics = evaluate(ensemble_retriever, qa_data)
format_metrics(ensemble_metrics)

100%|██████████| 150/150 [00:29<00:00,  5.05it/s]


{'MRR@5': 0.4474444444444446,
 'MRR@10': 0.4632962962962965,
 'nDCG@5': 0.48709570822215825,
 'nDCG@10': 0.5194135553127732,
 'Precision@5': 0.1293333333333331,
 'Precision@10': 0.07933333333333317,
 'Recall@5': 0.5544444444444445,
 'Recall@10': 0.6777777777777778,
 'MAP': 0.44377777777777794}

# Advanced Retriever: Ensemble And Reranking

We will use GPT-4o-mini as our reranker. It is applied with a sliding-window algorithm that produces an improved permutation of the candidate documents.

Relevant paper: https://arxiv.org/pdf/2304.09542



In [92]:
class EnsembleAndRerankingRetriever:
    """Hybrid retriever whose merged candidates are reordered by an LLM reranker.

    Candidates from the dense and the sparse retriever are merged (dense
    first, duplicates removed by ``metadata['id']``) and handed to
    ``RankLLMRerank``, which decides the final order.
    """

    def __init__(self, dense_retriever, sparse_retriever ,dense_w, sparse_w):
        """Store the retrievers.

        Args:
            dense_retriever: Retriever exposing ``retrieve(user_query, k)``.
            sparse_retriever: Retriever exposing ``retrieve(user_query, k)``.
            dense_w: Kept for signature parity with ``EnsembleRetriever``;
                not used when ordering, since the reranker decides the order.
            sparse_w: See ``dense_w``.
        """
        self.dense_retriever = dense_retriever
        self.sparse_retriever = sparse_retriever
        self.weights = [dense_w, sparse_w]
        # Rerankers are created lazily and reused, one per requested top_n,
        # instead of building a new client for every query.
        self._rankers = {}

    def _get_ranker(self, top_n):
        """Return the cached reranker for ``top_n``, creating it on first use."""
        ranker = self._rankers.get(top_n)
        if ranker is None:
            ranker = RankLLMRerank(
                top_n=top_n,
                model='gpt',
                step_size=10,
                gpt_model='gpt-4o-mini'
            )
            self._rankers[top_n] = ranker
        return ranker

    def retrieve(self, user_query, k) -> "List[Document]":
        """Return the reranker's top ``k`` documents for ``user_query``.

        Each base retriever is asked for ``2 * k`` documents so the reranker
        has a wider candidate pool to choose from.
        """
        new_k = k * 2   # we going to rerank so lets increase k :)
        dense_docs = self.dense_retriever.retrieve(user_query, new_k)
        sparse_docs = self.sparse_retriever.retrieve(user_query, new_k)

        unique_docs = []
        seen_ids = set()
        for doc in dense_docs + sparse_docs:
            doc_id = doc.metadata['id']
            if doc_id not in seen_ids:
                unique_docs.append(doc)
                seen_ids.add(doc_id)

        if not unique_docs:
            # Nothing to rank: skip the LLM call entirely.
            return []

        compressed_docs = self._get_ranker(k).compress_documents(unique_docs, user_query)
        return list(compressed_docs)

In [93]:
import getpass
import os

# Keep a key that is already configured in the environment, and never store
# one in the notebook itself: prompt for it only when it is missing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

In [94]:
# Evaluate the reranking retriever; it accepts the same weights as the
# baseline ensemble. Every query triggers an LLM call, so this cell is slow.
ranker_and_ensemble_retriever = EnsembleAndRerankingRetriever(
    dense_retriever,
    sparse_retriever,
    dense_w=0.6, sparse_w=0.4
)
ranker_and_ensemble_metrics = evaluate(ranker_and_ensemble_retriever, qa_data)

  0%|          | 0/150 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:03<00:00,  4.00s/it][A
  1%|          | 1/150 [00:04<10:27,  4.21s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:03<00:00,  3.87s/it][A
  1%|▏         | 2/150 [00:08<10:11,  4.13s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:05<00:00,  5.75s/it][A
  2%|▏         | 3/150 [00:14<12:12,  4.98s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:04<00:00,  4.88s/it][A
  3%|▎         | 4/150 [00:19<12:15,  5.04s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:06<00:00,  6.41s/it][A
  3%|▎         | 5/150 [00:26<13:32,  5.61s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:07<00:00,  7.41s/it][A
  4%|▍         | 6/150 [00:33<15:05,  6.29s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:05<00:00,  5.46s/it][A
  5%|▍         | 7/150 [00:39<14:31,  6.09

In [70]:
# Display the reranking retriever's metrics in the flat "Metric@k" layout.
format_metrics(ranker_and_ensemble_metrics)

{'MRR@5': 0.5354444444444445,
 'MRR@10': 0.5472354497354498,
 'nDCG@5': 0.5801305437229533,
 'nDCG@10': 0.6080386218198135,
 'Precision@5': 0.161333333333333,
 'Precision@10': 0.09133333333333313,
 'Recall@5': 0.6811111111111111,
 'Recall@10': 0.7566666666666667,
 'MAP': 0.5312222222222224}

The ensemble + reranking approach shows significant improvements over the baseline: MRR@5 rises to 0.54 and nDCG@5 to 0.58, reflecting the effectiveness of reranking in moving relevant documents towards the top of the list. Precision@5 (0.16), Recall@5 (0.68) and Recall@10 (0.76) also show notable gains, with Recall@5 increasing by more than ten percentage points. Note that the reranker is an LLM, so these figures vary slightly between runs; the summary table below appears to come from a different run and therefore reports slightly different values for this retriever.








# All Retrievers results

In [95]:
# Side-by-side comparison: one column per retriever, one row per metric.
pd.DataFrame({
    'sparse_retriever': format_metrics(sparse_metrics),
    'dense_retriever': format_metrics(dense_metrics),
    'ensemble_retriever': format_metrics(ensemble_metrics),
    'ranker_and_ensemble_metrics': format_metrics(ranker_and_ensemble_metrics)
})

Unnamed: 0,sparse_retriever,dense_retriever,ensemble_retriever,ranker_and_ensemble_metrics
MRR@5,0.307778,0.424889,0.447444,0.524
MRR@10,0.32105,0.43646,0.463296,0.534672
nDCG@5,0.339009,0.476144,0.487096,0.576399
nDCG@10,0.370248,0.502097,0.519414,0.600493
Precision@5,0.086667,0.133333,0.129333,0.162667
Precision@10,0.054667,0.079333,0.079333,0.092
Recall@5,0.378889,0.574444,0.554444,0.687778
Recall@10,0.478889,0.674444,0.677778,0.766667
MAP,0.307778,0.422333,0.443778,0.522889
