## Sparse Retrieval
This notebook implements sparse passage retrieval with TF-IDF and BM25, and evaluates both using Mean Reciprocal Rank (MRR) and per-query retrieval time.

## Load Dataset

In [None]:
from ir_datasets import load
from ranx import Qrels
from itertools import islice

# choose dataset
# dataset_name = "beir/msmarco/dev"
# dataset_name = "beir/hotpotqa/dev"
dataset_name = "beir/climate-fever"

# choose subset of docs
subset = 1_000_000
# subset = 2_000_000
# subset = 3_000_000

# ==== do not change below ====

# load dataset
dataset = load(dataset_name)
qrels = Qrels.from_ir_datasets(dataset_name)

# load a subset of docs and queries (islice stops the iterators early,
# so only the requested prefix is read)
docs = {d.doc_id: d.text for d in islice(dataset.docs_iter(), subset)}  # first `subset` docs
queries = {q.query_id: q.text for q in islice(dataset.queries_iter(), 500)}  # first 500 queries

# # uncomment to load full dataset
# docs = {d.doc_id: d.text for d in dataset.docs_iter()}
# queries = {q.query_id: q.text for q in dataset.queries_iter()}

# Preview one entry of each collection. next(iter(...)) peeks at the first item
# without copying the whole mapping into a list (docs may hold millions of
# entries); the None default keeps the preview from raising on an empty mapping.
print(f'DOCS ({len(docs)}): {next(iter(docs.items()), None)} \n')
print(f'QUERIES ({len(queries)}): {next(iter(queries.items()), None)} \n')
print(f'QRELS ({len(qrels)}): {next(iter(qrels.to_dict().items()), None)} \n')

## Data Preprocessing

In [None]:
# extract document and query IDs + texts
# (the ID lists stay positionally aligned with the text lists)
doc_ids, doc_texts = list(docs.keys()), list(docs.values())
query_ids, query_texts = list(queries.keys()), list(queries.values())

# Sets give O(1) membership tests; testing against the lists would scan up to
# every loaded doc ID for each relevance judgment.
doc_id_set = set(doc_ids)
query_id_set = set(query_ids)

# clean qrels - keep only loaded queries, and only judged docs that exist in the subset
qrels_dict = qrels.to_dict()

# Filter each loaded query's judgments exactly once (lazily) ...
subset_judgments = (
    (qid, {did: rel for did, rel in dids.items() if did in doc_id_set})
    for qid, dids in qrels_dict.items()
    if qid in query_id_set
)

# ... then drop queries that are left without any judged doc in the subset.
filtered_qrels_dict = {qid: kept for qid, kept in subset_judgments if kept}

qrels = Qrels.from_dict(filtered_qrels_dict)

## Build Retriever

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import time

# TF-IDF
print("Building TF-IDF")
# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start_tfidf = time.perf_counter()

tfidf = TfidfVectorizer(lowercase=True, stop_words="english", max_features=50_000)
doc_tfidf = tfidf.fit_transform(doc_texts)  # learns the vocabulary/IDF and vectorizes the docs
query_tfidf = tfidf.transform(query_texts)  # queries are projected onto the doc vocabulary

# the reported build time covers fitting plus vectorizing both docs and queries
finish_tfidf = time.perf_counter() - start_tfidf
print(f"Finished TF-IDF: {finish_tfidf:.4f}s")

# BM25

print("Building BM25")
start_bm25 = time.perf_counter()

# Reuse the TF-IDF vectorizer's preprocessor and tokenizer for BM25.
# NOTE: these two steps alone do not apply the vectorizer's stop-word list or
# its max_features cut-off (those belong to the full analyzer / fitted
# vocabulary), so the BM25 token lists still contain stop words.
preprocessor = tfidf.build_preprocessor()
tokenizer = tfidf.build_tokenizer()

tokenized_docs = [tokenizer(preprocessor(doc)) for doc in doc_texts]
tokenized_queries = [tokenizer(preprocessor(query)) for query in query_texts]

bm25 = BM25Okapi(tokenized_docs)

# the reported build time covers tokenizing docs and queries plus index construction
finish_bm25 = time.perf_counter() - start_bm25
print(f"Finished BM25: {finish_bm25:.4f}s")


## Evaluation

### Retrieval Time

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

k = 10 # top-k docs


def _top_k_indices(scores, k):
    """Return the indices of the k highest entries of a 1-D score array, best first.

    k is clamped to the number of scores so that a corpus smaller than k does
    not make np.argpartition raise.
    """
    k = min(k, len(scores))
    if k <= 0:
        return np.empty(0, dtype=np.intp)
    # argpartition selects the k largest without sorting the whole array;
    # only those k candidates are then sorted in descending order.
    candidates = np.argpartition(scores, -k)[-k:]
    return candidates[np.argsort(scores[candidates])[::-1]]


# ====== TF-IDF ======
# Queries are scored in batches so one similarity call covers many queries.
batch_size = 100
tfidf_scores = {}

# perf_counter is a monotonic, high-resolution clock meant for timing code
start_time = time.perf_counter()

for i in tqdm(range(0, query_tfidf.shape[0], batch_size), desc="TF-IDF - Retrieving"):
    batch_queries = query_tfidf[i:i + batch_size]
    # NOTE(review): TfidfVectorizer L2-normalizes rows by default, so a plain
    # dot product should give the same ranking without re-normalizing the
    # doc matrix on every batch - confirm before switching.
    sims = cosine_similarity(batch_queries, doc_tfidf)

    # build dict of scores: query id -> {doc id: similarity} for the top-k docs
    for bi, sim_row in enumerate(sims):
        qid = query_ids[i + bi]  # rows of query_tfidf follow the order of query_ids
        top_k_idx = _top_k_indices(sim_row, k)

        tfidf_scores[qid] = {
            doc_ids[j]: float(sim_row[j]) for j in top_k_idx
        }

total_time = time.perf_counter() - start_time

retrieval_time = total_time / len(query_ids)
print(f"TF-IDF retrieval time per query: {retrieval_time:.6f} seconds")

# ====== BM25 ======
# Score each query against the corpus
bm25_scores = {}
bm25_total_time = 0  # total time for all queries

for query_id, query_tokens in tqdm(zip(query_ids, tokenized_queries), total=len(query_ids), desc="BM25 - Retrieving"):
    # Time scoring, top-k selection and result building together so the
    # figure covers the same steps as the TF-IDF measurement above.
    start_time = time.perf_counter()

    # get scores for all documents
    scores = bm25.get_scores(query_tokens)

    # get top-k scores
    top_k_idx = _top_k_indices(scores, k)

    # build dict of scores
    bm25_scores[query_id] = {
        doc_ids[i]: float(scores[i])
        for i in top_k_idx
    }

    bm25_total_time += time.perf_counter() - start_time


# Average retrieval time per query
bm25_time = bm25_total_time / len(query_ids)
print(f"BM25 Retrieval time per query: {bm25_time:.6f} seconds")

### Mean Reciprocal Rank (MRR)

In [None]:
from ranx import Run, evaluate

# Compute MRR for both retrievers.
# A Run stores the relevance scores estimated by the model under evaluation.
# make_comparable=True removes query ids that are not in both qrels and run.
mrr_options = {"metrics": "mrr", "make_comparable": True}

tfidf_run = Run.from_dict(tfidf_scores, name="tfidf")
tfidf_mrr = evaluate(qrels=qrels, run=tfidf_run, **mrr_options)

bm25_run = Run.from_dict(bm25_scores, name="bm25")
bm25_mrr = evaluate(qrels=qrels, run=bm25_run, **mrr_options)

print(f"TF-IDF MRR: {tfidf_mrr:.6f}")
print(f"BM25 MRR: {bm25_mrr:.6f}")

In [None]:
# Persist both runs under data/<dataset short name>/.
# The short name is the first known key found in dataset_name; any other
# dataset falls back to "climate-fever".
name = next(
    (key for key in ("hotpot", "msmarco") if key in dataset_name),
    "climate-fever",
)

tfidf_path = f"data/{name}/{subset}_tfidf_run.json"
bm25_path = f"data/{name}/{subset}_bm25_run.json"
tfidf_run.save(tfidf_path)
bm25_run.save(bm25_path)

# Repeat the headline numbers at the end of the notebook for convenience.
summary_lines = [
    f"dataset: {name}, num of docs: {len(docs)}",
    f"TF-IDF retrieval time per query: {retrieval_time:.6f} seconds",
    f"BM25 Retrieval time per query: {bm25_time:.6f} seconds",
    f"TF-IDF MRR: {tfidf_mrr:.6f}",
    f"BM25 MRR: {bm25_mrr:.6f}",
]
for line in summary_lines:
    print(line)