# Dense Retrieval
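Implementation of dense passage retrieval using a Sentence-Transformers MiniLM encoder (`msmarco-MiniLM-L6-cos-v5`, pretrained on MS MARCO) and a FAISS index. Evaluated on a BEIR dataset (MS MARCO, HotpotQA, or Climate-FEVER) using Mean Reciprocal Rank (MRR) and retrieval time per query.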


## Load Dataset

In [None]:
from ir_datasets import load
from ranx import Qrels
from itertools import islice

# choose dataset
# dataset_name = "beir/msmarco/dev"
dataset_name = "beir/hotpotqa/dev"
# dataset_name = "beir/climate-fever"

# choose subset of docs
# subset = 1_000_000
# subset = 2_000_000
subset = 3_000_000

# number of queries to evaluate on
num_queries = 500

# load dataset and its relevance judgements
dataset = load(dataset_name)
qrels = Qrels.from_ir_datasets(dataset_name)

# load a subset of docs and queries as {id: text} mappings;
# islice takes only the first `subset` docs / `num_queries` queries from the iterators
docs = {d.doc_id: d.text for d in islice(dataset.docs_iter(), subset)}
queries = {q.query_id: q.text for q in islice(dataset.queries_iter(), num_queries)}

# # uncomment to load full dataset
# docs = {d.doc_id: d.text for d in dataset.docs_iter()}
# queries = {q.query_id: q.text for q in dataset.queries_iter()}

# print the size and one sample entry of each collection.
# next(iter(...)) peeks at the first entry without copying the whole mapping
# (up to `subset` items) into a temporary list; None is shown if it is empty.
first_doc = next(iter(docs.items()), None)
first_query = next(iter(queries.items()), None)
first_qrel = next(iter(qrels.to_dict().items()), None)

print(f'DOCS ({len(docs)}): {first_doc} \n')
print(f'QUERIES ({len(queries)}): {first_query} \n')
print(f'QRELS ({len(qrels)}): {first_qrel} \n')

## Data Preprocessing

In [None]:
# extract document and query IDs + texts
# (kept as parallel lists: position i in doc_ids/query_ids matches text i)
doc_ids, doc_texts = list(docs.keys()), list(docs.values())
query_ids, query_texts = list(queries.keys()), list(queries.values())

# clean qrels - remove queries and doc_ids that don't exist in the loaded subset.
# Membership is tested against the `docs` / `queries` dicts (hash lookup)
# instead of the ID lists, where every `in` would be a linear scan.
qrels_dict = qrels.to_dict()

filtered_qrels_dict = {}
for qid, dids in qrels_dict.items():
    if qid not in queries:
        continue
    kept = {did: rel for did, rel in dids.items() if did in docs}
    # drop queries whose judged docs all fall outside the subset
    if kept:
        filtered_qrels_dict[qid] = kept

qrels = Qrels.from_dict(filtered_qrels_dict)

## Document Embedding

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
import torch

# sentence-transformers encoder that was pretrained on MS MARCO
model_name = "msmarco-MiniLM-L6-cos-v5"
model = SentenceTransformer(model_name)

# run on the GPU when one is available; the GPU also gets the larger batch size
if torch.cuda.is_available():
    device, batch_size = "cuda", 32
else:
    device, batch_size = "cpu", 16
model.eval()  # switch the model to eval mode before encoding

# documents and queries are encoded with the exact same settings
encode_options = {
    "batch_size": batch_size,
    "convert_to_tensor": True,
    "device": device,
    "show_progress_bar": True,
}
document_embeddings = model.encode(doc_texts, **encode_options)
query_embeddings = model.encode(query_texts, **encode_options)

# FAISS works on float32 numpy arrays, so copy the tensors to the CPU and convert
doc_embs = document_embeddings.cpu().numpy().astype("float32")
query_embs = query_embeddings.cpu().numpy().astype("float32")

## Build FAISS Index

In [None]:
import faiss

# L2-normalize both embedding matrices with faiss, so that the
# inner-product index below scores by cosine similarity
for embeddings in (doc_embs, query_embs):
    faiss.normalize_L2(embeddings)

# embedding dimensionality = number of columns of the document matrix
dim = doc_embs.shape[1]
print(dim)

# flat inner-product index over all document embeddings
index = faiss.IndexFlatIP(dim)
index.add(doc_embs)
print(index.ntotal)  # number of vectors stored in the index


## Evaluation

### Retrieval Time

In [None]:
import time

# do KNN search for all queries at once and report the average time per query
k = 10

# perf_counter is the clock intended for measuring short durations: it is
# monotonic and high-resolution, unlike time.time(), which follows the
# (adjustable) system wall clock
start_time = time.perf_counter()
distances, indices = index.search(query_embs, k)
elapsed = time.perf_counter() - start_time

# total batch search time averaged over the number of queries
retrieval_time = elapsed / len(query_ids)
print(f"Retrieval time per query: {retrieval_time:.6f} seconds")

### Mean Reciprocal Rank (MRR)

In [None]:
from ranx import Run, evaluate

# calculate MRR

# Run: stores the relevance scores estimated by the model under evaluation
# map results for each query_id -> { doc_id: score }
run = {}
for qid, row_indices, row_scores in zip(query_ids, indices, distances):
    hits = {
        doc_ids[idx]: float(score)
        for idx, score in zip(row_indices, row_scores)
        # FAISS fills missing neighbours with label -1 (fewer than k vectors
        # in the index); without this guard doc_ids[-1] would silently
        # attribute the slot to the last document
        if idx >= 0
    }
    # NOTE(review): queries without any hit are left out of the run;
    # make_comparable=True below is relied on to reconcile the query sets
    # of run and qrels - confirm against the ranx documentation
    if hits:
        run[qid] = hits

# convert to Run object
run_rx = Run(run)

# measure MRR
mrr = evaluate(qrels=qrels, run=run_rx, metrics="mrr", make_comparable=True)
print(f"MRR: {mrr:.8f}")


In [None]:
from pathlib import Path

# short dataset label used in the output path;
# anything that is neither hotpot nor msmarco is labelled climate-fever
if "hotpot" in dataset_name:
    name = "hotpot"
elif "msmarco" in dataset_name:
    name = "msmarco"
else:
    name = "climate-fever"

# # save run to google drive
# from google.colab import drive
# drive.mount('/gdrive')
# run_rx.save(f"/gdrive/MyDrive/{name}_{subset}_dense.json")

# save locally; create data/<name>/ first so the save does not fail
# on a fresh checkout where the directory does not exist yet
output_path = Path("data") / name / f"{subset}_dense.json"
output_path.parent.mkdir(parents=True, exist_ok=True)
run_rx.save(str(output_path))

# re-print results
print(f"dataset: {dataset_name}, num of docs: {len(docs)}")
print(f"Dense retrieval time per query: {retrieval_time:.6f} seconds")
print(f"MRR: {mrr:.8f}")