# Dense Retrieval
Implementation of dense passage retrieval using DistilBERT and FAISS. Evaluated on the MS MARCO dataset using MRR and retrieval time.


## Load Dataset

In [1]:
from dataset_loader import DatasetLoader

# Load cranfield dataset for testing functions, use MS MARCO, HotpotQA, and potentially Climate-FEVER for real evaluation
loader = DatasetLoader("cranfield")
docs, queries, qrels = loader.get_all()
loader.print_info()

# # load multiple datasets for evaluation
# nameset = ["beir/msmarco/dev", "bier/hotpotqa/dev", "bier/climate-fever/dev"]
#
# # Dictionary to hold datasets
# datasets = {}
#
# for name in nameset:
#     loader = DatasetLoader(name)
#     docs, queries, qrels = loader.get_all()
#     datasets[name] = {
#         "docs": docs,
#         "queries": queries,
#         "qrels": qrels
#     }
#     loader.print_info()


DATASET: cranfield
DOCS (1400): ('1', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was found to agree\nwell with a potential flow theory .\n  an empirical evaluation of the destalling effects was made for\nthe specific configuration of the experiment .') 

QUERIES (225): ('1', 'what simi

## Data Preprocessing

In [2]:
# extract document and query IDs + texts for embedding
doc_ids, doc_texts = list(docs.keys()), list(docs.values())
query_ids, query_texts = list(queries.keys()), list(queries.values())
#
# print(doc_ids[:5])
# print(doc_texts[:5])
#
# print(query_ids[:5])
# print(query_texts[:5])


## Document Embedding

In [3]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
import torch
# load encoder model pretrained on MS MARCO
model_name = "msmarco-MiniLM-L6-cos-v5"
model = SentenceTransformer(model_name)

# use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 32 if device == "cuda" else 16
model.eval() # put in eval mode to speed up inference

# encode documents and queries
document_embeddings = model.encode(doc_texts,
                                    batch_size=batch_size,
                                    convert_to_tensor=True,
                                    device=device,
                                    show_progress_bar=True,
                                    normalize_embeddings=True)
query_embeddings = model.encode(query_texts,
                                batch_size=batch_size,
                                convert_to_tensor=True,
                                device=device,
                                show_progress_bar=True,
                                normalize_embeddings=True)

# convert to numpy array for FAISS
# doc_embs = document_embeddings.cpu().numpy().astype("float32")
# query_embs = query_embeddings.cpu().numpy().astype("float32")

# convert to ubinary to save memory space
doc_embs = quantize_embeddings(document_embeddings, "ubinary")
query_embs = quantize_embeddings(query_embeddings, "ubinary")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/88 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

## Build FAISS Index

In [5]:
import faiss

# normalize embeddings with faiss
# faiss.normalize_L2(doc_embs)
# faiss.normalize_L2(query_embs)

# build index, use cosine similarity
dim = doc_embs.shape[1]
print(dim)

index = faiss.IndexFlatIP(dim)
index.add(doc_embs)
print(index.ntotal)


48
1400


In [5]:
print(doc_embs.shape)
print(query_embs.shape)

(1400, 768)
(225, 768)


## Evaluation

### Retrieval Time

In [None]:
import time

# do KNN Search and return retrieval time
k = 10
start_time = time.time()
distances, indices = index.search(query_embs, k)
retrieval_time = (time.time() - start_time) / len(query_ids)
print(f"Retrieval time per query: {retrieval_time:.6f} seconds")

In [None]:
# import numpy as np
# from tqdm import tqdm
#
# k = 10
# batch_size = 32  # Tune based on available RAM
# num_queries = query_embs.shape[1]
# all_distances = []
# all_indices = []
#
# for i in tqdm(range(0, num_queries, batch_size)):
#     end = min(i + batch_size, num_queries)
#     query_batch = query_embs[i:end]
#
#     # FAISS search for the batch
#     distances, indices = index.search(query_batch, k)
#
#     all_distances.append(distances)
#     all_indices.append(indices)
#
# # Concatenate results
# all_distances = np.vstack(all_distances)
# all_indices = np.vstack(all_indices)

  0%|          | 0/8 [00:00<?, ?it/s]

### Mean Reciprocal Rank (MRR)

In [None]:
from ranx import Run, evaluate

# calculate MRR

# Run: stores the relevance scores estimated by the model under evaluation
# map results for each query_id -> { doc_id: score }
run = {
    query_ids[i]: {
        doc_ids[indices[i][j]]: float(distances[i][j]) for j in range(k)
    }
    for i in range(len(query_ids))
}

# convert to Run object
run_rx = Run(run)
run_rx.save("dense_run.json")

# measure MRR
mrr = evaluate(qrels, run_rx, "mrr", make_comparable=True)
print(f"MRR: {mrr:.4f}")
