## Sparse Retrieval
This notebook implements sparse passage retrieval with TF-IDF and BM25. Both retrievers are evaluated on the MS MARCO dataset using mean reciprocal rank (MRR) and retrieval time.

In [2]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import numpy as np
import time

In [3]:
# load the MS MARCO v2.1 dataset and pull out its three splits
dataset = load_dataset("ms_marco", "v2.1")
train_dataset, eval_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [4]:
# Set skip_cell to False to print the split sizes, columns and a sample row.
skip_cell = True
if not skip_cell:
    # display dataset information
    for split_label, split in (
        ("Train", train_dataset),
        ("Validation", eval_dataset),
        ("Test", test_dataset),
    ):
        print(f"{split_label} dataset size:", len(split))

    # print column labels
    column_names = train_dataset.column_names
    print("Train dataset columns:", column_names, "\n")

    # print one value per column, taken from the first training record
    for col in column_names:
        print(f"Sample {col}: {train_dataset[0][col]}")

In [5]:
def preprocess(data):
    """
    Extract the queries and a flat pool of passages from an MS MARCO split.

    Every split is handled the same way, so no inspection of the dataset
    schema is needed (the previous "answers"-based branches were identical).

    Args:
        data: Iterable of records. Each record must provide ``item["query"]``
            and ``item["passages"]["passage_text"]`` (a list of strings).

    Returns:
        A ``(queries, passages)`` tuple of lists:
        - ``queries`` holds one entry per record, in iteration order.
        - ``passages`` is the concatenation of *all* candidate passages of
          every record. It is therefore usually longer than ``queries`` and
          is NOT index-aligned with it.
    """
    queries = []
    passages = []
    for item in data:
        queries.append(item["query"])
        # extend() with an empty list is a no-op, so records without
        # passages need no special casing.
        passages.extend(item["passages"]["passage_text"])
    return queries, passages

# Run the train, validation, and test splits through the same extraction step
train_queries, train_passages = preprocess(train_dataset)
validation_queries, validation_passages = preprocess(eval_dataset)
test_queries, test_passages = preprocess(test_dataset)

# Show the first training query and the first pooled passage
print(
    "\nExample:",
    f"Query: {train_queries[0]}",
    f"Passage: {train_passages[0]}",
    sep="\n",
)


Example:
Query: )what was the immediate impact of the success of the manhattan project?
Passage: The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.


In [None]:
class SparseRetriever:
    """
    Sparse retrieval (TF-IDF and BM25) over a fixed list of passages.

    Usage: construct with the passages, call ``build_indices()`` once, then
    query with ``tfidf_search()`` or ``bm25_search()``. Indices returned by
    the search methods are positions in the ``passages`` list.
    """

    def __init__(self, passages):
        """
        Store the passages and pre-tokenize them for BM25.

        Args:
            passages: List of passage strings to index.
        """
        self.passages = passages
        # BM25 is fed lowercased, whitespace-split tokens; queries are
        # tokenized the same way in bm25_search().
        self.tokenized_passages = [p.lower().split() for p in passages]

    def build_indices(self):
        """
        Fit the TF-IDF vectorizer and the BM25 index on the stored passages.

        Must be called before either search method; sets ``self.tfidf``,
        ``self.tfidf_vectors`` and ``self.bm25``.
        """
        # TF-IDF
        self.tfidf = TfidfVectorizer(
            lowercase=True,
            stop_words='english',
            ngram_range=(1, 2)
        )
        self.tfidf_vectors = self.tfidf.fit_transform(self.passages)

        # BM25
        self.bm25 = BM25Okapi(self.tokenized_passages)

    @staticmethod
    def _top_k(scores, top_k):
        """Return (indices, scores) of the ``top_k`` best entries, best first."""
        scores = np.asarray(scores).ravel()
        if top_k <= 0:
            # A slice like [-0:] would select *everything*, and a negative
            # top_k would slice from the wrong end; return nothing instead.
            no_hits = np.empty(0, dtype=np.intp)
            return no_hits, scores[no_hits]
        top_indices = np.argsort(scores)[-top_k:][::-1]
        return top_indices, scores[top_indices]

    def tfidf_search(self, query, top_k=10):
        """
        Rank passages by the dot product of their TF-IDF vectors with the query's.

        Args:
            query: Query string.
            top_k: Maximum number of results; values <= 0 yield empty results.

        Returns:
            ``(top_indices, top_scores)`` as NumPy arrays, highest score first.
            At most ``min(top_k, len(passages))`` entries are returned.
        """
        query_vec = self.tfidf.transform([query])
        # ravel() (not squeeze()) keeps the scores 1-D even when only one
        # passage is indexed; squeeze() would produce an un-indexable 0-d array.
        scores = query_vec.dot(self.tfidf_vectors.T).toarray().ravel()
        return self._top_k(scores, top_k)

    def bm25_search(self, query, top_k=10):
        """
        Rank passages by BM25 score against the whitespace-tokenized query.

        Args:
            query: Query string.
            top_k: Maximum number of results; values <= 0 yield empty results.

        Returns:
            ``(top_indices, top_scores)`` as NumPy arrays, highest score first.
            At most ``min(top_k, len(passages))`` entries are returned.
        """
        tokenized_query = query.lower().split()
        scores = self.bm25.get_scores(tokenized_query)
        return self._top_k(scores, top_k)

# Index the pooled training passages; the constructor tokenizes them for BM25.
retriever = SparseRetriever(train_passages)
# Fit the TF-IDF vectorizer and the BM25 index (required before any search).
retriever.build_indices()