## Dense Retrieval
Implementation of dense passage retrieval using a state-of-the-art (SOTA) sentence-embedding model and FAISS for nearest-neighbour search. The system is evaluated on the MS MARCO dataset using mean reciprocal rank (MRR) and retrieval time.

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import torch
import faiss
import time
import pandas as pd
import numpy as np

In [None]:
# Load the MS MARCO dataset (configuration "v2.1") with `load_dataset` and
# bind each of its three splits to its own name for use in the cells below.
dataset = load_dataset("ms_marco", "v2.1")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]  # the "validation" split is named eval_dataset here
test_dataset = dataset["test"]

In [None]:
# Optional inspection cell: the body only runs when skip_cell is set to False.
# When enabled it prints the size of each split, the column names of the
# training split, and the value of every column for training row 0.
skip_cell = True
if not skip_cell:
    # display dataset information
    print("Train dataset size:", len(train_dataset))
    print("Validation dataset size:", len(eval_dataset))
    print("Test dataset size:", len(test_dataset))

    # print column labels
    print("Train dataset columns:", train_dataset.column_names, "\n")

    # print sample from the train dataset (row 0, one line per column)
    for col in train_dataset.column_names:
        print(f"Sample {col}: {train_dataset[0][col]}")


In [None]:
def preprocess(data):
    """
    Extracts the queries and passage texts from an MS MARCO split.

    Every split (train, validation, test) is handled identically: the query
    of each record is collected, and all of the record's passage texts are
    appended to one flat passage list.

    Args:
        data: Iterable of records. Each record is read as ``item["query"]``
            and ``item["passages"]["passage_text"]`` (a list of passage
            strings, possibly empty).

    Returns:
        A ``(queries, passages)`` tuple of lists. ``queries`` holds one entry
        per record, in input order. ``passages`` is the concatenation of every
        record's passage texts, in input order; it does not record which query
        a passage came from.
    """
    queries = []
    passages = []
    for item in data:
        queries.append(item["query"])
        # All passages of the record are kept (not only a positive one);
        # extending with an empty list is a no-op, so no length guard is needed.
        passages.extend(item["passages"]["passage_text"])
    return queries, passages

# Preprocess the train, validation, and test datasets.
# Each call returns the split's queries and a flat list of all its passage texts.
train_queries, train_passages = preprocess(train_dataset)
validation_queries, validation_passages = preprocess(eval_dataset)
test_queries, test_passages = preprocess(test_dataset)

# Example of the first training query and the first entry of the flat
# training passage list.
print("\nExample:")
print(f"Query: {train_queries[0]}")
print(f"Passage: {train_passages[0]}")   

In [None]:
# Load the sentence-embedding model used to embed queries and passages.
model_name = "all-MiniLM-L6-v2"
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer(model_name)
# Move the model to the selected device.
model.to(device)

def generate_embeddings(model, texts):
    """
    Generates embeddings for a list of texts using the given model.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=False)``,
            e.g. the ``SentenceTransformer`` loaded in this notebook.
        texts: The texts to embed; passed to ``model.encode`` unchanged.

    Returns:
        The embeddings as a NumPy array.
    """
    embeddings = model.encode(texts, convert_to_tensor=False)
    # np.asarray returns an existing ndarray as-is, whereas np.array would
    # always copy it; this avoids briefly holding a large embedding matrix
    # in memory twice. Non-array results (e.g. lists) are still converted.
    return np.asarray(embeddings)

# Embed the queries and the flat passage list of every split with the model
# loaded above. Each call passes the complete list to generate_embeddings.
# NOTE(review): presumably the training passage list is very large, so these
# calls may be slow and memory-hungry — confirm all three splits are needed.

# Generate embeddings for training data
train_query_embeddings = generate_embeddings(model, train_queries)
train_passage_embeddings = generate_embeddings(model, train_passages)

# Generate embeddings for validation data
validation_query_embeddings = generate_embeddings(model, validation_queries)
validation_passage_embeddings = generate_embeddings(model, validation_passages)

# Generate embeddings for test data
test_query_embeddings = generate_embeddings(model, test_queries)
test_passage_embeddings = generate_embeddings(model, test_passages)
