In [12]:
cd /content/drive/MyDrive/UMass/"685 PROJECT"/"LSR-VQ"

/content/drive/MyDrive/UMass/685 PROJECT/LSR-VQ


## Download Dataset (Don't run!!! Data is already downloaded)

In [None]:
# # Download collection (pId -> passage text)
# !wget -P data/raw/ https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz

# # Download queries (qId -> query text)
# !wget -P data/raw/ https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz

# # Download qRels Dev and Train
# !wget -P data/raw/ https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv
# !wget -P data/raw/ https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv

# # Download training data - qId positive_pId and negative_pId
# !wget -P data/raw/ https://msmarco.z22.web.core.windows.net/msmarcoranking/qidpidtriples.train.full.2.tsv.gz

--2025-03-31 19:45:49--  https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz
Resolving msmarco.z22.web.core.windows.net (msmarco.z22.web.core.windows.net)... 20.150.34.1
Connecting to msmarco.z22.web.core.windows.net (msmarco.z22.web.core.windows.net)|20.150.34.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1035009698 (987M) [application/octet-stream]
Saving to: ‘data/collection.tar.gz’


2025-03-31 19:46:09 (49.1 MB/s) - ‘data/collection.tar.gz’ saved [1035009698/1035009698]



In [None]:
# # unzip files
# !tar -xf "data/raw/queries.tar.gz"
# !tar -xf "data/raw/collection.tar.gz"
# !tar -xf "data/raw/qidpidtriples.train.full.2.tsv.gz"

[0m[01;34mraw[0m/


## Dataset

In [9]:
import pandas as pd
import csv

In [13]:
# Read the tsv file as a dictionary
def open_file(file_path, keys = [0, 1]):
    with open(file_path, mode = "r", encoding = "utf-8") as file:
        reader = csv.reader(file, delimiter= "\t")
        data = {row[keys[0]]: row[keys[1]] for row in reader}

    return data

# Load and preprocess the dataset
def load_and_preprocess_dataset():
    # Load all passages
    passages = open_file("data/collection.tsv")

    # Load all train, dev & eval queries
    queries_train = open_file("data/queries.train.tsv")
    queries_dev = open_file("data/queries.dev.tsv")
    queries_eval = open_file("data/queries.eval.tsv")

    # Load qRels train & dev
    # These only have relevant files i.e binary relevance judgment score
    # will be 1 for all the entries
    qrels_train = open_file("data/qrels.train.tsv", keys = [0, 2])
    qrels_dev = open_file("data/qrels.dev.tsv", keys = [0, 2])

    return passages, queries_train, queries_dev, queries_eval, qrels_train, qrels_dev

In [14]:
passages, queries_train, queries_dev, queries_eval, qrels_train, qrels_dev = load_and_preprocess_dataset()

In [19]:
def print_samples(file_name, dict_, n = 2):
    print('-' * 15)
    print(f'Samples for {file_name}:')
    samples = list(dict_.items())[:2]
    for entry in samples:
        print(entry)

print_samples("passages", passages)
print_samples("queries_train", queries_train)
print_samples("queries_dev", queries_dev)
print_samples("queries_eval", queries_eval)
print_samples("qrels_train", qrels_train)
print_samples("qrels_dev", qrels_dev)

---------------
Samples for passages:
('0', 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.')
('1', 'The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science.')
---------------
Samples for queries_train:
('121352', 'define extreme')
('634306', 'what does chattel mean on credit history')
---------------
Samples for queries_dev:
('1048578', 'cost of endless pools/swim spa')
('1048579', 'what is pcnt')
---------------
Samples for queries_eval:
('786436', 'what is prescribed to treat thyroid storm')
('9', ' Refer to the data. Diminishing returns begin to occur with the hiring of the _________ unit of

## Evaluate Model

In [5]:
# Install required libraries
!pip install torch ir_datasets faiss-cpu wandb numpy scikit-learn sentence-transformers transformers tqdm scipy matplotlib rank-eval faiss-gpu-cu11

Collecting ir_datasets
  Using cached ir_datasets-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting rank-eval
  Downloading rank_eval-0.1.3-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-n

In [5]:
# Import libraries
import torch
import ir_datasets
import faiss
import wandb
import heapq
import time
import sys
import random
import string
import os
import pickle
import math

import torch.nn as nn
import torch.optim as optim
import numpy as np

from sentence_transformers import SentenceTransformer
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
from sklearn.metrics import ndcg_score, recall_score
from collections import defaultdict
from scipy.sparse import csr_matrix
from collections import defaultdict
from tqdm import tqdm
from rank_eval import Qrels, Run, evaluate

In [16]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco')
model = AutoModel.from_pretrained('facebook/contriever-msmarco').to(device)

# Mean pooling
def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings

In [2]:
# Evaluate model
def evaluate_model(model, passages, dev_queries, dev_qrels, tokenizer, device, batch_size = 32, batch_size_inference = 128, min_weight = 1e-5, query_topk = 128, doc_topk = 1024, quick_mode = True):
    base_model = model.module if isinstance(model, nn.DataParallel) else model

    model.eval()
    all_results = {}  # Store results for each query

    # Filter passages if in quick mode
    if quick_mode:
        relevant_passage_ids = set()
        for qid in dev_qrels:
            relevant_passage_ids.update(dev_qrels[qid])
        passages = {passage_id: passages[passage_id] for passage_id in relevant_passage_ids}
        print(f"Quick mode: using {len(passages)} passages for evaluation")

    # Filter queries if in quick mode
    if quick_mode:
        dev_queries = dict(list(dev_queries.items())[:200])
        dev_qrels = {qid: dev_qrels[qid] for qid in dev_queries if qid in dev_qrels}
        print(f"Quick mode: using {len(dev_queries)} queries for evaluation")

    # Dense retrieval using FAISS
    print("Building FAISS index...")
    passage_ids = list(passages.keys())
    passage_embeddings = []

    # Encode all passages
    for i in tqdm(range(0, len(passage_ids), batch_size_inference), desc = "Encoding passages"):
        batch_passages = [passages[pid] for pid in passage_ids[i:i + batch_size_inference]]

        # Pad till the model's configured max_len (512)
        passage_inputs = tokenizer(batch_passages, padding = True, truncation = True, return_tensors = 'pt')
        passage_inputs = {k: v.to(device) for k, v in passage_inputs.items()}

        with torch.no_grad():
            outputs = model(passage_inputs["input_ids"], passage_inputs["attention_mask"])
            batch_embeddings = mean_pooling(outputs[0], passage_inputs['attention_mask'])
            passage_embeddings.append(batch_embeddings.cpu().numpy())

    passage_embeddings = np.vstack(passage_embeddings)

    # Build FAISS index
    dimension = passage_embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(passage_embeddings)

    # Evaluate queries
    with torch.no_grad():
        for qid, query in tqdm(dev_queries.items(), desc = "Evaluating"):
            if qid not in dev_qrels:
                continue

            # Pad till the model's configured max_len (512)
            query_input = tokenizer(query, padding = True, truncation = True, return_tensors = 'pt')
            query_input = {k: v.to(device) for k, v in query_input.items()}
            output = model(query_input["input_ids"], query_input["attention_mask"])
            query_embedding = mean_pooling(output[0], query_input['attention_mask'])
            query_embedding = query_embedding.cpu().numpy()

            # Search using FAISS
            scores, indices = index.search(query_embedding, 1000)
            search_results = [(passage_ids[idx], float(score)) for idx, score in zip(indices[0], scores[0])]
            all_results[qid] = search_results

    # Create rank_eval Run and Qrels objects
    run = Run()
    qrels = Qrels()

    # Add results to Run object
    for qid in all_results:
        passage_ids = [str(passage_id) for passage_id, score in all_results[qid]]
        scores = [float(score) for _, score in all_results[qid]]
        run.add(qid, passage_ids, scores)

    # Add relevance judgments to Qrels object
    for qid in dev_qrels:
        qrels.add(qid, [str(doc_id) for doc_id in dev_qrels[qid]], [1] * len(dev_qrels[qid]))

    # Evaluate using rank_eval
    metrics = ["ndcg@10", "ndcg@100", "ndcg@1000", "recall@10", "recall@100", "recall@1000", "mrr@10"]
    results = evaluate(qrels, run, metrics)

    return (
        results["mrr@10"],
        {
            '10': results["ndcg@10"],
            '100': results["ndcg@100"],
            '1000': results["ndcg@1000"]
        },
        {
            '10': results["recall@10"],
            '100': results["recall@100"],
            '1000': results["recall@1000"]
        }
    )


In [17]:
mrr_10, ndcg, recall = evaluate_model(model = model, passages = passages, dev_queries = queries_dev, dev_qrels = qrels_dev, tokenizer = tokenizer, device = device)

Quick mode: using 10 passages for evaluation
Quick mode: using 200 queries for evaluation
Building FAISS index...


Encoding passages: 100%|██████████| 1/1 [00:00<00:00,  1.13it/s]
Evaluating: 100%|██████████| 200/200 [00:01<00:00, 124.09it/s]
  scores[i] = _ndcg(qrels[i], run[i], k, jarvelin)


In [18]:
# Metrics
print("Final Evaluation")
print(f"MRR@10: {mrr_10:.4f}")
print(f"nDCG@10: {ndcg['10']:.4f}, nDCG@100: {ndcg['100']:.4f}, nDCG@1000: {ndcg['1000']:.4f}")
print(f"Recall@10: {recall['10']:.4f}, Recall@100: {recall['100']:.4f}, Recall@1000: {recall['1000']:.4f}")

Final Evaluation
MRR@10: 0.7461
nDCG@10: 0.7981, nDCG@100: 0.7981, nDCG@1000: 0.7981
Recall@10: 1.0000, Recall@100: 1.0000, Recall@1000: 1.0000
