In [1]:
import ir_datasets
import tqdm
import pyterrier as pt
print(pt.__version__)

pt.init()
from pathlib import Path
import re
import pandas as pd
from pyterrier.measures import RR, nDCG, MAP

0.13.0


Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [2]:
msmarco = ir_datasets.load("msmarco-passage/train/judged")
print(msmarco)
print(dir(msmarco))

Dataset(id='msmarco-passage/train/judged', provides=['docs', 'queries', 'qrels', 'scoreddocs', 'docpairs'])
['EXTENSIONS', 'PREFIX', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__get__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__self__', '__self_class__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__thisclass__', '__weakref__', '_affected_files', '_cls', '_count', '_dataset', '_dataset_id', '_datatype', '_dlc', '_documentation', '_format_3col', '_iter', '_key', '_lazy_qids', '_message', '_metadata', '_metadata_provider', '_migrate', '_mode', '_negate_score', '_path', '_qrels_defs', '_qrels_dlc', '_qrels_internal_iter', '_queries_handler', '_queries_lang', '_queries_namespace', '_read_version', '_scoreddocs_dlc', '_scoreddocs_handler', '_skip_first_line', '_s

In [3]:
# OPTIONAL - Index the data

idx_path = Path.cwd() / "indices" / "msmarco_train_judged"

def msmarco_gen(limit=100000):
    lastdoc = 0
    for elem in msmarco.docs:
        if lastdoc >= limit:
            break
        yield {
            "docno": elem.doc_id,
            "text": elem.text,
        }
        lastdoc += 1

max(len(text.encode("utf-8")) for _, text in msmarco.docs)

if not idx_path.exists() or not any(idx_path.iterdir()):
    indexer = pt.IterDictIndexer(
        str(idx_path),
        meta={
            "docno": 20,
            "text": 4096,
        },
        stemmer="porter",
        stopwords="terrier",
    )

    index_ref = indexer.index(msmarco_gen())
else:
    print("Indices already exist, skipping creation")

Indices already exist, skipping creation


In [4]:
# Define index paths
index_dir_msmarco = Path.cwd() / "indices" / "msmarco_train_judged"
# Load the indexes
index_msmarco = pt.IndexFactory.of(str(index_dir_msmarco))
#index_antique = pt.IndexFactory.of(str(index_dir_antique))

# Use BM25 as the baseline retriever
#retriever_msmarco = pt.BatchRetrieve(index_msmarco, wmodel="BM25")
retriever_msmarco = pt.terrier.Retriever(index_msmarco, wmodel="BM25", num_results=100)
#retriever_msmarco = retriever_msmarco.batch(batch_size=1000)
#retriever_antique = pt.BatchRetrieve(index_antique, wmodel="BM25")

In [5]:
# Convert qrels to DataFrames
qrels_msmarco = pd.DataFrame(msmarco.qrels_iter())
#qrels_antique = pd.DataFrame(antique.qrels_iter())

# Convert queries to DataFrames
queries_msmarco = pd.DataFrame(msmarco.queries_iter())
#queries_antique = pd.DataFrame(antique.queries_iter())

# Rename columns for PyTerrier compatibility
qrels_msmarco.rename(columns={"query_id": "qid", "doc_id": "docno", "relevance": "label"}, inplace=True)
#qrels_antique.rename(columns={"query_id": "qid", "doc_id": "docno", "relevance": "label"}, inplace=True)

queries_msmarco.rename(columns={"query_id": "qid", "text": "query"}, inplace=True)
#queries_antique.rename(columns={"query_id": "qid", "text": "query"}, inplace=True)

In [6]:
def clean_query(query):
    query = query.encode("ascii", "ignore").decode()
    query = query.replace("'", "").replace('"', "").replace("`", "")
    query = re.sub(r"\s+", " ", query).strip()
    return query

queries_msmarco["query"] = queries_msmarco["query"].apply(clean_query)
#queries_antique["query"] = queries_antique["query"].apply(clean_query)

In [7]:
#Model imports
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model
MODEL_ID = "prhegde/t5-query-reformulation-RL"
tokenizer = T5Tokenizer.from_pretrained(MODEL_ID)
model = T5ForConditionalGeneration.from_pretrained(MODEL_ID)
model.eval()


  from .autonotebook import tqdm as notebook_tqdm





T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [8]:
print("CUDA Available:", torch.cuda.is_available())
print("Current Device:", torch.cuda.current_device())
print("Device Name:", torch.cuda.get_device_name(0))
print("Device Count:", torch.cuda.device_count())

CUDA Available: True
Current Device: 0
Device Name: NVIDIA GeForce RTX 3060
Device Count: 1


In [9]:
from joblib import Parallel, delayed

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Parallel clean queries (CPU)
print("🔹 Step 1: Cleaning queries (CPU)...")
queries_msmarco["cleaned_query"] = Parallel(n_jobs=-1)(
    delayed(clean_query)(q) for q in queries_msmarco["query"]
)
print("✅ Step 1 Complete: Query cleaning finished!\n")

# Rewrite queries (GPU with batching)
print("🔹 Step 2: Rewriting queries (GPU)...")
batch_size = 64
queries = list(queries_msmarco["cleaned_query"])
rewritten_queries = []

for i in range(0, len(queries), batch_size):
    print(f"  🔄 Processing batch {i//batch_size + 1}/{len(queries)//batch_size + 1}...")
    batch = queries[i : i + batch_size]
    input_ids = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
    with torch.no_grad():
        output = model.generate(input_ids, max_length=35, num_beams=1, do_sample=True, repetition_penalty=1.8)
    rewritten_queries.extend(tokenizer.batch_decode(output, skip_special_tokens=True))

print("✅ Step 2 Complete: Query rewriting finished!\n")
queries_msmarco["rewritten_query"] = rewritten_queries


🔹 Step 1: Cleaning queries (CPU)...
✅ Step 1 Complete: Query cleaning finished!

🔹 Step 2: Rewriting queries (GPU)...
  🔄 Processing batch 1/7859...
  🔄 Processing batch 2/7859...
  🔄 Processing batch 3/7859...
  🔄 Processing batch 4/7859...
  🔄 Processing batch 5/7859...
  🔄 Processing batch 6/7859...
  🔄 Processing batch 7/7859...
  🔄 Processing batch 8/7859...
  🔄 Processing batch 9/7859...
  🔄 Processing batch 10/7859...
  🔄 Processing batch 11/7859...
  🔄 Processing batch 12/7859...
  🔄 Processing batch 13/7859...
  🔄 Processing batch 14/7859...
  🔄 Processing batch 15/7859...
  🔄 Processing batch 16/7859...
  🔄 Processing batch 17/7859...
  🔄 Processing batch 18/7859...
  🔄 Processing batch 19/7859...
  🔄 Processing batch 20/7859...
  🔄 Processing batch 21/7859...
  🔄 Processing batch 22/7859...
  🔄 Processing batch 23/7859...
  🔄 Processing batch 24/7859...
  🔄 Processing batch 25/7859...
  🔄 Processing batch 26/7859...
  🔄 Processing batch 27/7859...
  🔄 Processing batch 28/785

In [10]:
def clean_query_slow(query):
    query = query.replace("\n", " ").replace("\r", " ").replace("?", "")
    query = query.encode("ascii", "ignore").decode()
    query = re.sub(r"[\"`]", "", query)
    query = re.sub(r"\s+", " ", query).strip()
    query = re.sub(r"[^\w\s]", "", query)
    return query
print("🔹 Step 3: Cleaning queries (CPU)...")

queries_msmarco["cleaned_rewritten_query"] = queries_msmarco["rewritten_query"].apply(clean_query_slow)
# Remove empty or whitespace-only queries
queries_msmarco = queries_msmarco[queries_msmarco["cleaned_rewritten_query"].str.strip() != ""]



🔹 Step 3: Cleaning queries (CPU)...


In [22]:
bm25_msmarco = pt.terrier.Retriever(index_msmarco, wmodel="BM25")

In [23]:
# queries_msmarco['query'] = queries_msmarco['cleaned_rewritten_query']

# pt.Experiment(
#     [bm25_msmarco],
#     queries_msmarco,  # Use rewritten queries
#     qrels_msmarco,
#     eval_metrics=[RR @ 10, nDCG @ 20, MAP],
# )

import numpy as np

# Assuming pt.init() is already done
# Your full query set
queries = queries_msmarco.copy()
queries['query'] = queries['cleaned_rewritten_query']

# Define your model
model = bm25_msmarco

# Split into batches (e.g., 5 batches)
n_batches = 100
query_batches = np.array_split(queries, n_batches)

# Store results
all_results = []

for i, query_batch in enumerate(query_batches):
    print(f"Running batch {i + 1}/{n_batches}...")

    # Evaluate using the modern approach
    evaluator = pt.Experiment(
        [bm25_msmarco],
        query_batch, 
        qrels_msmarco,
        eval_metrics=[RR@10, nDCG@20, MAP],
        perquery=False
    )

    all_results.append(evaluator)


Running batch 1/100...
Running batch 2/100...
Running batch 3/100...
Running batch 4/100...
Running batch 5/100...
Running batch 6/100...
Running batch 7/100...
Running batch 8/100...
Running batch 9/100...
Running batch 10/100...
Running batch 11/100...
Running batch 12/100...
Running batch 13/100...
Running batch 14/100...
Running batch 15/100...
Running batch 16/100...
Running batch 17/100...
Running batch 18/100...
Running batch 19/100...
Running batch 20/100...
Running batch 21/100...
Running batch 22/100...
Running batch 23/100...
Running batch 24/100...
Running batch 25/100...
Running batch 26/100...
Running batch 27/100...
Running batch 28/100...
Running batch 29/100...
Running batch 30/100...
Running batch 31/100...
Running batch 32/100...
Running batch 33/100...
Running batch 34/100...
Running batch 35/100...
Running batch 36/100...
Running batch 37/100...
Running batch 38/100...
Running batch 39/100...
Running batch 40/100...
Running batch 41/100...
Running batch 42/100...
R

In [24]:


# Combine all batch results into one DataFrame
combined_df = pd.concat(all_results, ignore_index=True)

# Calculate the mean for each metric
average_results = combined_df.mean(numeric_only=True)

# Format and print results as a nice table
summary_table = pd.DataFrame(average_results).T  # Transpose for prettier display
summary_table.index = ['Average']
print(summary_table)

            RR@10   nDCG@20       AP
Average  0.001329  0.001799  0.00135
