In [2]:
import pyterrier as pt
import pandas as pd

In [4]:
indexer = pt.IterDictIndexer('./indices/msmarco-passage')
index = indexer.path
bm25 = pt.terrier.Retriever(index, wmodel="BM25", num_results=10) 

In [7]:
ds_topics = pt.get_dataset('irds:msmarco-passage/trec-dl-2019/judged')
ds_text = pt.get_dataset('irds:msmarco-passage')
topics = ds_topics.get_topics()

In [8]:
def get_text_for_docnos(docnos, dataset):
    """Efficiently get text for specific document IDs"""
    doc_texts = {}
    
    # Convert all docnos to the format we'll try
    docno_variants = {}
    for docno in docnos:
        str_docno = str(docno)
        docno_variants[str_docno] = docno
        # Also try integer version if it's numeric
        if str_docno.isdigit():
            docno_variants[int(str_docno)] = docno
    
    # Only iterate through corpus until we find all needed documents
    needed_ids = set(docno_variants.keys())
    found_count = 0
    
    corpus_iter = dataset.get_corpus_iter()
    for doc in corpus_iter:
        doc_id = doc['docno']
        
        # Check if this document is one we need
        if doc_id in needed_ids:
            original_docno = docno_variants[doc_id]
            doc_texts[original_docno] = doc['text']
            found_count += 1
            
            # Stop early if we found all documents we need
            if found_count >= len(docnos):
                break
    
    # Fill in missing documents
    for docno in docnos:
        if docno not in doc_texts:
            doc_texts[docno] = "Document text not available"
    
    return doc_texts

In [11]:
# Process each query with BM25 only, but also add text in the same format as MonoT5
all_results = []

for idx, row in topics.iterrows():
    query_id = row['qid']
    query_text = row['query']
    
    print(f"Processing query {idx+1}/{len(topics)}: {query_text}")
    
    # Get BM25 results - no reranking needed
    results = bm25.search(query_text)
    
    # Add text to the results
    docnos = results['docno'].tolist()
    doc_texts = get_text_for_docnos(docnos, ds_text)
    results['text'] = results['docno'].map(doc_texts)
    
    # Add query information to results
    results['qid'] = query_id
    results['query'] = query_text
    
    # Add docid (same as docno) to match MonoT5 format
    results['docid'] = results['docno']
    
    # Convert rank to 0-based if needed
    if 'rank' in results.columns and results['rank'].min() == 1:
        results['rank'] = results['rank'] - 1
    # Create rank column if missing
    elif 'rank' not in results.columns:
        results['rank'] = range(len(results))
    
    # Reorder columns to match MonoT5 CSV structure
    results = results[['qid', 'docid', 'docno', 'query', 'text', 'score', 'rank']]
    
    # Add to collection
    all_results.append(results)
    
    # Show progress for first query
    if idx == 0:
        print("\nSample results for first query:")
        print(results[['docno', 'score', 'text']].head(3))

# Combine all results
combined_results = pd.concat(all_results) if all_results else pd.DataFrame()

# Save results to CSV
combined_results.to_csv('bm25_results.csv', index=False)

print(f"\nRetrieved {len(combined_results)} documents across {len(topics)} queries")
print(f"Results saved to bm25_results.csv")

# Show summary of results
print("\nTop documents for first few queries:")
print(combined_results.groupby('qid').head(1)[['qid', 'docno', 'score']].head(5))

Processing query 1/43: do goldfish grow


msmarco-passage documents:   0%|          | 0/8841823 [00:00<?, ?it/s]

msmarco-passage documents:  93%|█████████▎| 8182166/8841823 [00:23<00:01, 341771.26it/s]



Sample results for first query:
     docno      score                                               text
0  8182161  31.156444  A: While goldfish can grow up to 18 inches in ...
1  6139386  31.032333  A: The conditions goldfish are kept in plus th...
2  3288600  30.953843  A goldfish will grow to the depth of the water...
Processing query 2/43: what is wifi vs bluetooth


msmarco-passage documents:  92%|█████████▏| 8160519/8841823 [00:25<00:02, 325800.25it/s]


Processing query 3/43: why did the us volunterilay enter ww1


msmarco-passage documents:  69%|██████▉   | 6093907/8841823 [00:19<00:08, 319468.03it/s]


Processing query 4/43: definition declaratory judgment


msmarco-passage documents:  97%|█████████▋| 8612906/8841823 [00:29<00:00, 291506.01it/s]


Processing query 5/43: right pelvic pain causes


msmarco-passage documents:  63%|██████▎   | 5583224/8841823 [00:18<00:10, 303933.00it/s]


Processing query 6/43: what are the social determinants of health


msmarco-passage documents:  84%|████████▎ | 7384325/8841823 [00:24<00:04, 299478.20it/s]


Processing query 7/43: does legionella pneumophila cause pneumonia


msmarco-passage documents:  44%|████▍     | 3908773/8841823 [00:12<00:15, 314556.48it/s]


Processing query 8/43: how is the weather in jamaica


msmarco-passage documents:  93%|█████████▎| 8255705/8841823 [00:26<00:01, 311602.81it/s]


Processing query 9/43: types of dysarthria from cerebral palsy


msmarco-passage documents:  97%|█████████▋| 8617271/8841823 [00:28<00:00, 304066.22it/s]


Processing query 10/43: who is robert gray


msmarco-passage documents:  99%|█████████▉| 8760873/8841823 [00:27<00:00, 318835.51it/s]


Processing query 11/43: what types of food can you cook sous vide


msmarco-passage documents:  93%|█████████▎| 8178998/8841823 [00:24<00:02, 327642.54it/s]


Processing query 12/43: how long is life cycle of flea


msmarco-passage documents:  75%|███████▌  | 6641238/8841823 [00:22<00:07, 299993.84it/s]


Processing query 13/43: what can contour plowing reduce


msmarco-passage documents:  91%|█████████ | 8052624/8841823 [00:32<00:03, 251408.95it/s]


Processing query 14/43: when was the salvation army founded


msmarco-passage documents:  98%|█████████▊| 8689057/8841823 [00:31<00:00, 277007.99it/s]


Processing query 15/43: what is a active margin


msmarco-passage documents:  92%|█████████▏| 8093931/8841823 [00:25<00:02, 315401.95it/s]


Processing query 16/43: difference between rn and bsn


msmarco-passage documents:  94%|█████████▎| 8283527/8841823 [00:28<00:01, 295289.53it/s]


Processing query 17/43: medicare s definition of mechanical ventilation


msmarco-passage documents:  62%|██████▏   | 5475308/8841823 [00:17<00:10, 309785.09it/s]


Processing query 18/43: how to find the midsegment of a trapezoid


msmarco-passage documents:  69%|██████▉   | 6108721/8841823 [00:21<00:09, 289088.71it/s]


Processing query 19/43: what is an aml surveillance analyst


msmarco-passage documents:  92%|█████████▏| 8106324/8841823 [00:27<00:02, 293645.80it/s]


Processing query 20/43: what is the daily life of thai people


msmarco-passage documents:  92%|█████████▏| 8139258/8841823 [00:31<00:02, 258809.45it/s]


Processing query 21/43: definition of a sigmet


msmarco-passage documents:  99%|█████████▊| 8710813/8841823 [00:26<00:00, 325455.38it/s]


Processing query 22/43: cost of interior concrete flooring


msmarco-passage documents:  96%|█████████▌| 8495099/8841823 [00:26<00:01, 325896.79it/s]


Processing query 23/43: what is the most popular food in switzerland


msmarco-passage documents:  98%|█████████▊| 8683095/8841823 [00:26<00:00, 326995.19it/s]


Processing query 24/43: how are some sharks warm blooded


msmarco-passage documents:  94%|█████████▎| 8273763/8841823 [00:25<00:01, 327402.64it/s]


Processing query 25/43: what is durable medical equipment consist of


msmarco-passage documents:  99%|█████████▉| 8754404/8841823 [00:26<00:00, 330679.89it/s]


Processing query 26/43: exons definition biology


msmarco-passage documents:  99%|█████████▉| 8794308/8841823 [00:30<00:00, 288630.34it/s]


Processing query 27/43: define visceral


msmarco-passage documents:  93%|█████████▎| 8204463/8841823 [00:28<00:02, 289541.96it/s]


Processing query 28/43: tracheids are part of


msmarco-passage documents:  96%|█████████▌| 8492305/8841823 [00:30<00:01, 281983.51it/s]


Processing query 29/43: rsa definition key


msmarco-passage documents:  89%|████████▊ | 7839906/8841823 [00:26<00:03, 298871.01it/s]


Processing query 30/43: who formed the commonwealth of independent states


msmarco-passage documents:  93%|█████████▎| 8220089/8841823 [00:28<00:02, 286676.23it/s]


Processing query 31/43: causes of left ventricular hypertrophy


msmarco-passage documents:  94%|█████████▍| 8332546/8841823 [00:27<00:01, 303899.80it/s]


Processing query 32/43: lps laws definition


msmarco-passage documents:  97%|█████████▋| 8536118/8841823 [00:27<00:00, 309503.72it/s]


Processing query 33/43: what are the three percenters


msmarco-passage documents:  86%|████████▌ | 7573692/8841823 [00:24<00:04, 309822.47it/s]


Processing query 34/43: causes of military suicide


msmarco-passage documents: 100%|█████████▉| 8819116/8841823 [00:28<00:00, 311087.86it/s]


Processing query 35/43: what is theraderm used for


msmarco-passage documents:  98%|█████████▊| 8651776/8841823 [00:27<00:00, 313300.12it/s]


Processing query 36/43: what is famvir prescribed for


msmarco-passage documents:  92%|█████████▏| 8117094/8841823 [00:26<00:02, 309010.48it/s]


Processing query 37/43: anthropological definition of environment


msmarco-passage documents:  95%|█████████▌| 8412687/8841823 [00:26<00:01, 311860.82it/s]


Processing query 38/43: axon terminals or synaptic knob definition


msmarco-passage documents:  98%|█████████▊| 8641107/8841823 [00:27<00:00, 311878.76it/s]


Processing query 39/43: is cdg airport in main paris


msmarco-passage documents:  95%|█████████▌| 8433858/8841823 [00:27<00:01, 308233.55it/s]


Processing query 40/43: example of monotonic function


msmarco-passage documents:  99%|█████████▉| 8757181/8841823 [00:27<00:00, 315555.45it/s]


Processing query 41/43: what is physical description of spruce


msmarco-passage documents:  92%|█████████▏| 8128798/8841823 [00:26<00:02, 311552.66it/s]


Processing query 42/43: hydrogen is a liquid below what temperature


msmarco-passage documents:  89%|████████▉ | 7911557/8841823 [00:25<00:02, 312925.24it/s]


Processing query 43/43: difference between a mcdouble and a double cheeseburger


msmarco-passage documents:  95%|█████████▌| 8434626/8841823 [00:27<00:01, 309881.63it/s]


Retrieved 425 documents across 43 queries
Results saved to bm25_results.csv

Top documents for first few queries:
       qid    docno      score
0   156493  8182161  31.156444
0  1110199  3838645  36.361917
0  1063750  4788295  30.340022
0   130510  1494936  41.166240
0   489204  1479542  38.213874



