In [1]:
import ir_datasets
import tqdm
import pyterrier as pt
from pathlib import Path
import re
import pandas as pd
from pyterrier.measures import RR, nDCG, MAP

In [2]:
pt.datasets.list_datasets()

Unnamed: 0,dataset,topics,topics_lang,qrels,corpus,corpus_lang,index,info_url
0,50pct,"[training, validation]",en,"[training, validation]",,,"[ex2, ex3]",
1,antique,"[train, test]",en,"[train, test]",True,en,,https://ciir.cs.umass.edu/downloads/Antique/re...
2,vaswani,True,en,True,True,en,True,http://ir.dcs.gla.ac.uk/resources/test_collect...
3,msmarco_document,"[train, dev, test, test-2020, leaderboard-2020]",en,"[train, dev, test, test-2020]",True,en,True,https://microsoft.github.io/msmarco/
4,msmarcov2_document,"[train, dev1, dev2, valid1, valid2, trec_2021]",en,"[train, dev1, dev2, valid1, valid2]",,,True,https://microsoft.github.io/msmarco/TREC-Deep-...
...,...,...,...,...,...,...,...,...
763,irds:neuclir,,,,,,,https://ir-datasets.com/neuclir.html
764,irds:neuclir/1,,,,,,,https://ir-datasets.com/neuclir.html#neuclir/1
779,irds:sara,True,en,True,True,en,,https://ir-datasets.com/sara.html
780,trec-deep-learning-docs,"[train, dev, test, test-2020, leaderboard-2020]",en,"[train, dev, test, test-2020]",True,en,True,https://microsoft.github.io/msmarco/


In [3]:
# Load datasets
msmarco = ir_datasets.load("msmarco-passage/train")
antique = ir_datasets.load("antique/test")

In [4]:
print(msmarco)

Dataset(id='msmarco-passage/train', provides=['docs', 'queries', 'qrels', 'scoreddocs', 'docpairs'])


In [5]:
print(msmarco.docs[:10])

<ir_datasets.indices.lz4_pickle.Lz4PickleIter object at 0x0000017FF7F3E750>


In [6]:
def msmarco_gen(limit=100000):
    lastdoc = 0
    for elem in msmarco.docs:
        if lastdoc >= limit:
            break
        yield {
            "docno": elem.doc_id,
            "text": elem.text,
        }
        lastdoc += 1

In [7]:
max(len(text.encode("utf-8")) for _, text in msmarco.docs)

1669

In [8]:
idx_path = Path.cwd() / "indices" / "msmarco_test"
indexer = pt.IterDictIndexer(
    str(idx_path),
    meta={
        "docno": 20,
        "text": 4096,
    },
    stemmer="porter",
    stopwords="terrier",
)

Java started (triggered by TerrierIndexer.__init__) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [9]:
# Dont run this cell if you have already indexed the data
# index_ref = indexer.index(msmarco_gen())

In [10]:
print(antique.docs[0])

GenericDoc(doc_id='2020338_0', text="A small group of politicians believed strongly that the fact that Saddam Hussien remained in power after the first Gulf War was a signal of weakness to the rest of the world, one that invited attacks and terrorism. Shortly after taking power with George Bush in 2000 and after the attack on 9/11, they were able to use the terrorist attacks to justify war with Iraq on this basis and exaggerated threats of the development of weapons of mass destruction. The military strength of the U.S. and the brutality of Saddam's regime led them to imagine that the military and political victory would be relatively easy.")


In [11]:
def antique_gen(limit=100000):
    lastdoc = 0
    for elem in antique.docs:
        if lastdoc >= limit:
            break
        yield {
            "docno": elem.doc_id,
            "text": elem.text,
        }
        lastdoc += 1

In [12]:
max(len(text.encode("utf-8")) for _, text in antique.docs)

4000

In [13]:
idx_path = Path.cwd() / "indices" / "antique_test"
indexer = pt.IterDictIndexer(
    str(idx_path),
    meta={
        "docno": 20,
        "text": 4096,
    },
    stemmer="porter",
    stopwords="terrier",
)

In [14]:
# Don't run this cell if you have already indexed the data
# index_ref = indexer.index(antique_gen())

In [15]:
# Define index paths
index_dir_msmarco = Path.cwd() / "indices" / "msmarco_test"
index_dir_antique = Path.cwd() / "indices" / "antique_test"

# Load the indexes
index_msmarco = pt.IndexFactory.of(str(index_dir_msmarco))
index_antique = pt.IndexFactory.of(str(index_dir_antique))

# Use BM25 as the baseline retriever
retriever_msmarco = pt.BatchRetrieve(index_msmarco, wmodel="BM25")
retriever_antique = pt.BatchRetrieve(index_antique, wmodel="BM25")

  retriever_msmarco = pt.BatchRetrieve(index_msmarco, wmodel="BM25")
  retriever_antique = pt.BatchRetrieve(index_antique, wmodel="BM25")


In [16]:
# Convert qrels to DataFrames
qrels_msmarco = pd.DataFrame(msmarco.qrels_iter())
qrels_antique = pd.DataFrame(antique.qrels_iter())

# Convert queries to DataFrames
queries_msmarco = pd.DataFrame(msmarco.queries_iter())
queries_antique = pd.DataFrame(antique.queries_iter())

# Rename columns for PyTerrier compatibility
qrels_msmarco.rename(columns={"query_id": "qid", "doc_id": "docno", "relevance": "label"}, inplace=True)
qrels_antique.rename(columns={"query_id": "qid", "doc_id": "docno", "relevance": "label"}, inplace=True)

queries_msmarco.rename(columns={"query_id": "qid", "text": "query"}, inplace=True)
queries_antique.rename(columns={"query_id": "qid", "text": "query"}, inplace=True)

In [None]:
def clean_query(query):
    query = query.encode("ascii", "ignore").decode()
    query = query.replace("'", "").replace('"', "").replace("`", "")
    query = re.sub(r"\s+", " ", query).strip()
    return query

queries_msmarco["query"] = queries_msmarco["query"].apply(clean_query)
queries_antique["query"] = queries_antique["query"].apply(clean_query)

In [18]:
queries_msmarco

Unnamed: 0,qid,query
0,121352,define extreme
1,634306,what does chattel mean on credit history
2,920825,what was the great leap forward brainly
3,510633,tattoo fixers how much does it cost
4,737889,what is decentralization process.
...,...,...
808726,633855,what does canada post regulations mean
808727,1059728,wholesale lularoe price
808728,210839,how can i watch the day after
808729,908165,what to use instead of pgp in windows


In [19]:
qrels_msmarco

Unnamed: 0,qid,docno,label,iteration
0,1185869,0,1,0
1,1185868,16,1,0
2,597651,49,1,0
3,403613,60,1,0
4,1183785,389,1,0
...,...,...,...,...
532756,19285,8841362,1,0
532757,558837,4989159,1,0
532758,559149,8841547,1,0
532759,706678,8841643,1,0


In [20]:
bm25_marco = pt.terrier.Retriever(index_msmarco, wmodel="BM25")
bm25_antique = pt.terrier.Retriever(index_antique, wmodel="BM25")

In [21]:
pt.Experiment(
    [bm25_marco],
    queries_msmarco,
    qrels_msmarco,
    eval_metrics=[RR @ 10, nDCG @ 20, MAP],
)

JavaException: JVM exception occurred: Failed to process qid 278900 'how many cars enter the la jolla concours d elegance?' -- Lexical error at line 1, column 54.  Encountered: <EOF> after : "" org.terrier.querying.parser.QueryParserException

In [22]:
pt.Experiment(
    [bm25_antique],
    queries_antique,
    qrels_antique,
    eval_metrics=[RR @ 10, nDCG @ 20, MAP],
)

JavaException: JVM exception occurred: Failed to process qid 3990512 'how can we get concentration onsomething?' -- Lexical error at line 1, column 42.  Encountered: <EOF> after : "" org.terrier.querying.parser.QueryParserException