In [1]:
import bm25s
import Stemmer  # optional: for stemming
import ir_datasets
import pyterrier as pt
import polars as pl
from bidict import bidict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pt.init()

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [None]:
trec_covid = ir_datasets.load("beir/trec-covid")

In [None]:
qrels = pl.DataFrame(data=trec_covid.qrels_iter())
collection = pl.DataFrame(data=trec_covid.docs_iter())
queries = pl.DataFrame(data=trec_covid.queries_iter())

In [None]:
qrels.head()

In [None]:
queries.head()

In [None]:
collection.head()

In [None]:
# Create your corpus here
corpus = collection["text"]

# optional: create a stemmer
stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus and only keep the ids (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

In [None]:
# Create the BM25 model and index the corpus
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

In [None]:
query_tokens = bm25s.tokenize(queries["text"], stopwords="en", stemmer=stemmer)

In [None]:
docs, scores = retriever.retrieve(query_tokens, k=1000)

In [None]:
docs

In [None]:
query = 2
num_docs = 10

In [None]:
docs = docs[query, :num_docs]

In [None]:
queries.item(query, "text")

In [None]:
collection[docs]["text"].to_list()

In [None]:
x = trec_covid.docs_iter()

In [None]:
for y in x:
    pass

In [5]:
from typing import Iterable, NamedTuple, List
from itertools import tee
from bidict import bidict
import pandas as pd


class BM25s(pt.Transformer):
    def __init__(self, dataset, k: int = 1000):
        #TODO: assert that doc_id and text are in the dataset's NamedTuple
        self.idx_to_docid = {
            idx : doc["docno"] 
            for idx, doc in enumerate(dataset.get_corpus_iter(verbose=False)) 
        }
        self.k = k
        
        self.stemmer = Stemmer.Stemmer("english")
        corpus_tokens = bm25s.tokenize(
            texts=(doc["text"] for doc in dataset.get_corpus_iter(verbose=False)),
            stopwords="english",
            stemmer=self.stemmer,
        )
        self.retriever = bm25s.BM25()
        self.retriever.index(corpus_tokens)
    
    def transform(self, topics : pd.DataFrame) -> pd.DataFrame:
        idx_to_qid = {idx : qid for idx, qid in enumerate(topics["qid"])}
        query_tokens = bm25s.tokenize(topics["query"], stemmer=self.stemmer)
        results, scores = self.retriever.retrieve(query_tokens, k=self.k)

        return pd.DataFrame([
            {"qid": idx_to_qid[q_idx], "docno": self.idx_to_docid[doc_idx], "score": score, "rank": rank}
            for q_idx, (doc_idxs, doc_scores) in enumerate(zip(results, scores))
            for rank, (doc_idx, score) in enumerate(zip(doc_idxs, doc_scores))
        ])


In [13]:
dataset = pt.get_dataset("irds:beir/fiqa")

In [14]:
bm25s_transform = BM25s(dataset, k=1000);

[INFO] [starting] building docstore
[INFO] [starting] opening zip file                                              
[INFO] If you have a local copy of https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/17918ed23cd04fb15047f73e6c3bd9d9
[INFO] [starting] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip
                                                                                
[A                                                                                                                    [INFO] [finished] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip: [00:01] [17.9MB] [16.7MB/s]
[INFO] [finished] opening zip file [2.06s]                                      
docs_iter: 100%|██████████████████████| 57638/57638 [00:03<00:00, 15772.66doc/s]
[INFO] [finished] docs_iter: [00:03] [57638doc] [15768.52doc/s]
[INFO] [fin

In [15]:
from pyterrier.measures import nDCG

pt.Experiment(
    [bm25s_transform],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[nDCG@10],
    names=["BM25"],
)

[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [1ms]


AssertionError: beir/fiqa doesn't support get_qrels