In [3]:
import os
# Stop JAX from planting itself on every GPU and pre-allocating 75% of the memory on GPU:0:
# switch XLA pre-allocation off and expose only device 0 to this process.
os.environ.update(
    {
        "XLA_PYTHON_CLIENT_PREALLOCATE": "false",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [4]:
import bm25s
import Stemmer
import polars as pl
import pandas as pd
import pyterrier as pt
import ir_datasets
from typing import Optional, List, Dict, Any, Tuple
from pyterrier.measures import nDCG, R, AP
from collections import namedtuple

  from .autonotebook import tqdm as notebook_tqdm
2024-07-09 13:20:07.729558: W external/xla/xla/service/gpu/nvptx_compiler.cc:765] The NVIDIA driver's CUDA version is 12.3 which is older than the ptxas CUDA version (12.5.82). Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


In [5]:
# Start the Terrier backend for PyTerrier. The pt.started() guard lets this
# cell be re-run on a live kernel without initialising PyTerrier a second time.
if not pt.started():
    pt.init()

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [6]:
def load_dataset(dataset_name: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load an `ir_datasets` dataset into PyTerrier-style DataFrames.

    Args:
        dataset_name: Identifier passed to `ir_datasets.load`.

    Returns:
        A `(corpus, queries, qrels)` tuple of DataFrames with columns
        `[docno, text]`, `[qid, query]` and `[qid, docno, label]` respectively.
        When the documents carry a non-empty `title`, it is prepended to the
        text as `"<title> | <text>"`.
    """
    dataset = ir_datasets.load(dataset_name)

    queries = pd.DataFrame(dataset.queries_iter())[["query_id", "text"]].rename(
        columns={"query_id": "qid", "text": "query"}
    )

    # Select the qrels columns by name rather than by position so that a
    # different field order cannot silently pick the wrong columns.
    qrels = pd.DataFrame(dataset.qrels_iter())[["query_id", "doc_id", "relevance"]].rename(
        columns={"query_id": "qid", "doc_id": "docno", "relevance": "label"}
    )

    corpus = pd.DataFrame(dataset.docs_iter())

    if "title" in corpus.columns:
        # A missing title would turn the concatenated text into NaN, and an
        # empty title would leave a dangling " | " prefix, so only documents
        # with a real title get the prefix.
        titles = corpus["title"].fillna("")
        has_title = titles.str.strip().ne("")
        corpus["text"] = corpus["text"].where(~has_title, titles + " | " + corpus["text"])

    corpus = corpus[["doc_id", "text"]].rename(columns={"doc_id": "docno"})

    return corpus, queries, qrels


In [22]:
# Load the BEIR MS MARCO dev split as (corpus, queries, qrels) DataFrames.
# The first run downloads the dataset archive and iterates over every document,
# so expect this cell to take a few minutes.
corpus, queries, qrels = load_dataset("beir/msmarco/dev")

[INFO] [starting] opening zip file
[INFO] If you have a local copy of https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/msmarco.zip, you can symlink it here to avoid downloading it again: /home/rjha/.ir_datasets/downloads/444067daf65d982533ea17ebd59501e4
[INFO] [starting] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/msmarco.zip
[INFO] [finished] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/msmarco.zip: [00:27] [1.08GB] [39.6MB/s]
[INFO] [finished] opening zip file [27.57s]                                                                
[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [0ms]
[INFO] [starting] building docstore
[INFO] [starting] opening zip file                                              
[INFO] [finished] opening zip file [2ms]                                        
docs_iter: 100%|██████████████████| 8841823/8841823 [01:36<00:00, 91629.34doc/s]
[INFO] [finished] docs_iter: [01:36] [8841

In [16]:
# Lightweight record types mirroring the DataFrame schemas used in this notebook.

# a document: [docno, text]
Document = namedtuple("Document", "docno text")
# a query: [qid, query]
Query = namedtuple("Query", "qid query")
# a relevance judgement: [qid, docno, label]
Qrel = namedtuple("Qrel", "qid docno label")
# one row of a retrieval results pd.DataFrame: [qid, docno, score, rank]
Result = namedtuple("Result", "qid docno score rank")


In [9]:
class BM25s(pt.Transformer):
    """PyTerrier transformer that retrieves from a `bm25s` index over an in-memory corpus.

    Args:
        corpus: DataFrame with `docno` and `text` columns. `text` is indexed
            (unless an index is loaded) and `docno` labels the retrieved hits.
        lang: Stopword setting passed to `bm25s.tokenize` for documents and queries.
        stemmer: Optional stemmer passed to `bm25s.tokenize`.
        index_path: Where to save the index after building it, or where to
            load it from when `load_index` is true. `None` keeps the index in
            memory only.
        load_index: Load an existing index from `index_path` instead of
            building one from `corpus`.
        k: Maximum number of hits to return per query (default 1000, the
            previously hard-coded value).

    Raises:
        ValueError: If `load_index` is true but no `index_path` was given.
    """

    def __init__(self, corpus: pd.DataFrame, lang="en", stemmer: Optional[Stemmer.Stemmer] = None, index_path: Optional[str] = None, load_index: bool = False, k: int = 1000):
        # Fail early with a clear message instead of handing `None` to bm25s.
        if load_index and index_path is None:
            raise ValueError("load_index=True requires index_path to be set")

        self.corpus = corpus
        self.lang = lang
        self.stemmer = stemmer
        self.index_path = index_path
        self.k = k

        if load_index:
            print(f"Loading index from {self.index_path}")
            self.retriever = bm25s.BM25.load(self.index_path)
        else:
            self.retriever = bm25s.BM25()
            corpus_tokens = bm25s.tokenize(corpus["text"].to_list(), stopwords=lang, stemmer=self.stemmer)
            self.retriever.index(corpus_tokens)

            if self.index_path is not None:
                print(f"Saving index to {self.index_path}")
                self.retriever.save(self.index_path)

    def transform(self, queries: pd.DataFrame) -> pd.DataFrame:
        """Retrieve up to `k` documents for every query.

        Args:
            queries: DataFrame with `qid` and `query` columns.

        Returns:
            DataFrame with `qid`, `docno`, `score` and `rank` columns; `rank`
            starts at 1 and follows the order in which bm25s returns the hits.
        """
        columns = list(Result._fields)
        # Keep the output schema stable even when there is nothing to retrieve.
        if len(queries) == 0:
            return pd.DataFrame(columns=columns)

        corpus_docnos = self.corpus["docno"].to_list()
        # Never ask for more hits than there are documents (small subsampled
        # corpora can hold fewer than `k` documents).
        top_k = min(self.k, len(corpus_docnos))

        query_token_ids = bm25s.tokenize(queries["query"].to_list(), stopwords=self.lang, stemmer=self.stemmer)
        docnos, scores = self.retriever.retrieve(query_token_ids, corpus=corpus_docnos, k=top_k)
        rows = [
            Result(qid=qid, docno=docno, score=score, rank=rank)
            for qid, docnos_i, scores_i in zip(queries["qid"], docnos, scores)
            for rank, (docno, score) in enumerate(zip(docnos_i, scores_i), start=1)
        ]
        return pd.DataFrame(rows, columns=columns)

In [10]:
def subsample_corpus(corpus, qrels, results, top_k=50):
    if top_k < 0: return corpus
    judged = set(qrels.docno)
    top_k_docnos = set(results.groupby(by="qid").apply(lambda x: x.nlargest(top_k, 'score'), include_groups=False).reset_index(drop=True).docno)
    keep_docnos = judged.union(top_k_docnos)
    print(top_k, len(keep_docnos))
    return corpus[corpus['docno'].isin(keep_docnos)]


In [19]:
def dataset_experiment(dataset: str, corpus: pd.DataFrame, queries: pd.DataFrame, qrels: pd.DataFrame, top_ks: Optional[List[int]] = None, load_index: bool = False, load_sub_indices: bool = False) -> Tuple[pd.DataFrame, Dict[int, int]]:
    """Compare BM25 on the full corpus against BM25 on subsampled corpora.

    The full corpus is indexed and searched once. For every cutoff in `top_ks`
    a smaller corpus is built from the judged documents plus each query's
    top-k full-corpus hits, indexed separately, and all systems are evaluated
    together with `pt.Experiment`.

    Args:
        dataset: Short dataset name; indexes are stored under `indexes/<dataset>/`.
        corpus: DataFrame with `docno` and `text` columns.
        queries: DataFrame with `qid` and `query` columns.
        qrels: DataFrame with `qid`, `docno` and `label` columns.
        top_ks: Subsampling cutoffs; defaults to `[1, 10, 50, 100, 250, 500]`.
        load_index: Load the full-corpus index from disk instead of rebuilding it.
        load_sub_indices: Load the subsampled-corpus indexes from disk instead
            of rebuilding them.

    Returns:
        A `(results, subsampled_corpus_size)` tuple: the `pt.Experiment` output
        with one row per system, and a `{top_k: number of documents}` mapping
        for the subsampled corpora.
    """
    # Avoid a shared mutable default argument.
    if top_ks is None:
        top_ks = [1, 10, 50, 100, 250, 500]

    index_prefix = os.path.join("indexes", dataset)
    # make full-fidelity retriever
    print("Indexing and retrieving from full corpus")
    bm25_full_corpus = BM25s(corpus, lang="en", stemmer=Stemmer.Stemmer("english"), index_path=os.path.join(index_prefix, "full_corpus"), load_index=load_index)
    results_full_corpus = bm25_full_corpus.transform(queries)

    subsampled_corpora = {
        top_k : subsample_corpus(corpus, qrels, results_full_corpus, top_k=top_k)
        for top_k in top_ks
    }

    subsampled_corpus_size = {
        top_k : len(ss_corpus) for top_k, ss_corpus in subsampled_corpora.items()
    }
    print(subsampled_corpus_size)

    print("Generating and indexing corpus subsets")
    systems = {
        # The full-corpus run was already computed above, so pass the results
        # frame itself instead of making pt.Experiment search the full corpus
        # a second time.
        "bm25_full_corpus" : results_full_corpus,
        ** {
            f"bm25_top_{top_k}_corpus" : BM25s(ss_corpus, lang="en", stemmer=Stemmer.Stemmer("english"), index_path=os.path.join(index_prefix, f"top_{top_k}_corpus"), load_index=load_sub_indices)
            for top_k, ss_corpus in subsampled_corpora.items()
        }
    }

    metrics = [metric@k for k in (1, 10 ,100, 1000) for metric in [nDCG, R, AP]]

    print("Executing retrieval")
    results = pt.Experiment(
        list(systems.values()),
        queries,
        qrels,
        eval_metrics=metrics,
        names=list(systems.keys()),
    )

    return results, subsampled_corpus_size


## Don't run (scratch cells; `dataset_experiment` above covers the same steps)

In [None]:
# Build an in-memory BM25 index over the full corpus with an English stemmer.
# No index_path is given, so the index is not saved to disk.
bm25_s = BM25s(corpus, lang="en", stemmer=Stemmer.Stemmer("english"))

In [None]:
# Retrieve from the full-corpus index for every query; the resulting frame has
# qid, docno, score and rank columns.
full_fidelity_results = bm25_s.transform(queries)

In [None]:
from pyterrier.measures import nDCG, R

# Evaluate the full-corpus run. The precomputed results frame is passed where a
# retrieval system is expected, so nothing is retrieved again here.
pt.Experiment(
    [full_fidelity_results],
    queries,
    qrels,
    eval_metrics=[nDCG@10, R@1000],
    names=["bm25_full"],
)

In [None]:
# For each cutoff, keep the judged documents plus every query's top_k
# full-corpus hits; the dict is keyed by the cutoff.
subsampled_corpora = {
    top_k : subsample_corpus(corpus, qrels, full_fidelity_results, top_k=top_k)
    for top_k in [1, 10, 50, 100]
}

In [None]:
# NOTE(review): `top_ks` is not read anywhere in this cell; the systems and
# their names are derived from `subsampled_corpora` instead.
top_ks = [1, 10, 50, 100]

# Compare the precomputed full-corpus results against a freshly built BM25
# index for each subsampled corpus, reporting nDCG, R and AP at 1/10/100/1000.
results = pt.Experiment(
    [full_fidelity_results, *[BM25s(ss_corpus, lang="en", stemmer=Stemmer.Stemmer("english")) for ss_corpus in subsampled_corpora.values()]],
    queries,
    qrels,
    eval_metrics=[metric@k for k in (1, 10 ,100, 1000) for metric in [nDCG, R, AP]],
    names=["bm25_full_fidelity", *[f"bm25_subsample_topk={top_k}" for top_k in subsampled_corpora.keys()]],
)

## Run

In [27]:
# Run the whole experiment on MS MARCO, rebuilding every index from scratch.
# NOTE: dataset_experiment returns a (results, subsampled_corpus_size) tuple,
# so `results` holds that tuple here, not a DataFrame.
results = dataset_experiment("msmarco", corpus, queries, qrels, load_index=False, load_sub_indices=False)

Indexing and retrieving from full corpus


                                                                                  

Saving index to indexes/msmarco/full_corpus


                                                                   

1 13676
10 72067
50 316541
100 595331
250 1320888
500 2293128
{1: 13676, 10: 72067, 50: 316541, 100: 595331, 250: 1320888, 500: 2293128}
Generating and indexing corpus subsets


                                                                             

Saving index to indexes/msmarco/top_1_corpus


                                                                              

Saving index to indexes/msmarco/top_10_corpus


                                                                                

Saving index to indexes/msmarco/top_50_corpus


                                                                                

Saving index to indexes/msmarco/top_100_corpus


                                                                                  

Saving index to indexes/msmarco/top_250_corpus


                                                                                  

Saving index to indexes/msmarco/top_500_corpus
Executing retrieval


                                                                    

In [29]:
# Keep a reference to the value returned by dataset_experiment (a tuple) under
# a name that says so.
results_tuple = results

In [30]:
# Unpack the tuple: `results` becomes the metrics table, `corpus_sizes` the
# {top_k: number of documents} mapping.
results, corpus_sizes = results_tuple

In [31]:
# Display the metrics table: one row per run, one column per metric@k.
results

Unnamed: 0,name,nDCG@1,R@1,AP@1,nDCG@10,R@10,AP@10,nDCG@100,R@100,AP@100,nDCG@1000,R@1000,AP@1000
0,bm25_full_corpus,0.095415,0.092646,0.092646,0.218882,0.369269,0.170875,0.279009,0.656996,0.182402,0.303712,0.852089,0.183275
1,bm25_top_1_corpus,0.105014,0.101552,0.101552,0.573097,0.908656,0.457534,0.58722,0.972254,0.460695,0.589587,0.990831,0.460784
2,bm25_top_10_corpus,0.097135,0.093911,0.093911,0.230125,0.398436,0.176935,0.366475,0.934945,0.215404,0.372271,0.979585,0.215635
3,bm25_top_50_corpus,0.097708,0.094413,0.094413,0.223571,0.376409,0.174704,0.3137,0.83584,0.18947,0.330067,0.959241,0.19017
4,bm25_top_100_corpus,0.097278,0.094222,0.094222,0.223829,0.377483,0.174739,0.286159,0.68011,0.186374,0.322265,0.944257,0.188099
5,bm25_top_250_corpus,0.096991,0.093935,0.093935,0.223429,0.377412,0.17428,0.282986,0.662011,0.185726,0.315219,0.916846,0.186834
6,bm25_top_500_corpus,0.096275,0.09329,0.09329,0.222611,0.375621,0.173737,0.282635,0.662297,0.185293,0.310934,0.889625,0.186229


In [32]:
# Display how many documents each subsampled corpus contains, keyed by top_k.
corpus_sizes

{1: 13676, 10: 72067, 50: 316541, 100: 595331, 250: 1320888, 500: 2293128}

In [33]:
# Number of documents in the full corpus, for comparison with the subsampled sizes.
len(corpus)

8841823

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Plot, per metric family, how every subsampled-corpus run compares with the
# full-corpus run: ratio = value on the subsampled corpus / value on the full corpus.
# Assumes `results` is the metrics DataFrame and that its first row is the
# full-corpus run (bm25_full_corpus).

# Metrics to consider
metrics = ['nDCG', 'R', 'AP']
top_k_values = [1, 10, 100, 1000]

# One subplot per metric family, sharing the run names on the x-axis
fig, axes = plt.subplots(nrows=len(metrics), ncols=1, figsize=(10, 16), sharex=True)
# plt.subplots returns a bare Axes (not an array) when there is a single row
metric_axes = axes if len(metrics) > 1 else [axes]

run_names = results['name'].iloc[1:]

for ax, metric in zip(metric_axes, metrics):
    # Each cutoff's ratio series is drawn under its own label, so the legend
    # always matches the plotted data and every cutoff is shown.
    for k in top_k_values:
        column = results[f'{metric}@{k}']
        ratio = column.iloc[1:] / column.iloc[0]
        ax.plot(run_names, ratio, marker='o', label=f'{metric}@{k}')

    # Set y-axis label for each subplot
    ax.set_ylabel(f'{metric} Ratio')

    # Log-scaled y-axis zoomed in around 1; ratios outside [0.99, 1.1] fall outside the view
    ax.set_yscale('log')
    ax.set_ylim(0.99, 1.1)

    # Reference lines: dashed red at 1% above the full-corpus score, dashed black at parity
    ax.axhline(y=1.01, color='red', linestyle='--', label="Within 1%")
    ax.axhline(y=1, color='black', linestyle='--', label="True Score")
    ax.legend()

# Set x-axis label on the bottom subplot and rotate the run names for readability
metric_axes[-1].set_xlabel('Run Name')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Display the plot
plt.show()


In [None]:
# Display the metrics table again before ratio columns are added to it.
results

In [None]:
# Add a "<metric>_ratio" column for every metric column: each run's value
# divided by the first row's value (the full-corpus run). Columns that already
# contain "ratio" are skipped, so re-running this cell does not stack ratios.
for column in results.columns:
    if column == "name" or "ratio" in column: continue
    # Positional .iloc[0] picks the first row whatever the index labels are.
    results[column + "_ratio"] = results[column] / results[column].iloc[0]


In [None]:
# Show only the ratio columns, selected by name rather than by a hard-coded
# column count, so raw metrics are never shown by mistake.
results.filter(like="_ratio")