In [1]:
# Install all extra dependencies
!pip install bm25s[full]

# If you want to use stemming for better results, you can install a stemmer
!pip install PyStemmer

# To speed up the top-k selection process, you can install `jax`
!pip install jax[cpu]

!pip install datasets

Collecting bm25s[full]
  Downloading bm25s-0.1.10-py3-none-any.whl.metadata (15 kB)
Collecting ujson (from bm25s[full])
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting PyStemmer (from bm25s[full])
  Downloading PyStemmer-2.2.0.1.tar.gz (303 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.0/303.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting black (from bm25s[full])
  Downloading black-24.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting mypy-extensions>=0.4.3 (from black->bm25s[full])
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting pathspec>=0.9.0 (from black->bm25s[full])
  Downloading pathspec-0.12.1-py3-none-any

In [22]:
from datasets import load_dataset

# Load the question/answer configuration of the rag-mini-bioasq dataset;
# the returned object is kept in `ds`.
ds = load_dataset(
    path="rag-datasets/rag-mini-bioasq",
    name="question-answer-passages",
)

# Keep a handle on the "test" split for the cells that follow.
data = ds["test"]

In [23]:
# Number of rows in the question/answer "test" split
len(data)

4719

In [11]:
from datasets import load_dataset

# Load the question/answer configuration and display the resulting object
# to inspect its splits, features and row counts.
ds = load_dataset(
    path="rag-datasets/rag-mini-bioasq",
    name="question-answer-passages",
)
ds

DatasetDict({
    test: Dataset({
        features: ['question', 'answer', 'relevant_passage_ids', 'id'],
        num_rows: 4719
    })
})

### Load the text corpus (passages)

In [2]:
from datasets import load_dataset

# The "text-corpus" configuration holds the passages that get indexed later.
ds = load_dataset(path="rag-datasets/rag-mini-bioasq", name="text-corpus")

# Pull out the "passages" split and report how many rows it contains.
ds_text_corpus = ds["passages"]
len(ds_text_corpus)

Downloading readme:   0%|          | 0.00/813 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.5M [00:00<?, ?B/s]

Generating passages split:   0%|          | 0/40221 [00:00<?, ? examples/s]

40221

In [3]:
# Peek at the first record: a dict holding the passage text and its id
ds_text_corpus[0]

{'passage': 'New data on viruses isolated from patients with subacute thyroiditis de Quervain \nare reported. Characteristic morphological, cytological, some physico-chemical \nand biological features of the isolated viruses are described. A possible role \nof these viruses in human and animal health disorders is discussed. The isolated \nviruses remain unclassified so far.',
 'id': 9797}

In [4]:
# Show the dataset summary side by side with its Python type
(ds_text_corpus, type(ds_text_corpus))

(Dataset({
     features: ['passage', 'id'],
     num_rows: 40221
 }),
 datasets.arrow_dataset.Dataset)

In [5]:
# Reuse the passages split extracted earlier instead of indexing `ds` again:
# `ds` is rebound to the question/answer DatasetDict in other cells, and that
# object only has a "test" split, so `ds['passages']` would depend on which
# cell happened to run last.
passages_split = ds_text_corpus

# Convert the split to a pandas DataFrame (columns: passage, id) and preview it.
df = passages_split.to_pandas()
df.head()

Unnamed: 0,passage,id
0,New data on viruses isolated from patients wit...,9797
1,We describe an improved method for detecting d...,11906
2,We have studied the effects of curare on respo...,16083
3,Kinetic and electrophoretic properties of 230-...,23188
4,Male Wistar specific-pathogen-free rats aged 2...,23469


In [10]:
# One dict per row ({'passage': ..., 'id': ...}); this list is later attached
# to the retriever as its corpus.
corpus_json = df.to_dict("records")

In [12]:
# Inspect the first two records of the list built from the DataFrame
corpus_json[:2]

[{'passage': 'New data on viruses isolated from patients with subacute thyroiditis de Quervain \nare reported. Characteristic morphological, cytological, some physico-chemical \nand biological features of the isolated viruses are described. A possible role \nof these viruses in human and animal health disorders is discussed. The isolated \nviruses remain unclassified so far.',
  'id': 9797},
 {'passage': "We describe an improved method for detecting deficiency of the acid hydrolase, \nalpha-1,4-glucosidase in leukocytes, the enzyme defect in glycogen storage \ndisease Type II (Pompe disease). The procedure requires smaller volumes of blood \nand less time than previous methods. The assay involves the separation of \nleukocytes by Peter's method for beta-glucosidase and a modification of Salafsky \nand Nadler's fluorometric method for alpha-glucosidase.",
  'id': 11906}]

In [15]:
# Plain list of passage strings; this is the text that gets tokenized for BM25.
corpus_text = df["passage"].tolist()

In [16]:
# Inspect the first two passage strings
corpus_text[:2]

['New data on viruses isolated from patients with subacute thyroiditis de Quervain \nare reported. Characteristic morphological, cytological, some physico-chemical \nand biological features of the isolated viruses are described. A possible role \nof these viruses in human and animal health disorders is discussed. The isolated \nviruses remain unclassified so far.',
 "We describe an improved method for detecting deficiency of the acid hydrolase, \nalpha-1,4-glucosidase in leukocytes, the enzyme defect in glycogen storage \ndisease Type II (Pompe disease). The procedure requires smaller volumes of blood \nand less time than previous methods. The assay involves the separation of \nleukocytes by Peter's method for beta-glucosidase and a modification of Salafsky \nand Nadler's fluorometric method for alpha-glucosidase."]

## BM25s Implementation

In [9]:
import bm25s
import Stemmer

In [24]:
%%time
# optional: create a stemmer
stemmer = Stemmer.Stemmer("english")

corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en",stemmer=stemmer)

# Create the BM25 retriever and attach your corpus_json to it
retriever = bm25s.BM25(corpus=corpus_json)
# Now, index the corpus_tokens (the corpus_json is not used yet)
retriever.index(corpus_tokens)

# Query the corpus
query = "What were the two types of aneurysms classified based on their location in relation to the pericallosal artery?"
query_tokens = bm25s.tokenize(query,stemmer=stemmer)


results, scores = retriever.retrieve(query_tokens, k=2)

for i in range(results.shape[1]):
    doc, score = results[0, i], scores[0, i]
    print(f"Rank {i+1} (score: {score:.2f}): {doc}")

Split strings:   0%|          | 0/40221 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/40221 [00:00<?, ?it/s]

DEBUG:bm25s:Building index from IDs objects


BM25S Count Tokens:   0%|          | 0/40221 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/40221 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Rank 1 (score: 19.80): {'passage': 'Forty-five patients with aneurysms of the anterior cerebral artery distal to the \nanterior communicating artery were operated on by a direct approach method in \nthe years 1960-1973. The incidence of aneurysms in this location was 4.8% of the \ntotal 1,000 aneurysms. It is of upmost importance in the treatment of aneurysms \nto insure the parent artery for the purpose of temporary occlusion. This makes \nit easier and safer to approach the aneurysmal neck and to handle possible \npremature aneurysmal rupture. From this technical standpoint, the aneurysms in \nthis location were classified into two types, ascending and horizontal. \nAneurysms of the pericallosal artery between the origin of the anterior \ncommunicating artery and the knee of the corpus callosum were designated as the \naneurysms of the ascending portion, whereas the aneurysms of the pericallosal \nartery from the knee of the corpus callosum and beyond were designated as the \naneurys

### Save the index

In [20]:
%%time
# Persist the retriever into the `index_bm25s` directory; the reload cells
# further down read it back from that same path.
# Note that this will fail if your corpus passed to `BM25(corpus...)` is not serializable
retriever.save("index_bm25s")

Finding newlines for mmindex:   0%|          | 0.00/43.4M [00:00<?, ?B/s]

CPU times: user 463 ms, sys: 119 ms, total: 582 ms
Wall time: 591 ms


### Reload the index without memory mapping (`mmap=False`)

In [38]:
%%time
# set load_corpus=False if you don't need the corpus

# mmap=True --> Load the BM25 index as a memory-mapped file, which is memory efficient
# and reduce overhead of loading the full index into memory
query = "What were the two types of aneurysms classified based on their location in relation to the pericallosal artery?"

reloaded_retriever = bm25s.BM25.load("index_bm25s", load_corpus=True)
query_tokens = bm25s.tokenize(query,stemmer=stemmer)
results, scores = reloaded_retriever.retrieve(query_tokens, k=4)

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 302 ms, sys: 23.1 ms, total: 325 ms
Wall time: 329 ms


### Reload the index as a memory-mapped file (`mmap=True`)

In [39]:
%%time
# set load_corpus=False if you don't need the corpus

# mmap=True --> Load the BM25 index as a memory-mapped file, which is memory efficient
# and reduce overhead of loading the full index into memory
query = "What were the two types of aneurysms classified based on their location in relation to the pericallosal artery?"

reloaded_retriever = bm25s.BM25.load("index_bm25s", load_corpus=True,mmap=True)
query_tokens = bm25s.tokenize(query,stemmer=stemmer)
results, scores = reloaded_retriever.retrieve(query_tokens, k=4)

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 74.1 ms, sys: 2.92 ms, total: 77 ms
Wall time: 84.7 ms


In [40]:
# Row 0 holds the hits for the single query; print each passage with its id
for i, (doc, score) in enumerate(zip(results[0], scores[0])):
    print(f"Rank {i+1} (score: {score:.2f}): {doc['passage']},id--->{doc['id']}")
    print("------------")

Rank 1 (score: 19.80): Forty-five patients with aneurysms of the anterior cerebral artery distal to the 
anterior communicating artery were operated on by a direct approach method in 
the years 1960-1973. The incidence of aneurysms in this location was 4.8% of the 
total 1,000 aneurysms. It is of upmost importance in the treatment of aneurysms 
to insure the parent artery for the purpose of temporary occlusion. This makes 
it easier and safer to approach the aneurysmal neck and to handle possible 
premature aneurysmal rupture. From this technical standpoint, the aneurysms in 
this location were classified into two types, ascending and horizontal. 
Aneurysms of the pericallosal artery between the origin of the anterior 
communicating artery and the knee of the corpus callosum were designated as the 
aneurysms of the ascending portion, whereas the aneurysms of the pericallosal 
artery from the knee of the corpus callosum and beyond were designated as the 
aneurysms of the horizontal port