In [1]:
! pip install -q sentence-transformers datasets transformers

In [None]:
# Start Weights & Biases with mode="disabled"; presumably this keeps the
# notebook from prompting for, or sending, any W&B logs — TODO confirm.
import wandb
wandb.init(mode="disabled")

In [3]:
# Switch off the progress bars of the `datasets` library for this session.
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

#### **Create and Prepare embedding dataset**

In [4]:
from datasets import load_dataset

# Load the retrieval dataset by its repository id and display its splits.
dataset_id = "rasyosef/Amharic-Passage-Retrieval-Dataset-V2"
dataset = load_dataset(dataset_id)
dataset

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


DatasetDict({
    train: Dataset({
        features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link', 'source_dataset'],
        num_rows: 61469
    })
    test: Dataset({
        features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link', 'source_dataset'],
        num_rows: 6832
    })
})

In [5]:
# Rename the text columns: "query" -> "anchor" and "passage" -> "positive".
# Only columns that still carry their old name are renamed, so running this
# cell a second time leaves `dataset` untouched instead of failing on a
# column that no longer exists.
column_renames = {"query": "anchor", "passage": "positive"}
present_columns = dataset["train"].column_names
pending_renames = {
    old: new for old, new in column_renames.items() if old in present_columns
}
if pending_renames:
  dataset = dataset.rename_columns(pending_renames)

#### **Create baseline and evaluate pretrained model**

In [6]:
from datasets import concatenate_datasets

# Pull out both splits, then stack them (train first, test second) into the
# single dataset that the retrieval corpus is built from.
train_dataset, test_dataset = (dataset[split] for split in ("train", "test"))
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

corpus_dataset

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset'],
    num_rows: 68301
})

In [7]:
# Build the lookup tables used for retrieval evaluation.

# Corpus: passage id -> passage text, covering train and test passages.
corpus = {
    passage_id: passage
    for passage_id, passage in zip(
        corpus_dataset["passage_id"], corpus_dataset["positive"]
    )
}

# Queries: query id -> query text, taken from the test split only.
queries = {
    query_id: query
    for query_id, query in zip(test_dataset["query_id"], test_dataset["anchor"])
}

In [8]:
# Map every test query id to the list of passage ids that are relevant to it.
# Passage ids are accumulated (without duplicates) rather than overwritten, so
# a query_id that appears in several test rows keeps all of its passages; with
# one row per query the result is the same single-element list as before.
# Only the two id columns are read, instead of materialising every full row.
relevant_docs = {}
for query_id, passage_id in zip(test_dataset["query_id"], test_dataset["passage_id"]):
  passages_for_query = relevant_docs.setdefault(query_id, [])
  if passage_id not in passages_for_query:
    passages_for_query.append(passage_id)

#### **Evaluate Amharic Embedding model**

List of models to evaluate:
- intfloat/multilingual-e5-large-instruct
- Alibaba-NLP/gte-modernbert-base
- Alibaba-NLP/gte-multilingual-base


In [9]:
import torch
from sentence_transformers import SparseEncoder

# Checkpoint under evaluation; the alternative is kept commented out below.
model_id = "rasyosef/SPLADE-RoBERTa-Amharic-Medium"
# model_id = "rasyosef/SPLADE-RoBERTa-Amharic-Base"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SparseEncoder(model_id, device=device)

# Show the sequence length the model was loaded with, cap it at 1024 if it is
# larger, and display the value that is actually in effect.
print(model.max_seq_length)
if model.max_seq_length > 1024:
  model.max_seq_length = 1024
model.max_seq_length

Loading weights:   0%|          | 0/138 [00:00<?, ?it/s]

510


510

In [10]:
from sentence_transformers.sparse_encoder.evaluation import SparseInformationRetrievalEvaluator

# Retrieval evaluator over the test queries and the combined passage corpus.
evaluator = SparseInformationRetrievalEvaluator(
    # Data: query texts, candidate passages and the relevant ids per query.
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    # Cut-offs at which each metric is reported.
    mrr_at_k=[10],
    ndcg_at_k=[10],
    precision_recall_at_k=[5, 10],
    # Encoding / runtime settings.
    batch_size=32,
    corpus_chunk_size=8192,
    show_progress_bar=False,
)


In [11]:
# Evaluate the model
# Calling the evaluator on the model returns the metrics; they are kept in
# `results` and looked up by key (e.g. "dot_recall@5") when reported below.
results = evaluator(model)

In [13]:
# Keys of the scores to report from `results`. Plain string literals: the
# original f-prefixes had no placeholders and did nothing.
metrics = [
    "dot_recall@5",
    "dot_recall@10",
    "dot_mrr@10",
    "dot_ndcg@10",
]

# Print each score rounded to three decimals, labelled with the part of the
# key after the last underscore (e.g. "dot_recall@5" -> "recall@5").
for key in metrics:
  metric_name = key.split("_")[-1]
  score = round(results[key], 3)
  print(f"{metric_name}: {score}")

recall@5: 0.857
recall@10: 0.895
mrr@10: 0.728
ndcg@10: 0.769
