In [3]:
#! pip install -Uq sentence-transformers datasets transformers

In [4]:
! pip install -q pylate==1.2.0 beir ranx

In [5]:
# Initialise Weights & Biases in "disabled" mode.
# NOTE(review): presumably this keeps later library calls from prompting for a
# W&B login or uploading runs during evaluation — confirm.
import wandb
wandb.init(mode="disabled")

In [6]:
# Switch off the `datasets` library's progress bars to keep cell output compact.
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

#### **Create and Prepare embedding dataset**

In [7]:
from datasets import load_dataset

# Hub identifier of the Amharic passage-retrieval dataset used for evaluation.
DATASET_ID = "rasyosef/Amharic-Passage-Retrieval-Dataset-V2"

# Load every split of the dataset and display the resulting DatasetDict.
dataset = load_dataset(DATASET_ID)
dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link', 'source_dataset'],
        num_rows: 61469
    })
    test: Dataset({
        features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link', 'source_dataset'],
        num_rows: 6832
    })
})

In [8]:
# Rename the text columns in every split: query -> anchor, passage -> positive.
column_renames = {"query": "anchor", "passage": "positive"}
dataset = dataset.rename_columns(column_renames)

#### **Create baseline and evaluate pretrained model**

In [9]:
from datasets import concatenate_datasets

# Keep a handle on each split; the retrieval corpus is built from both of them
# (train rows first, then test rows).
train_dataset, test_dataset = dataset["train"], dataset["test"]
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

corpus_dataset

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset'],
    num_rows: 68301
})

In [10]:
# Turn the dataset columns into plain lookup dictionaries.

# Corpus: passage id (cid) => passage text, taken from train + test rows.
corpus = {
    passage_id: passage_text
    for passage_id, passage_text in zip(
        corpus_dataset["passage_id"], corpus_dataset["positive"]
    )
}

# Queries: query id (qid) => question text, taken from the test split only.
queries = {
    query_id: question
    for query_id, question in zip(
        test_dataset["query_id"], test_dataset["anchor"]
    )
}

In [11]:
# Map each query id to the set of passage ids that are relevant to it
# (one passage per query in this dataset).
# setdefault(...).add(...) accumulates rather than overwrites, so a query_id that
# appears in more than one row keeps every relevant passage instead of only the
# last one. A set also makes a repeated (query, passage) pair count once.
relevant_docs = {}
for row in test_dataset:
  relevant_docs.setdefault(row["query_id"], set()).add(row["passage_id"])

#### **Evaluate Amharic ColBERT model**


In [12]:
from pylate.evaluation import PyLateInformationRetrievalEvaluator
from pylate import models

# Hub identifier of the ColBERT checkpoint being evaluated.
MODEL_ID = "rasyosef/colbert-roberta-amharic-base"

# Step 1: Initialize the ColBERT model on the GPU
# (device accepts "cpu", "cuda" or "mps").
model = models.ColBERT(model_name_or_path=MODEL_ID, device="cuda")

# Step 2: Configure the retrieval evaluator over the test queries and the
# combined train + test corpus built in the earlier cells.
metric_cutoffs = {
    "mrr_at_k": [10],
    "ndcg_at_k": [10],
    "precision_recall_at_k": [5, 10],
}

evaluator = PyLateInformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    batch_size=16,
    corpus_chunk_size=64,
    show_progress_bar=True,
    **metric_cutoffs,
)

2026-02-03 21:14:16.889514: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770153256.912337     335 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770153256.919603     335 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770153256.936935     335 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770153256.936954     335 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770153256.936957     335 computation_placer.cc:177] computation placer alr

modules.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

1_Dense/model.safetensors:   0%|          | 0.00/393k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

In [13]:
# Evaluate the model: call the evaluator on the ColBERT model and keep what it
# returns in `results`, which the next cell indexes by metric-name key
# (e.g. "MaxSim_recall@5").
results = evaluator(model)

Encoding queries (bs=16):   0%|          | 0/427 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1034/1034 [14:52<00:00,  1.16it/s]


In [14]:
# Headline metrics for the model evaluated above:
# rasyosef/colbert-roberta-amharic-base

# Keys to read from `results` (plain strings; no f-string formatting is needed).
metrics = [
    "MaxSim_recall@5",
    "MaxSim_recall@10",
    "MaxSim_mrr@10",
    "MaxSim_ndcg@10"
  ]

# Print each score rounded to 3 decimals, labelled by the part of the key after
# the last underscore (e.g. "MaxSim_recall@5" -> "recall@5").
for key in metrics:
  metric_name = key.split("_")[-1]
  print(f"{metric_name}: {round(results[key], 3)}")

recall@5: 0.86
recall@10: 0.899
mrr@10: 0.736
ndcg@10: 0.776


##### rasyosef/ColBERT-Amharic-Base

- recall@5: 0.902
- recall@10: 0.93
- mrr@10: 0.803
- ndcg@10: 0.835


##### rasyosef/ColBERT-Amharic-Medium
- recall@5: 0.882
- recall@10: 0.913
- mrr@10: 0.778
- ndcg@10: 0.811

##### rasyosef/colbert-roberta-amharic-base
- recall@5: 0.86
- recall@10: 0.899
- mrr@10: 0.736
- ndcg@10: 0.776