In [1]:
#! pip install -Uq sentence-transformers datasets transformers

In [None]:
# Initialise Weights & Biases in "disabled" mode before any model code runs.
# NOTE(review): presumably this stops downstream libraries from prompting for a
# wandb login or logging runs — confirm it is still needed, since no training
# happens in the visible cells.
import wandb
wandb.init(mode="disabled")

In [3]:
# Switch off the `datasets` library's progress bars.
# NOTE(review): the saved output of the dataset-loading cell still shows
# download bars, so those presumably come from a different library — confirm
# whether they should be silenced as well.
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

#### **Create and Prepare embedding dataset**

In [4]:
from datasets import load_dataset

# Hub identifier of the passage-retrieval dataset used throughout the notebook.
DATASET_ID = "rasyosef/Amharic-Passage-Retrieval-Dataset-V2"

# Load the dataset; the bare name on the last line displays the loaded object
# as the cell output.
dataset = load_dataset(DATASET_ID)
dataset

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/116M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link', 'source_dataset'],
        num_rows: 61469
    })
    test: Dataset({
        features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link', 'source_dataset'],
        num_rows: 6832
    })
})

In [5]:
# Rename the text columns to the "anchor" / "positive" names that the later
# cells read.  Only columns that still carry their original name in every split
# are renamed, so re-running this cell after it has already succeeded is a
# no-op instead of raising on the already-renamed columns.
column_renames = {"query": "anchor", "passage": "positive"}
pending_renames = {
    old_name: new_name
    for old_name, new_name in column_renames.items()
    if all(old_name in split.column_names for split in dataset.values())
}
if pending_renames:
    dataset = dataset.rename_columns(pending_renames)

#### **Create baseline and evaluate pretrained model**

In [6]:
from datasets import concatenate_datasets

# Pull the two splits out of the DatasetDict by name.
train_dataset, test_dataset = (dataset[split] for split in ("train", "test"))

# The corpus is the train split followed by the test split.
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

# Display the combined dataset as the cell output.
corpus_dataset

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset'],
    num_rows: 68301
})

In [7]:
# Build plain-dict lookups from the dataset columns.

# corpus: passage id -> passage text, taken from corpus_dataset
corpus = {
    passage_id: passage_text
    for passage_id, passage_text in zip(
        corpus_dataset["passage_id"], corpus_dataset["positive"]
    )
}

# queries: query id -> query text, taken from the test split only
queries = {
    query_id: query_text
    for query_id, query_text in zip(test_dataset["query_id"], test_dataset["anchor"])
}

In [8]:
# Map each test query id to the list of passage ids that are relevant to it.
# Every test row contributes one (query_id, passage_id) pair.  Pairs are
# accumulated per query, so a query id that appears on more than one row keeps
# all of its passages rather than only the last one; an exact repeat of a pair
# is stored once.  Only the two id columns are read, matching how the
# corpus/queries dictionaries are built from columns.
relevant_docs = {}
for query_id, passage_id in zip(test_dataset["query_id"], test_dataset["passage_id"]):
  passages_for_query = relevant_docs.setdefault(query_id, [])
  if passage_id not in passages_for_query:
    passages_for_query.append(passage_id)

#### **Evaluate Amharic Embedding model**


In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Checkpoint to evaluate.  Alternatives that can be swapped in by replacing
# the string assigned to model_id:
#   "rasyosef/RoBERTa-Amharic-Embed-Medium"
#   "rasyosef/RoBERTa-Amharic-Embed-Base"
#   "rasyosef/roberta-amharic-text-embedding-base"
#   "rasyosef/roberta-amharic-text-embedding-medium"
# Other models:
#   "intfloat/multilingual-e5-large-instruct"
#   "Alibaba-NLP/gte-multilingual-base"
model_id = "Snowflake/snowflake-arctic-embed-l-v2.0"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to run on load — confirm every model_id used here is trusted.
model = SentenceTransformer(model_id, device=device, trust_remote_code=True)

# Print the sequence-length limit the model was loaded with, cap it at 1024 if
# it is larger, then display the value that is in effect.
print(model.max_seq_length)
seq_length_cap = 1024
if seq_length_cap < model.max_seq_length:
  model.max_seq_length = seq_length_cap
model.max_seq_length

2026-02-01 23:58:40.751764: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769990320.966667      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769990321.021397      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769990321.498503      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769990321.498541      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769990321.498543      24 computation_placer.cc:177] computation placer alr

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/203 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

8192


1024

In [10]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim

# Embedding dimension reported by the loaded model; it is the only dimension
# evaluated in this cell.
EMBED_DIM = model.get_sentence_embedding_dimension()
matryoshka_dimensions = [EMBED_DIM]

print("Embedding Dimension:", EMBED_DIM)

# Arguments shared by every per-dimension retrieval evaluator.
ir_evaluator_settings = dict(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    batch_size=64,
    score_functions={"cosine": cos_sim},
    mrr_at_k=[10],
    ndcg_at_k=[10],
    precision_recall_at_k=[5, 10],
    corpus_chunk_size=8192,
    show_progress_bar=True,
)

# One evaluator per dimension, named "dim_<n>" and given that dimension as its
# truncate_dim.
matryoshka_evaluators = []
for dim in matryoshka_dimensions:
  ir_evaluator = InformationRetrievalEvaluator(
      name=f"dim_{dim}",
      truncate_dim=dim,
      **ir_evaluator_settings,
  )
  matryoshka_evaluators.append(ir_evaluator)

# Wrap the evaluators so they can all be run with a single call.
evaluator = SequentialEvaluator(matryoshka_evaluators)

Embedding Dimension: 1024


In [11]:
# Evaluate the model: run the sequential evaluator on the loaded model and keep
# the returned metrics in `results` for the reporting cell.
# NOTE(review): the saved progress output for this cell shows about 1h15m to
# get through the 9 corpus chunks — expect a long run when re-executing.
results = evaluator(model)

Batches:   0%|          | 0/107 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Corpus Chunks:  11%|█         | 1/9 [09:06<1:12:54, 546.86s/it]

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Corpus Chunks:  22%|██▏       | 2/9 [18:15<1:03:53, 547.67s/it]

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Corpus Chunks:  33%|███▎      | 3/9 [27:31<55:09, 551.63s/it]  

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Corpus Chunks:  44%|████▍     | 4/9 [36:42<45:57, 551.43s/it]

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Corpus Chunks:  56%|█████▌    | 5/9 [45:54<36:45, 551.46s/it]

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Corpus Chunks:  67%|██████▋   | 6/9 [55:09<27:38, 552.95s/it]

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Corpus Chunks:  78%|███████▊  | 7/9 [1:04:28<18:29, 554.95s/it]

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Corpus Chunks:  89%|████████▉ | 8/9 [1:14:02<09:20, 560.84s/it]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 9/9 [1:14:47<00:00, 498.56s/it]


In [12]:
# Label carried over from the original cell:
#   rasyosef/snowflake-arctic-embed-l-v2.0-finetuned-amharic
# NOTE(review): the model-loading cell sets model_id to
# "Snowflake/snowflake-arctic-embed-l-v2.0" — confirm which checkpoint the
# numbers printed below belong to.

dim = EMBED_DIM
metrics = [
    f"dim_{dim}_cosine_{suffix}"
    for suffix in ("recall@5", "recall@10", "mrr@10", "ndcg@10")
]

# Print each metric under its short name (the text after the final
# underscore of the key), rounded to three decimal places.
for key in metrics:
  metric_name = key.rsplit("_", 1)[1]
  print(f"{metric_name}: {round(results[key], 3)}")

recall@5: 0.795
recall@10: 0.848
mrr@10: 0.653
ndcg@10: 0.701
