In [None]:
# Install / upgrade (-U) the training stack quietly (-q) before anything is imported.
# NOTE(review): versions are unpinned, so a re-run may resolve to different releases
# than the ones that produced the outputs below; consider pinning and using `%pip`.
! pip install -Uq torch torchvision tensorboard sentence-transformers datasets transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.2/821.2 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m135.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m106.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import wandb
# Start Weights & Biases in "disabled" mode.
# NOTE(review): presumably this keeps the trainer's W&B integration from prompting
# for a login / uploading runs — confirm this is still needed.
wandb.init(mode="disabled")

#### **Create and Prepare embedding dataset**

In [None]:
from datasets import load_dataset

# Hub repository holding the retrieval pairs used throughout this notebook.
dataset_repo_id = "yosefw/amharic-news-retrieval-dataset-v2-with-negatives-V2"

dataset = load_dataset(dataset_repo_id)
dataset

In [None]:
# Rename the text columns to the names used by the rest of the notebook:
# "query" -> "anchor" and "passage" -> "positive".
dataset = (
    dataset
    .rename_column("query", "anchor")
    .rename_column("passage", "positive")
)
dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset', 'negative_passages'],
        num_rows: 61469
    })
    test: Dataset({
        features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset', 'negative_passages'],
        num_rows: 6832
    })
})

In [None]:
# Unique passage ids that appear in the test split. The only other reference to
# this set is the commented-out negative-passage filter where the training rows are built.
test_passage_ids = set(dataset["test"]["passage_id"])
len(test_passage_ids)

6764

In [None]:
from datasets import Dataset
from tqdm import tqdm
import random

# Every training example is expanded into two rows. Each row pairs one negative
# taken from the front of the negatives list with one taken from its tail
# (positions refer to the 4-element selection built inside the loop).
negative_index_pairs = [(0, 2), (1, 3)]

ds_rows = []
for row in tqdm(dataset["train"]):
  neg_passages = row["negative_passages"]
  # neg_passages = list(filter(lambda x: x["passage_id"] not in test_passage_ids, neg_passages))
  # Keep the first two and the last two negatives, in that order.
  picked_negatives = neg_passages[:2] + neg_passages[-2:]

  for first_idx, second_idx in negative_index_pairs:
    ds_rows.append({
        "query_id": row["query_id"],
        "passage_id": row["passage_id"],
        "anchor": row["anchor"],
        "positive": row["positive"],
        "negative_1": picked_negatives[first_idx]["passage"],
        "negative_2": picked_negatives[second_idx]["passage"],
    })

# Materialise the rows and shuffle them with a fixed seed for reproducibility.
unshuffled_rows = Dataset.from_list(ds_rows)
relevance_dataset = unshuffled_rows.shuffle(seed=42)
relevance_dataset

100%|██████████| 61469/61469 [00:25<00:00, 2364.47it/s]


Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'negative_1', 'negative_2'],
    num_rows: 122938
})

#### **Create baseline and evaluate pretrained model**

In [None]:
from datasets import concatenate_datasets

# The retrieval corpus is built from the passages of both splits, so test
# queries are searched against train + test passages together.
train_dataset, test_dataset = dataset["train"], dataset["test"]
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

corpus_dataset

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset', 'negative_passages'],
    num_rows: 68301
})

In [None]:
# Turn the datasets into the plain id -> text dictionaries used by the evaluator.
# Later rows overwrite earlier ones when an id repeats.

# Corpus: passage id => passage text (train + test passages)
corpus = {
    passage_id: passage_text
    for passage_id, passage_text in zip(corpus_dataset["passage_id"], corpus_dataset["positive"])
}

# Queries: query id => query text (test split only)
queries = {
    query_id: query_text
    for query_id, query_text in zip(test_dataset["query_id"], test_dataset["anchor"])
}

In [None]:
# Map every test query id to the set of passage ids that are relevant to it.
# Entries are accumulated rather than assigned, so a query id that appears on
# more than one row keeps all of its relevant passages instead of only the last.
# A set is used because the evaluator documents this argument as a mapping from
# query id to a set of relevant corpus ids.
relevant_docs = {}
for row in test_dataset:
  relevant_docs.setdefault(row["query_id"], set()).add(row["passage_id"])

#### **Initialize Embedding model**

In [None]:
import torch
from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData
from sentence_transformers.models import Transformer, Pooling, Normalize

base_model = "rasyosef/roberta-medium-amharic"

# Build the encoder first so the pooling layer can read its hidden size from it
# instead of relying on a hard-coded width that silently goes stale if
# `base_model` is swapped for a checkpoint with a different hidden size.
transformer = Transformer(
    model_name_or_path=base_model,
    tokenizer_name_or_path=base_model,
)

# Mean pooling over token embeddings; every other pooling mode is switched off.
pooling = Pooling(
    word_embedding_dimension=transformer.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_sqrt_len_tokens=False,
    pooling_mode_weightedmean_tokens=False,
    pooling_mode_lasttoken=False,
    include_prompt=True,
)

# Encoder -> mean pooling -> normalisation layer.
model = SentenceTransformer(
    modules=[transformer, pooling, Normalize()],
    # NOTE(review): `model_kwargs` may only be consulted when the model is loaded
    # from a name/path rather than from explicit `modules` — confirm that the
    # "sdpa" setting actually reaches the encoder; if not, pass it through
    # `Transformer(..., model_args={"attn_implementation": "sdpa"})` instead.
    model_kwargs={"attn_implementation": "sdpa"},
    model_card_data=SentenceTransformerModelCardData(
        language="am",
        license="mit",
        model_name="RoBERTa Amharic Embed Medium"
    )
)

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim

# Full embedding width plus one truncated (Matryoshka) width.
EMBED_DIM = model.get_sentence_embedding_dimension()
matryoshka_dimensions = [EMBED_DIM, 256]

# Settings shared by every per-dimension retrieval evaluator.
shared_ir_settings = dict(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    score_functions={"cosine": cos_sim},
    batch_size=128,
    corpus_chunk_size=2048,
    show_progress_bar=False,
)

# One evaluator per dimension; embeddings are truncated to `dim` before scoring
# and the metrics are reported under the "dim_<dim>" name.
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        name=f"dim_{dim}",
        truncate_dim=dim,
        **shared_ir_settings,
    )
    for dim in matryoshka_dimensions
]

# Run the per-dimension evaluators one after another.
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [None]:
# Score the not-yet-fine-tuned model to obtain the baseline
results = evaluator(model)

# Report nDCG@10 for each embedding dimension
ndcg_keys = [f"dim_{dim}_cosine_ndcg@10" for dim in matryoshka_dimensions]
for key in ndcg_keys:
  print(f"{key}: {results[key]}")

dim_512_cosine_ndcg@10: 0.06159687872294573
dim_256_cosine_ndcg@10: 0.04575362058670196


In [None]:
# Report recall@5 for each embedding dimension (baseline model)
recall_keys = [f"dim_{dim}_cosine_recall@5" for dim in matryoshka_dimensions]
for key in recall_keys:
  print(f"{key}: {results[key]}")

dim_512_cosine_recall@5: 0.07308142940831869
dim_256_cosine_recall@5: 0.05462800234329233


#### **Define loss function with Matryoshka Representation**

In [None]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Same dimensions as the evaluators: the full width and the 256-d prefix.
matryoshka_dimensions = [EMBED_DIM, 256]

# The ranking loss is the inner objective; the Matryoshka wrapper applies it at
# each of the listed embedding dimensions.
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model,
    inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

#### **Fine-tune embedding model with** `SentenceTransformerTrainer`

In [None]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Training configuration for the fine-tuning run.
args = SentenceTransformerTrainingArguments(
    output_dir="roberta-medium-amharic-embedding-matryoshka",  # checkpoints + final model
    num_train_epochs=6,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=2,  # 128 * 2 = 256 examples per optimizer step per device
    per_device_eval_batch_size=128,
    warmup_ratio=0.025,
    learning_rate=6e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    fp16=True,
    # Avoid repeated texts inside a batch; the ranking loss treats the other
    # in-batch passages as negatives, so duplicates would act as false negatives.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # The string "none" disables every logging integration. Passing the Python
    # value None does not: transformers v4 treats it as "unset" and falls back
    # to reporting to all installed integrations.
    report_to="none",
    save_total_limit=3,
    # Reload the checkpoint with the best 256-d nDCG@10 when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_dim_256_cosine_ndcg@10",
)

In [None]:
from sentence_transformers import SentenceTransformerTrainer

# Only the text columns are handed to the trainer; the id columns are dropped.
training_columns = ["anchor", "positive", "negative_1", "negative_2"]
training_triplets = relevance_dataset.select_columns(training_columns)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=training_triplets,
    loss=train_loss,
    evaluator=evaluator,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
# Run the fine-tuning loop.
# NOTE(review): an earlier note here read "NDCG@10 = 0.741092"; the logged run
# below reports ~0.77-0.78 after 6 epochs — confirm which run that figure refers to.
trainer.train()



Epoch,Training Loss,Validation Loss,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Sequential Score
1,2.2412,No log,0.632103,0.773433,0.820738,0.870972,0.632103,0.257811,0.164148,0.087097,0.632103,0.773433,0.820738,0.870972,0.751424,0.71316,0.717148,0.62522,0.765964,0.811951,0.865261,0.62522,0.255321,0.16239,0.086526,0.62522,0.765964,0.811951,0.865261,0.744511,0.705943,0.710109,0.744511
2,0.5094,No log,0.650557,0.792179,0.83304,0.881957,0.650557,0.26406,0.166608,0.088196,0.650557,0.792179,0.83304,0.881957,0.767013,0.730137,0.733881,0.640305,0.785296,0.830258,0.876391,0.640305,0.261765,0.166052,0.087639,0.640305,0.785296,0.830258,0.876391,0.759521,0.721949,0.725868,0.759521
3,0.2201,No log,0.657586,0.801113,0.842999,0.885764,0.657586,0.267038,0.1686,0.088576,0.657586,0.801113,0.842999,0.885764,0.773283,0.737071,0.740643,0.652168,0.793204,0.837288,0.883275,0.652168,0.264401,0.167458,0.088327,0.652168,0.793204,0.837288,0.883275,0.76844,0.731586,0.73507,0.76844
4,0.1298,No log,0.663445,0.803456,0.841828,0.886057,0.663445,0.267819,0.168366,0.088606,0.663445,0.803456,0.841828,0.886057,0.77615,0.740785,0.744492,0.657147,0.799063,0.839192,0.883128,0.657147,0.266354,0.167838,0.088313,0.657147,0.799063,0.839192,0.883128,0.77203,0.736229,0.739889,0.77203
5,0.0954,No log,0.666813,0.805653,0.843585,0.888108,0.666813,0.268551,0.168717,0.088811,0.666813,0.805653,0.843585,0.888108,0.778885,0.743765,0.747289,0.657001,0.801113,0.843439,0.883275,0.657001,0.267038,0.168688,0.088327,0.657001,0.801113,0.843439,0.883275,0.7727,0.736967,0.74062,0.7727
6,0.0843,No log,0.668131,0.806093,0.842999,0.888254,0.668131,0.268698,0.1686,0.088825,0.668131,0.806093,0.842999,0.888254,0.77941,0.744432,0.74794,0.660516,0.799649,0.840949,0.883128,0.660516,0.26655,0.16819,0.088313,0.660516,0.799649,0.840949,0.883128,0.773556,0.738245,0.741923,0.773556


TrainOutput(global_step=2886, training_loss=0.5467256319993746, metrics={'train_runtime': 4882.2759, 'train_samples_per_second': 151.083, 'train_steps_per_second': 0.591, 'total_flos': 0.0, 'train_loss': 0.5467256319993746, 'epoch': 6.0})

In [None]:
# Save the trainer's current model. Because `load_best_model_at_end=True` is set
# in the training arguments, this is expected to be the best checkpoint.
# NOTE(review): presumably written to `args.output_dir`, which is where the
# evaluation cell loads the fine-tuned model from — confirm.
trainer.save_model()

#### **Evaluate fine-tuned model against baseline**

In [None]:
from sentence_transformers import SentenceTransformer

# Reload the saved model from disk, on GPU when one is available.
eval_device = "cuda" if torch.cuda.is_available() else "cpu"
fine_tuned_model = SentenceTransformer(args.output_dir, device=eval_device)

# Score the fine-tuned model with the same evaluators used for the baseline
results = evaluator(fine_tuned_model)

# Report nDCG@10 for each embedding dimension
ndcg_keys = [f"dim_{dim}_cosine_ndcg@10" for dim in matryoshka_dimensions]
for key in ndcg_keys:
  print(f"{key}: {results[key]}")

dim_512_cosine_ndcg@10: 0.7793775096961253
dim_256_cosine_ndcg@10: 0.7735280051901218


In [None]:
# Report recall@5 for each embedding dimension (fine-tuned model)
recall_keys = [f"dim_{dim}_cosine_recall@5" for dim in matryoshka_dimensions]
for key in recall_keys:
  print(f"{key}: {results[key]}")

dim_512_cosine_recall@5: 0.8428529584065613
dim_256_cosine_recall@5: 0.8409490333919156


### **Push Model to HuggingFace**

In [None]:
# Display the module stack of the in-memory model (the object passed to the trainer).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 510, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 512, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
import os
from google.colab import userdata

# Publishing is currently commented out. Uncommenting the lines below would read
# the token stored under the Colab secret "HF_WRITE" into HF_TOKEN and push the
# trained model to the Hub.
# NOTE(review): the `google.colab` import presumably only resolves inside Colab —
# confirm before running this notebook elsewhere.
# os.environ["HF_TOKEN"] = userdata.get("HF_WRITE")

# # push model to hub
# trainer.model.push_to_hub("roberta-amharic-embed-medium")

### **Example**

In [None]:
# The sentences to encode
sentences = [
  "የተደጋገመው የመሬት መንቀጥቀጥና የእሳተ ገሞራ ምልክት በአፋር ክልል",
  "በአክሱም ከተማ የሚገኙ ሙስሊም ሴት ተማሪዎች ከሒጃብ መልበስ ጋር በተያያዘ ውዝግብ ከትምህርት ገበታ ውጭ ሆነው እንደሚገኙ የትግራይ እስልምና ጉዳዮች ምክርቤት ስታወቀ። ይህን ለመፍታት ከክልሉ ትምህርት ቢሮ ጋር ንግግር ላይ መሆኑም የክልሉ እስልምና ጉዳዮች ምክርቤት ለዶቼቬለ ገልጿል።",
  "በማዕከላዊ ኢትዮጵያ ክልል ሃድያ ዞን ጊቤ ወረዳ በሚገኙ 12 ቀበሌዎች መሠረታዊ የመንግሥት አገልግሎት መስጫ ተቋማት በሙሉና በከፊል በመዘጋታቸው መቸገራቸውን ነዋሪዎች አመለከቱ። ከባለፈው ዓመት ጀምሮ የጤና፣ የትምህርት እና የግብር አሰባሰብ ሥራዎች በየአካባቢያቸው እየተከናወኑ አለመሆናቸውንም ለዶቼ ቬለ ተናግረዋል።",
  "የሕዝብ ተወካዮች ምክር ቤት አባል እና የቋሚ ኮሚቴ ሰብሳቢ የነበሩት አቶ ክርስቲያን ታደለ እና የአማራ ክልል ምክር ቤት አባል የሆኑት አቶ ዮሐንስ ቧያለው ከቃሊቲ ወደ ቂሊንጦ ማረሚያ ቤት መዛወራቸውን ጠበቃቸው ተናገሩ።",
  "ከተደጋጋሚ መሬት መንቀጥቀጥ በኋላ አፋር ክልል እሳት ከመሬት ውስጥ ሲፈላ ታይቷል፡፡ ከመሬት ውስጥ እሳትና ጭስ የሚተፋው እንፋሎቱ ዛሬ ማለዳውን 11 ሰዓት ግድም ከከባድ ፍንዳታ በኋላየተስተዋለ መሆኑን የአከባቢው ነዋሪዎች እና ባለስልጣናት ለዶቼ ቬለ ተናግረዋል፡፡ አለት የሚያፈናጥር እሳት ነው የተባለው እንፋሎቱ በክልሉ ጋቢረሱ (ዞን 03) ዱለቻ ወረዳ ሰጋንቶ ቀበሌ መከሰቱን የገለጹት የአከባቢው የአይን እማኞች ከዋናው ፍንዳታ በተጨማሪ በዙሪያው ተጨማሪ ፍንዳታዎች መታየት ቀጥሏል ባይ ናቸው፡፡"
]

# Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# (5, 512): one 512-dimensional vector for each of the 5 sentences

# Calculate the pairwise similarities between all sentence embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities)

(5, 512)
tensor([[ 1.0000,  0.0719,  0.1940, -0.0669,  0.6363],
        [ 0.0719,  1.0000,  0.1997,  0.2312,  0.1994],
        [ 0.1940,  0.1997,  1.0000,  0.0721,  0.2905],
        [-0.0669,  0.2312,  0.0721,  1.0000,  0.1320],
        [ 0.6363,  0.1994,  0.2905,  0.1320,  1.0000]])
