In [None]:
! pip install -Uq transformers sentence_transformers datasets

In [None]:
# Initialise Weights & Biases in "disabled" mode for this session.
import wandb
wandb.init(mode="disabled")

### **Load Dataset**

In [None]:
from datasets import load_dataset

# Hub id of the retrieval dataset used throughout this notebook.
dataset_name = "yosefw/amharic-news-retrieval-dataset-v2-with-negatives-V2"

dataset = load_dataset(dataset_name)
dataset

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


DatasetDict({
    train: Dataset({
        features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link', 'source_dataset', 'negative_passages'],
        num_rows: 61469
    })
    test: Dataset({
        features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link', 'source_dataset', 'negative_passages'],
        num_rows: 6832
    })
})

In [None]:
# Rename the text columns to the anchor/positive names used by the rest of the
# notebook. Only columns that are still present are renamed, so re-running this
# cell after the rename has already happened is a no-op instead of an error.
column_renames = {"query": "anchor", "passage": "positive"}
pending_renames = {
    old_name: new_name
    for old_name, new_name in column_renames.items()
    if old_name in dataset["train"].column_names
}
if pending_renames:
    dataset = dataset.rename_columns(pending_renames)
dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset', 'negative_passages'],
        num_rows: 61469
    })
    test: Dataset({
        features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset', 'negative_passages'],
        num_rows: 6832
    })
})

In [None]:
from datasets import Dataset
from tqdm import tqdm
import random


def pick_negatives(neg_passages, n_head=2, n_tail=2):
  """Return the first `n_head` and last `n_tail` entries of `neg_passages`.

  The tail is taken only from the entries that come after the head, so a list
  shorter than `n_head + n_tail` contributes each entry at most once. (Slicing
  `[:2] + [-2:]` directly would repeat entries for lists of fewer than 4 items.)
  For lists with at least `n_head + n_tail` entries the result is the same as
  `neg_passages[:n_head] + neg_passages[-n_tail:]`.
  """
  head = neg_passages[:n_head]
  remainder = neg_passages[n_head:]
  # `remainder[-0:]` would return the whole list, so handle n_tail == 0 explicitly.
  tail = remainder[-n_tail:] if n_tail > 0 else []
  return head + tail


# Expand every training row into one (anchor, positive, negative) triplet per
# selected negative passage.
# NOTE(review): negatives are not filtered against test passages here; an earlier
# draft had a commented-out filter on `test_passage_ids`, which is not defined in
# the visible part of this notebook.
ds_rows = []
for row in tqdm(dataset["train"]):
  for neg_passage in pick_negatives(row["negative_passages"]):
    ds_rows.append({
        "query_id": row["query_id"],
        "passage_id": row["passage_id"],
        "anchor": row["anchor"],
        "positive": row["positive"],
        "negative": neg_passage["passage"],
      })

# Shuffle with a fixed seed so the triplet order is reproducible.
relevance_dataset = Dataset.from_list(ds_rows).shuffle(seed=42)
relevance_dataset

100%|██████████| 61469/61469 [00:26<00:00, 2296.13it/s]


Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'negative'],
    num_rows: 245876
})

### **Initialize SPLADE Model**

In [None]:
from sentence_transformers import SparseEncoder, SparseEncoderModelCardData

# (Optional) metadata that ends up in the generated model card.
model_card = SparseEncoderModelCardData(
    language="am",
    license="mit",
    model_name="SPLADE-RoBERTa-Amharic-Medium",
)

# 1. Load the base checkpoint to fine-tune as a sparse encoder.
model = SparseEncoder("rasyosef/roberta-medium-amharic", model_card_data=model_card)



### **Evaluator**

In [None]:
from datasets import concatenate_datasets

# Keep a handle on each split; the retrieval corpus is built from both of them.
train_dataset, test_dataset = dataset["train"], dataset["test"]
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

corpus_dataset

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'source_dataset', 'negative_passages'],
    num_rows: 68301
})

In [None]:
# Turn the dataset columns into the id -> text lookups used by the evaluator.

# Corpus: passage id => passage text, over the train + test passages.
corpus = {
    passage_id: passage_text
    for passage_id, passage_text in zip(
        corpus_dataset["passage_id"], corpus_dataset["positive"]
    )
}

# Queries: query id => query text, test split only.
queries = {
    query_id: query_text
    for query_id, query_text in zip(test_dataset["query_id"], test_dataset["anchor"])
}

In [None]:
# Map each test query id to the set of passage ids that are relevant to it.
# Entries are accumulated, so a query id that appears in more than one row keeps
# all of its passages instead of only the last one seen. Only the two id columns
# are read, which avoids materialising every other column for each row.
relevant_docs = {}
for query_id, passage_id in zip(test_dataset["query_id"], test_dataset["passage_id"]):
  relevant_docs.setdefault(query_id, set()).add(passage_id)

In [None]:
from sentence_transformers.sparse_encoder.evaluation import SparseInformationRetrievalEvaluator

# Retrieval evaluator built from the test queries, the train + test passage
# corpus, and the query -> relevant passage mapping defined in the cells above.
# The same object is handed to the trainer and called directly after training.
evaluator = SparseInformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    batch_size=128,
    corpus_chunk_size=2048,
    show_progress_bar=False  # keep per-evaluation progress bars out of the cell output
)

In [None]:
# Optional: uncomment to measure the base model's retrieval metrics before fine-tuning.
# evaluator(model)

### **Train**

In [None]:
from sentence_transformers.sparse_encoder.losses import SparseMultipleNegativesRankingLoss, SpladeLoss

# 4. Define a loss function
# The ranking loss is built first and then wrapped by SpladeLoss, which also
# takes separate regularizer weights for queries and documents.
ranking_loss = SparseMultipleNegativesRankingLoss(model=model)
loss = SpladeLoss(
    model=model,
    loss=ranking_loss,
    query_regularizer_weight=5e-3,
    document_regularizer_weight=3e-3,
)

In [None]:
from sentence_transformers import SparseEncoderTrainer, SparseEncoderTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Values reused by the training arguments below (and by the Hub upload later on).
run_name = "SPLADE-RoBERTa-Amharic-Medium"
num_epochs = 4
batch_size = 48

# 5. Specify training arguments
args = SparseEncoderTrainingArguments(
    # Output location and run label
    output_dir=f"models/{run_name}",
    run_name=run_name,  # Will be used in W&B if `wandb` is installed
    # Epochs and batch sizes
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optimizer and learning-rate schedule
    optim="adamw_torch_fused",
    learning_rate=6e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    # Evaluate, checkpoint and log once per epoch; keep at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=2,
)

# The trainer only receives the three text columns of the triplet dataset;
# the id columns are dropped here.
triplet_columns = ['anchor', 'positive', 'negative']
train_triplets = relevance_dataset.select_columns(triplet_columns)

# 7. Create a trainer
trainer = SparseEncoderTrainer(
    model=model,
    args=args,
    train_dataset=train_triplets,
    loss=loss,
    evaluator=evaluator,
)



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
# Train: run the trainer configured in the previous cell.
# The result of `trainer.train()` is the cell's displayed output.
trainer.train()

Epoch,Training Loss,Validation Loss,Dot Accuracy@1,Dot Accuracy@3,Dot Accuracy@5,Dot Accuracy@10,Dot Precision@1,Dot Precision@3,Dot Precision@5,Dot Precision@10,Dot Recall@1,Dot Recall@3,Dot Recall@5,Dot Recall@10,Dot Ndcg@10,Dot Mrr@10,Dot Map@100,Query Active Dims,Query Sparsity Ratio,Corpus Active Dims,Corpus Sparsity Ratio,Regularizer Weight
1,46.9605,No log,0.63635,0.801992,0.845929,0.883714,0.63635,0.267331,0.169186,0.088371,0.63635,0.801992,0.845929,0.883714,0.76528,0.72671,0.730368,144.747513,0.995477,354.930954,0.988908,0.002812
2,0.074,No log,0.607645,0.783538,0.832748,0.879613,0.607645,0.261179,0.16655,0.087961,0.607645,0.783538,0.832748,0.879613,0.74761,0.704845,0.70849,95.914322,0.997003,166.01815,0.994812,0.005
3,0.0308,No log,0.620533,0.81034,0.857645,0.896016,0.620533,0.270113,0.171529,0.089602,0.620533,0.81034,0.857645,0.896016,0.765931,0.723379,0.726647,68.488869,0.99786,140.830855,0.995599,0.005
4,0.0179,No log,0.628588,0.810779,0.858084,0.895577,0.628588,0.27026,0.171617,0.089558,0.628588,0.810779,0.858084,0.895577,0.769449,0.72823,0.731418,60.958847,0.998095,117.930273,0.996315,0.005


TrainOutput(global_step=20492, training_loss=11.770830963822796, metrics={'train_runtime': 7901.9466, 'train_samples_per_second': 124.464, 'train_steps_per_second': 2.593, 'total_flos': 0.0, 'train_loss': 11.770830963822796, 'epoch': 4.0, 'document_regularizer_weight': 0.003, 'query_regularizer_weight': 0.005})

In [None]:
# 8. Evaluate the fine-tuned model with the retrieval evaluator
# (the evaluator call before training is commented out earlier in the notebook).
evaluator(model)

{'dot_accuracy@1': 0.6285881663737551,
 'dot_accuracy@3': 0.8107791446983011,
 'dot_accuracy@5': 0.8580843585237259,
 'dot_accuracy@10': 0.895577035735208,
 'dot_precision@1': 0.6285881663737551,
 'dot_precision@3': 0.2702597148994337,
 'dot_precision@5': 0.17161687170474518,
 'dot_precision@10': 0.0895577035735208,
 'dot_recall@1': 0.6285881663737551,
 'dot_recall@3': 0.8107791446983011,
 'dot_recall@5': 0.8580843585237259,
 'dot_recall@10': 0.895577035735208,
 'dot_ndcg@10': 0.7694492243435073,
 'dot_mrr@10': 0.7282295240884877,
 'dot_map@100': 0.731417730197726,
 'query_active_dims': 60.95884704589844,
 'query_sparsity_ratio': 0.9980950360298156,
 'corpus_active_dims': 117.9302729767245,
 'corpus_sparsity_ratio': 0.9963146789694772}

In [None]:
# 9. Save the trained model to a local "final" directory
# (plain string literal: the path has no placeholders, so no f-string is needed).
model.save_pretrained("./final")

In [None]:
# 10. (Optional) Push it to the Hugging Face Hub

import os
from google.colab import userdata

# Read the write token from the Colab secret named "HF_WRITE" and expose it
# through the HF_TOKEN environment variable.
hf_write_token = userdata.get("HF_WRITE")
os.environ["HF_TOKEN"] = hf_write_token

# Push the model to the Hub under the run name; exist_ok=True is passed so an
# already existing repository is accepted.
trainer.model.push_to_hub(run_name, exist_ok=True)

### **Testing**

In [None]:
# Display the encoder's module stack (the cell output is the model's repr).
model

SparseEncoder(
  (0): MLMTransformer({'max_seq_length': 510, 'do_lower_case': False, 'architecture': 'XLMRobertaForMaskedLM'})
  (1): SpladePooling({'pooling_strategy': 'max', 'activation_function': 'relu', 'word_embedding_dimension': 32000})
)

In [None]:
sentences = [
    "የተደጋገመው የመሬት መንቀጥቀጥና የእሳተ ገሞራ ምልክት በአፋር ክልል",
    "በማዕከላዊ ኢትዮጵያ ክልል ሃድያ ዞን ጊቤ ወረዳ በሚገኙ 12 ቀበሌዎች መሠረታዊ የመንግሥት አገልግሎት መስጫ ተቋማት በሙሉና በከፊል በመዘጋታቸው መቸገራቸውን ነዋሪዎች አመለከቱ። ከባለፈው ዓመት ጀምሮ የጤና፣ የትምህርት እና የግብር አሰባሰብ ሥራዎች በየአካባቢያቸው እየተከናወኑ አለመሆናቸውንም ለዶቼ ቬለ ተናግረዋል።",
    "የሕዝብ ተወካዮች ምክር ቤት አባል እና የቋሚ ኮሚቴ ሰብሳቢ የነበሩት አቶ ክርስቲያን ታደለ እና የአማራ ክልል ምክር ቤት አባል የሆኑት አቶ ዮሐንስ ቧያለው ከቃሊቲ ወደ ቂሊንጦ ማረሚያ ቤት መዛወራቸውን ጠበቃቸው ተናገሩ።",
    "ከተደጋጋሚ መሬት መንቀጥቀጥ በኋላ አፋር ክልል እሳት ከመሬት ውስጥ ሲፈላ ታይቷል፡፡ ከመሬት ውስጥ እሳትና ጭስ የሚተፋው እንፋሎቱ ዛሬ ማለዳውን 11 ሰዓት ግድም ከከባድ ፍንዳታ በኋላየተስተዋለ መሆኑን የአከባቢው ነዋሪዎች እና ባለስልጣናት ለዶቼ ቬለ ተናግረዋል፡፡ አለት የሚያፈናጥር እሳት ነው የተባለው እንፋሎቱ በክልሉ ጋቢረሱ (ዞን 03) ዱለቻ ወረዳ ሰጋንቶ ቀበሌ መከሰቱን የገለጹት የአከባቢው የአይን እማኞች ከዋናው ፍንዳታ በተጨማሪ በዙሪያው ተጨማሪ ፍንዳታዎች መታየት ቀጥሏል ባይ ናቸው፡፡"
  ]

# Encode the example sentences into sparse embeddings.
embeddings = model.encode(sentences)

# Pairwise similarity of every sentence with every other sentence.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

# Decode each embedding back into its 32 highest-weighted tokens.
# The loop variable has its own name so `decoded` keeps referring to the full
# list of per-sentence results instead of being rebound to the last element.
decoded = model.decode(embeddings, top_k=32)
for sentence, decoded_tokens in zip(sentences, decoded):
    print(f"Sentence: {sentence}")
    print(f"Decoded: {decoded_tokens}")
    print()

tensor([[40.6787,  1.2052,  0.0000, 22.8483],
        [ 1.2052, 74.6232,  3.8863,  6.7954],
        [ 0.0000,  3.8863, 41.9924,  0.0000],
        [22.8483,  6.7954,  0.0000, 74.5851]], device='cuda:0')
Sentence: የተደጋገመው የመሬት መንቀጥቀጥና የእሳተ ገሞራ ምልክት በአፋር ክልል
Decoded: [('▁ገሞራ', 2.13671875), ('ሳተ', 1.8203125), ('▁መንቀጥቀጥ', 1.7978515625), ('▁በአፋር', 1.7685546875), ('▁ምልክት', 1.7080078125), ('▁የአፋር', 1.4609375), ('▁አፋር', 1.3916015625), ('ተደጋገመ', 1.3564453125), ('▁የመሬት', 1.3466796875), ('▁ምልክቶች', 1.185546875), ('▁በመሬት', 1.130859375), ('▁ፍንዳታ', 1.0791015625), ('▁ተአምር', 1.0654296875), ('መሬት', 1.048828125), ('▁የእ', 1.0322265625), ('▁መሬት', 0.97412109375), ('▁በእ', 0.90869140625), ('▁እንስሳ', 0.78173828125), ('▁ጥናቱ', 0.65869140625), ('▁ዓሣ', 0.6572265625), ('▁ምንድን', 0.61572265625), ('▁የድንጋይ', 0.59326171875), ('▁ጎርፍ', 0.59326171875), ('▁በሶማሌ', 0.583984375), ('ሚክ', 0.568359375), ('▁ተመራማሪዎች', 0.517578125), ('ፋር', 0.5087890625), ('▁አሞራ', 0.5087890625), ('▁ማሳያ', 0.5048828125), ('▁ከምድር', 0.49267578125), ('▁ቀበሌ'

In [None]:
# Report how sparse the embeddings of the example sentences are.
stats = SparseEncoder.sparsity(embeddings)
sparsity_ratio = stats["sparsity_ratio"]
active_dims = stats["active_dims"]

print(f"Sparsity: {sparsity_ratio:.2%}")  # Typically >99% zeros
print(f"Avg non-zero dimensions per embedding: {active_dims:.2f}")

Sparsity: 99.69%
Avg non-zero dimensions per embedding: 99.50
