### **Load Dataset**

In [1]:
# Install the HuggingFace Datasets and Evaluate libraries

! pip install --quiet datasets evaluate accelerate

In [2]:
# Load the Amharic news category classification dataset by its repository id
from datasets import load_dataset

DATASET_ID = "rasyosef/amharic-news-category-classification"
news_dataset = load_dataset(DATASET_ID)
news_dataset

Downloading readme:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/86.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49971 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label'],
        num_rows: 49971
    })
})

In [3]:
# Hold out 20% of the articles for evaluation; the fixed seed makes the split repeatable
split_options = {"train_size": 0.8, "seed": 42}
raw_datasets = news_dataset['train'].train_test_split(**split_options)

print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label'],
        num_rows: 39976
    })
    test: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label'],
        num_rows: 9995
    })
})


In [4]:
# Peek at the first record of each split, then at the column schema of the train split
for split_name in ('train', 'test'):
  print(raw_datasets[split_name][0])
print(raw_datasets['train'].features)

{'headline': 'ዘላቂ  ልማትን ለማረጋገጥ  ያለመው  የአዲስ  አበባ የልማት አጀንዳ  ተቀባይነት አገኘ', 'category': 'ፖለቲካ', 'date': 'July 17, 2015', 'views': 'Unknown', 'article': 'በዛሬው ዕለት 3ኛው የፋይናንስ ጉባኤ ለልማት ጉባኤ ማጠቃለያ ላይ የኤፌዴሪ ጠቅላይ ሚኒስትርና የጉባኤው ፕሬዚዳንት ኃይለማሪያም ደሳለኝ እንደገለጹት ዘላቂ ልማት ለማረጋገጥ የሚያስችለውን የአዲስ አበባ የልማት አጀንዳ መሪ ዕቅድ ተግባራዊ ከአራት ቀን ጉባኤ በኋላ ስምምነትን አግኝቷል ።እንደ ጠቅላይ ሚኒስትር ኃይለማሪያም ገለጻ የፋይናንስ ለልማተ ጉባኤ የተካሄደበት \xa0ሳምንት በዓለም ልማት ታሪክ ላይ ጉልህ ሥፍራ እንዳለው በመጠቆም \xa0በንም \xa0በልማት ወደ ኋላ አይቀርም የሚለውን መርህ \xa0በታላቅ ጉጉት እየጠበቀው ይገኛል ብለዋል ።በጉባኤ \xa0ያደጉ አገራት አብዛኛውን የልማት ድጋፍ ገና በማደግ ላይ ላሉ አገራት ለመመደብ \xa0መስማማታቸው \xa0የጉባኤ ትልቅ ውጤት ነው ያሉት ጠቅላይ ሚኒስትሩ \xa0ያደጉ አገራት ከአጠቃላይ አገራዊ ገቢያቸው \xa0ከዜሮነጥብ7 የሚሆነውን \xa0ለልማት ድጋፍ እንዲያውሉና ከዚህም ውስጥ \xa0ከ ዜሮ ነጥብ 15 እስከ ዜሮ ነጥብ ሃያ ድረስ \xa0በድህነት ጫፍ ላይ ለሚገኙ አገራት እንደሚመደብ አስረድተዋል ።ጉባኤው \xa0የግሉ ዘርፍ \xa0ለልማት ያለውን አስተዋጽኦ አጉልቶ ማውጣቱን የጠቆሙት ጠቅላይ ሚኒስትሩ \xa0 የመንግሥትና የግል ዘርፉ የኢንቨስትመንት መዋለ ንዋይ ትክክለኛ የኢኮኖሚ ዕድገትን እንደሚያመጣ \xa0አብራርተዋል ።ኢትዮጵያ \xa0እኤአ በ2025 \xa0መካከለኛ \xa0ገቢ ካላቸው አገራት ለማሰለፍ ከወዲሁ ውጥን ተይዞ እየተሠራ መሆኑንየጠቀሱት ጠቅላይ ሚኒሰትሩ የውጭ ቀ

In [5]:
# Drop articles that are too short: keep only rows whose `word_len` is at least 32

def has_enough_words(example):
  """Return True when the example's `word_len` value is 32 or more."""
  return example['word_len'] >= 32

raw_datasets = raw_datasets.filter(has_enough_words)
raw_datasets

Filter:   0%|          | 0/39976 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9995 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label'],
        num_rows: 38966
    })
    test: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label'],
        num_rows: 9735
    })
})

In [6]:
# Ordered list of category names taken from the `label` feature of the train split
label_feature = raw_datasets['train'].features['label']
categories = label_feature.names
categories

['ሀገር አቀፍ ዜና', 'መዝናኛ', 'ስፖርት', 'ቢዝነስ', 'ዓለም አቀፍ ዜና', 'ፖለቲካ']

In [7]:
# Build the model input by joining the headline and the article body with a newline

def add_full_article(example):
  """Return the new `full_article` column: headline, a newline, then the article text."""
  return {"full_article": "\n".join((example["headline"], example["article"]))}

raw_datasets = raw_datasets.map(add_full_article)
raw_datasets

Map:   0%|          | 0/38966 [00:00<?, ? examples/s]

Map:   0%|          | 0/9735 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label', 'full_article'],
        num_rows: 38966
    })
    test: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label', 'full_article'],
        num_rows: 9735
    })
})

### **Preprocessing the dataset**

In [8]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Identifier of the pretrained checkpoint; the same name is used to load the tokenizer
checkpoint = "rasyosef/bert-small-amharic"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

# Tokenization helper applied to the dataset

def tokenize_function(example):
  """Tokenize the `full_article` field, truncating each input to at most 512 tokens."""
  tokenizer_options = {"truncation": True, "max_length": 512}
  return tokenizer(example['full_article'], **tokenizer_options)

# Use a data collator for dynamic padding: examples are padded when a batch is
# assembled, and the collated batch is returned as PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

# Batched map: tokenize_function receives a batch of examples per call
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

print(tokenized_datasets)



tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/608k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Map:   0%|          | 0/38966 [00:00<?, ? examples/s]

Map:   0%|          | 0/9735 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label', 'full_article', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 38966
    })
    test: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'word_len', 'label', 'full_article', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9735
    })
})


### **Finetuning the Model**

In [9]:
# Load the model

from transformers import AutoModelForSequenceClassification

# Load `checkpoint` with a sequence-classification head sized to the number of
# categories, and record the id <-> category-name mappings in the model config
id2label = dict(enumerate(categories))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    device_map="cuda"
)

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/103M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rasyosef/bert-small-amharic and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments

import dataclasses

batch_size = 64
epochs = 5

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later dropped. The install cell does not pin
# a version, so pick whichever keyword the installed TrainingArguments accepts.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=checkpoint+"-finetuned",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.1,
    save_strategy="epoch",
    logging_strategy="epoch",
    # Evaluate once per epoch (must match save_strategy for load_best_model_at_end)
    **{_eval_strategy_key: "epoch"},
    # Keep the checkpoint with the best macro F1 reported by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
    seed=42,
)

In [11]:
import evaluate
import numpy as np

# Metric objects are loaded on first use and then reused, so the metric scripts
# are not re-loaded on every evaluation pass.
_METRIC_CACHE = {}

def _load_metric(name):
  """Return the `evaluate` metric called `name`, loading it only the first time."""
  if name not in _METRIC_CACHE:
    _METRIC_CACHE[name] = evaluate.load(name)
  return _METRIC_CACHE[name]

def compute_metrics(eval_preds):
  """Compute accuracy plus macro-averaged precision, recall and F1.

  Args:
    eval_preds: a `(logits, labels)` pair; the predicted class of each example
      is the argmax of its logits over the last axis.

  Returns:
    A dict with the keys "accuracy", "precision", "recall" and "f1".
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  scores = {
      "accuracy": _load_metric("accuracy").compute(
          predictions=predictions, references=labels
      )["accuracy"]
  }
  # The remaining metrics are macro-averaged: every class counts equally
  for name in ("precision", "recall", "f1"):
    scores[name] = _load_metric(name).compute(
        predictions=predictions, references=labels, average='macro'
    )[name]

  return scores

# Smoke-test compute_metrics on two toy examples whose argmax equals the labels
toy_logits = [[1, 0], [0, 1]]
toy_labels = [0, 1]
compute_metrics((toy_logits, toy_labels))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}

In [12]:
from transformers import Trainer

# Assemble the Trainer; the "test" split is used as the evaluation set
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4776,0.369994,0.857935,0.78685,0.863417,0.817124
2,0.2854,0.316417,0.884335,0.83609,0.875253,0.853451
3,0.2087,0.3277,0.885259,0.874765,0.845727,0.859123
4,0.153,0.333251,0.889266,0.855452,0.874304,0.864207
5,0.1112,0.353918,0.889471,0.864716,0.859278,0.861901


TrainOutput(global_step=3045, training_loss=0.2471609958091197, metrics={'train_runtime': 683.4138, 'train_samples_per_second': 285.084, 'train_steps_per_second': 4.456, 'total_flos': 7706690641981440.0, 'train_loss': 0.2471609958091197, 'epoch': 5.0})

### **Model Predictions**

In [13]:
# Evaluate the model on the test split with a manual inference loop,
# collecting predictions and reference labels for the reports below

from torch.utils.data import DataLoader

import evaluate
import torch

# Drop the raw text/metadata columns and rename the target column to "labels"
eval_dataset = tokenized_datasets["test"].remove_columns([
    'headline', 'category', 'date', 'views',
    'article', 'link', 'word_len', 'full_article'
    ]).rename_column("label", "labels").with_format("torch")

print(eval_dataset.column_names)

eval_dataloader = DataLoader(
    eval_dataset,
    # No shuffling for evaluation: metrics do not depend on order, and a fixed
    # order keeps y_pred / y_test aligned with the rows of the test split
    shuffle=False,
    batch_size=64,
    collate_fn=data_collator,
)

y_pred, y_test = [], []

metric = evaluate.load("f1")
# Send batches to whichever device the model is on instead of hard-coding 'cuda'
device = model.device
model.eval()
for batch in eval_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    outputs = model(**batch)

  predictions = torch.argmax(outputs.logits, dim=-1)
  metric.add_batch(predictions=predictions, references=batch["labels"])
  y_pred.extend(predictions.cpu().numpy())
  y_test.extend(batch["labels"].cpu().numpy())
metric.compute(average='macro')

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


{'f1': 0.864207254098607}

In [14]:
# Sanity check: the number of collected predictions and reference labels
tuple(len(collected) for collected in (y_pred, y_test))

(9735, 9735)

In [15]:
# Re-score the collected predictions using the 'weighted' F1 average
f1_options = {"predictions": y_pred, "references": y_test, "average": 'weighted'}
metric.compute(**f1_options)

{'f1': 0.8900429088796132}

In [16]:
from sklearn import metrics

# Confusion matrix of the reference labels (y_test) against the predictions (y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
confusion

array([[3413,    6,   19,  180,  105,  235],
       [  16,   90,    1,    1,    1,    0],
       [   7,    0, 1943,    0,    1,    1],
       [ 108,    5,    0,  593,    6,   66],
       [  55,    3,    0,    4, 1037,    8],
       [ 129,    5,    3,   93,   20, 1581]])

In [17]:
from sklearn import metrics

# Label the report rows with the category names instead of bare class indices.
# Passing `labels` explicitly keeps names and rows aligned even if some class
# does not occur in y_test / y_pred.
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=list(range(len(categories))),
    target_names=categories,
))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89      3958
           1       0.83      0.83      0.83       109
           2       0.99      1.00      0.99      1952
           3       0.68      0.76      0.72       778
           4       0.89      0.94      0.91      1107
           5       0.84      0.86      0.85      1831

    accuracy                           0.89      9735
   macro avg       0.86      0.87      0.86      9735
weighted avg       0.89      0.89      0.89      9735

