### **Download Sentiment Dataset**

In [1]:
! pip install -Uq datasets evaluate accelerate

In [2]:
! pip uninstall -y wandb

[0m

In [3]:
from datasets import load_dataset

# Load the "rasyosef/amharic-sentiment" dataset by its repository id.
sentiment_dataset = load_dataset("rasyosef/amharic-sentiment")
# Bare last expression so the notebook displays the summary of the splits.
sentiment_dataset

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


DatasetDict({
    train: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 2223
    })
    dev: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 279
    })
    test: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 279
    })
})

In [4]:
sentiment_dataset["train"].features

{'clean_tweet': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None)}

In [5]:
label_names = sentiment_dataset["train"].features["label"].names
label_names

['negative', 'positive']

### **Tokenize Tweets**

In [6]:
from transformers import AutoTokenizer

# Checkpoint id; it is reused later for the model and the output directory.
model_id = "Davlan/afro-xlmr-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Sanity check: show how a sample sentence containing <mask> is split into tokens.
print(tokenizer.tokenize("ከሀገራቸው ከኢትዮጵያ ከወጡ ግማሽ ምዕተ <mask> ተቆጥሯል።"))

['▁ከ', 'ሀ', 'ገ', 'ራቸው', '▁ከኢትዮጵያ', '▁ከ', 'ወጡ', '▁', 'ግማሽ', '▁', 'ምዕ', 'ተ', ' <mask>', '▁ተ', 'ቆ', 'ጥ', 'ሯል።']


In [7]:
def tokenize_dataset(samples):
  """Tokenize the ``clean_tweet`` texts of a batch of samples.

  Args:
    samples: Mapping whose ``"clean_tweet"`` entry holds the texts to encode.

  Returns:
    The tokenizer output for those texts, truncated to at most 512 tokens.
  """
  return tokenizer(samples["clean_tweet"], truncation=True, max_length=512)

tokenize_dataset({"clean_tweet":["ከሀገራቸው ከኢትዮጵያ ከወጡ ግማሽ"]})

{'input_ids': [[0, 1464, 21608, 5430, 66052, 165627, 1464, 87365, 6, 230446, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [8]:
# Tokenize every split in batches and drop the raw text column afterwards.
preprocessed_datasets = sentiment_dataset.map(
    tokenize_dataset, batched=True, remove_columns=["clean_tweet"]
)
preprocessed_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2223
    })
    dev: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 279
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 279
    })
})

### **Load Afro-XLMR model**

In [9]:
# Two-way mapping between class indices and class names.
id2label = dict(enumerate(label_names))
label2id = {name: index for index, name in id2label.items()}

print(id2label, label2id)

{0: 'negative', 1: 'positive'} {'negative': 0, 'positive': 1}


In [10]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint for sequence classification with one output
# per dataset label. The label maps are passed along so predictions can be
# reported by class name (the pipeline cells further down print
# 'positive' / 'negative').
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
    # device_map="cuda"
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Finetuning**

In [11]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [12]:
from transformers import TrainingArguments

batch_size = 32
epochs = 3

# Hyperparameters for fine-tuning; evaluation, checkpointing and logging all
# happen once per epoch.
training_args = TrainingArguments(
    output_dir=model_id+"-finetuned",
    learning_rate=2e-5,
    # lr_scheduler_type="linear",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.1,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # load_best_model_at_end=True,
    # metric_for_best_model="f1",
    fp16=True,
    # No experiment tracker is used in this notebook, so switch the reporting
    # integrations off explicitly instead of relying on what is installed.
    report_to="none",
    seed=42,
)



In [13]:
import evaluate
import numpy as np

# Metrics already loaded, keyed by name, so each one is loaded a single time
# instead of on every evaluation pass.
_LOADED_METRICS = {}


def _get_metric(name):
  """Return the `evaluate` metric called `name`, loading it on first use only."""
  if name not in _LOADED_METRICS:
    # Function-scope import keeps this helper self-contained; Python caches the
    # module after the first import, so repeated calls cost nothing extra.
    import evaluate
    _LOADED_METRICS[name] = evaluate.load(name)
  return _LOADED_METRICS[name]


def compute_metrics(eval_preds):
  """Compute the accuracy for one evaluation pass.

  Args:
    eval_preds: A `(logits, labels)` pair; the predicted class of each sample
      is the argmax of its logits over the last axis.

  Returns:
    A dict with a single `"accuracy"` entry.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]

  # Precision, recall and F1 can be added the same way when needed, e.g.
  # _get_metric("f1").compute(predictions=predictions, references=labels, average='macro')["f1"]
  return {
      "accuracy": accuracy,
  }

compute_metrics((np.array([[1,0], [0,1]]), np.array([0,1])))

{'accuracy': 1.0}

In [14]:
from transformers import Trainer

# Train on the "train" split and evaluate on the "dev" split; the "test" split
# is not given to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_datasets["train"],
    eval_dataset=preprocessed_datasets["dev"],
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
trainer.evaluate()

{'eval_loss': 0.7379811406135559,
 'eval_model_preparation_time': 0.0089,
 'eval_accuracy': 0.5376344086021505,
 'eval_runtime': 6.523,
 'eval_samples_per_second': 42.772,
 'eval_steps_per_second': 1.38}

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.5178,0.328802,0.0089,0.888889
2,0.3566,0.341656,0.0089,0.892473
3,0.2607,0.328719,0.0089,0.896057


TrainOutput(global_step=210, training_loss=0.37836009434291296, metrics={'train_runtime': 406.5334, 'train_samples_per_second': 16.405, 'train_steps_per_second': 0.517, 'total_flos': 1025914457105496.0, 'train_loss': 0.37836009434291296, 'epoch': 3.0})

In [17]:
trainer.evaluate()

{'eval_loss': 0.3287185728549957,
 'eval_model_preparation_time': 0.0089,
 'eval_accuracy': 0.8960573476702509,
 'eval_runtime': 6.0543,
 'eval_samples_per_second': 46.083,
 'eval_steps_per_second': 1.487,
 'epoch': 3.0}

### **Evaluate Model**

In [18]:
import torch

EVAL_BATCH_SIZE = 5  # samples per forward pass

test_split = preprocessed_datasets["test"]
num_samples = len(test_split)

y_pred = []

# Inference only: put the model in eval mode and skip autograd bookkeeping so
# no computation graph is kept in memory for each batch.
model.eval()
with torch.no_grad():
  for start in range(0, num_samples, EVAL_BATCH_SIZE):
    stop = min(start + EVAL_BATCH_SIZE, num_samples)
    # Collate per batch so each batch is padded only to its own longest sample.
    batch = data_collator([test_split[i] for i in range(start, stop)])
    # Labels are left out (no loss is needed); tensors follow the model's device
    # instead of a hard-coded "cuda".
    model_inputs = {k: v.to(model.device) for k, v in batch.items() if k != "labels"}
    output = model(**model_inputs)
    y_pred.extend(output.logits.argmax(dim=-1).tolist())

y_test = np.array(test_split["label"])

print(y_pred)
print(y_test)

[1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0]
[1 1 0 1 0 0 1 1 0 0 1 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1
 0 1 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 1 1 1 0 0 1 0
 0 1 0 1 1 0

In [19]:
from sklearn import metrics

metrics.confusion_matrix(y_test, y_pred)

array([[130,  20],
       [ 20, 109]])

In [20]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       150
           1       0.84      0.84      0.84       129

    accuracy                           0.86       279
   macro avg       0.86      0.86      0.86       279
weighted avg       0.86      0.86      0.86       279



### **Test in Pipeline**

In [21]:
from transformers import pipeline

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
pipe("ቆንጆ ፊልም ነው")

Device set to use cuda:0


[{'label': 'positive', 'score': 0.9790157079696655}]

In [22]:
pipe("አሪፍ ልብስ")

[{'label': 'positive', 'score': 0.8144221305847168}]

In [23]:
pipe("ጅላንፎ")

[{'label': 'negative', 'score': 0.8327473402023315}]

In [24]:
pipe("እዚህ ሰፈር በማታ መጓዝ እፈራለሁ")

[{'label': 'negative', 'score': 0.8777390122413635}]

In [25]:
pipe("ዩክሬን እና ሩስያ ከባድ ውግያ ላይ ናቸው")

[{'label': 'negative', 'score': 0.8125700354576111}]

**Misclassified Test Examples**

In [26]:
# Indices of the test samples whose predicted class differs from the gold label.
misclassified = [i for i in range(len(y_pred)) if y_pred[i] != y_test[i]]

print(misclassified)

[12, 13, 23, 25, 29, 33, 41, 59, 63, 65, 67, 71, 81, 89, 95, 99, 100, 103, 107, 115, 120, 123, 125, 144, 146, 147, 150, 173, 192, 203, 210, 211, 219, 220, 239, 243, 249, 250, 256, 261]


In [27]:
# Print the gold label and the decoded token ids of each misclassified sample.
for idx in misclassified:
  sample = preprocessed_datasets["test"][idx]
  print(sample["label"], tokenizer.decode(sample["input_ids"]))

1 <s> እንዲህ አይነት ርዕስ ጽፎ ለበዐል ይጠብቁን ድንግርግር ይላል ግን አሻም በበዓልም አልቀልድም ካለ ምን ይደረግ? 😃 ስሜነህ አያሌውና ሱራፌል ሲጠይቁ እንኳን ብንሰማ ይሻለናል በውነት።</s>
0 <s> ከእንደ አንተ አይነቱ ደደብ መሀይም ጭራቅ የምንላቀቅበት ዘመን ይሁን!!! ሁሌም አብየየየየ</s>
0 <s> ሰሊ እውነት አንቺ አማኝ ነሽ ከዚህ ሂጂ😄</s>
0 <s> ልክ ብለሃል። እኛማ ተተብትበናል።</s>
0 <s> ያሰብከው ሁሉ ይሳካ! በእውነትና እውቀትና በተመሰረትርው ራዕይ ፓርቲ ኢትዮጵያና ህዝቦችዋ ታፍረውና ተከውረው ይኖራሉ።</s>
0 <s> አጋንንቶች ነፍሴን ይመገባሉ። መሪዬን አገለግላለሁ ፡፡ እኔ ነፍሳትን አመጣለሁ</s>
0 <s> እውነት እንነጋገር ከተባለ ሹመት ከምንለው ሹፈት ብንልው አይሻልም? አንድ በወታደራዊ ሳይንስ ከፍተኛ ጀነራልን እንዲሁ ከመጣል ካልሆነ በክብር በጡረታ ማግለል አይሻልም ? ጥፋተኛ አይደሉም ከተባለ ደሞዛቸውን ከፍሎ ይቅርታ ጠይቆ መመለስ ነበር💚💛❤️</s>
0 <s> ከሀገሬው ይልቅ ለቻይናዉያን ምቹ የሆነች ሀገር 🙃...</s>
0 <s> ተዋ😄 እኔ ለሃገሪቷ ስለትምህት ፣ ሰላም፣ ጤና ፣ መሰረተ ልማት ፣ ውጭ ጉዳይ ወዘተ... ይሄን ፖሊሲ ይዤ መጥቻለሁ የሚል የፓርቲዎች ወግ ነው የናፈቀኝ። ቢያንስ እንደማህበረሰብ የፖለቲካ ንቃታችንን ይጨምርልናል። ባንስማማም ውይይት መልመድ አለብን።</s>
1 <s> ኦሮምያ የደጋገዎች አገር ናት እንጂ አጋች ታጋች ዲራማ የሚካሄድ በት ክልል አይደለም</s>
1 <s> 🤣🤣🤣🤣 እሺ ፌሪዬ ሾሚ ለመቀጣት ዝግጁ ነኝ</s>
0 <s> ነገር ፍለጋ በቃ አሪፍ ሚያስብል ምስሎሽ ነው አይደል መስፍኔ</s>
1 <s> ለመልካም ዕድልሲል በአውሮፕላን ሞተር ውስጥ ሳንቲም የወረወረው ግለሰብ ተቀጣ! ሉ የተሳ