### **Download Sentiment Dataset**

In [1]:
! pip install -Uq datasets evaluate accelerate

In [2]:
! pip uninstall -y wandb

[0m

In [3]:
from datasets import load_dataset

# Fetch the Amharic sentiment dataset from the Hugging Face Hub and display
# its splits.
sentiment_dataset = load_dataset(path="rasyosef/amharic-sentiment")
sentiment_dataset

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


DatasetDict({
    train: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 2223
    })
    dev: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 279
    })
    test: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 279
    })
})

In [4]:
sentiment_dataset["train"].features

{'clean_tweet': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None)}

In [5]:
# Class names in label-id order, read from the ClassLabel feature of the train split.
label_names = sentiment_dataset["train"].features["label"].names
label_names

['negative', 'positive']

### **Tokenize Tweets**

In [6]:
from transformers import AutoTokenizer

# Hub checkpoint id; the same id is reused when the classification model is loaded.
model_id = "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Sanity check: show how an Amharic sentence is split into subword pieces.
print(tokenizer.tokenize("ከሀገራቸው ከኢትዮጵያ ከወጡ ግማሽ ምዕተ <mask> ተቆጥሯል።"))

['▁ከ', 'ሀ', 'ገ', 'ራቸው', '▁ከኢትዮጵያ', '▁ከ', 'ወጡ', '▁', 'ግማሽ', '▁', 'ምዕ', 'ተ', ' <mask>', '▁ተ', 'ቆ', 'ጥ', 'ሯል።']


In [7]:
def tokenize_dataset(samples):
  """Tokenize the ``clean_tweet`` texts of a batch.

  Args:
    samples: Mapping with a ``"clean_tweet"`` entry holding the texts to encode.

  Returns:
    The tokenizer output for those texts, truncated to at most 512 tokens.
  """
  return tokenizer(
      samples["clean_tweet"],
      max_length=512,
      truncation=True,
  )

tokenize_dataset({"clean_tweet":["ከሀገራቸው ከኢትዮጵያ ከወጡ ግማሽ"]})

{'input_ids': [[0, 1464, 21608, 5430, 66052, 165627, 1464, 87365, 6, 230446, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [8]:
# Tokenize every split in batches and drop the raw text column, leaving
# `label`, `input_ids` and `attention_mask`.
preprocessed_datasets = sentiment_dataset.map(
    function=tokenize_dataset,
    remove_columns=["clean_tweet"],
    batched=True,
)
preprocessed_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2223
    })
    dev: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 279
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 279
    })
})

### **Load AfroXLMR (XLM-RoBERTa) Model**

In [9]:
# Two-way mapping between integer class ids and their label names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

print(id2label, label2id)

{0: 'negative', 1: 'positive'} {'negative': 0, 'positive': 1}


In [10]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head sized to
# the dataset's labels. The id/label maps are stored in the model config so
# that predictions can be reported by label name.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
    # device_map="cuda"  # left disabled: no explicit device placement at load time
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Finetuning**

In [11]:
from transformers import DataCollatorWithPadding

# The tweets were tokenized without padding, so padding is applied per batch
# by the collator, which returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [12]:
from transformers import TrainingArguments

batch_size = 32
epochs = 3

# Fine-tuning hyperparameters. Evaluation, checkpointing and logging all run
# once per epoch so that the three line up in the training table.
training_args = TrainingArguments(
    output_dir=model_id+"-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.1,
    # `evaluation_strategy` is the deprecated name of this argument.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # To keep the best checkpoint instead of the last one, add
    # load_best_model_at_end=True and metric_for_best_model="accuracy".
    fp16=True,  # mixed-precision training; needs a CUDA GPU
    seed=16,
)



In [13]:
import evaluate
import numpy as np

def compute_metrics(eval_preds):
  """Compute classification accuracy from evaluation predictions.

  Args:
    eval_preds: A ``(logits, labels)`` pair. ``logits`` holds per-class
      scores with the class axis last; ``labels`` holds the reference
      class ids.

  Returns:
    dict: ``{"accuracy": value}`` where ``value`` is a Python float giving
    the fraction of examples whose arg-max class equals the reference label.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  # Accuracy is computed directly with NumPy: the previous version loaded
  # four metric scripts on every evaluation call and used only one of them.
  accuracy = float(np.mean(predictions == np.asarray(labels)))

  return {"accuracy": accuracy}

compute_metrics((np.array([[1,0], [0,1]]), np.array([0,1])))

{'accuracy': 1.0}

In [14]:
from transformers import Trainer

# Train on the `train` split and report metrics on the `dev` split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_datasets["train"],
    eval_dataset=preprocessed_datasets["dev"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
# Baseline: dev-split metrics before any fine-tuning has been run.
trainer.evaluate()

{'eval_loss': 0.6882262825965881,
 'eval_model_preparation_time': 0.0038,
 'eval_accuracy': 0.5340501792114696,
 'eval_runtime': 6.1755,
 'eval_samples_per_second': 45.179,
 'eval_steps_per_second': 1.457}

In [16]:
# Fine-tune the model; the dev split is evaluated and a checkpoint is saved after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.5511,0.353497,0.0038,0.878136
2,0.3873,0.361035,0.0038,0.860215
3,0.3319,0.364352,0.0038,0.856631


TrainOutput(global_step=210, training_loss=0.4234596161615281, metrics={'train_runtime': 195.4515, 'train_samples_per_second': 34.121, 'train_steps_per_second': 1.074, 'total_flos': 289544980314420.0, 'train_loss': 0.4234596161615281, 'epoch': 3.0})

In [17]:
# Dev-split metrics of the model as it stands after the final training epoch.
trainer.evaluate()

{'eval_loss': 0.3643520474433899,
 'eval_model_preparation_time': 0.0038,
 'eval_accuracy': 0.8566308243727598,
 'eval_runtime': 5.1705,
 'eval_samples_per_second': 53.96,
 'eval_steps_per_second': 1.741,
 'epoch': 3.0}

### **Evaluate Model**

In [18]:
import torch

test_batch_size = 5

# Pad the whole test split into one set of tensors, then run the model over it
# in small slices to keep memory use low.
num_samples = len(preprocessed_datasets["test"])
inputs = data_collator([preprocessed_datasets["test"][i] for i in range(num_samples)])

y_pred = []

model.eval()  # make sure dropout is off regardless of which cell ran last
with torch.no_grad():  # inference only: do not build the autograd graph
  for start in range(0, num_samples, test_batch_size):
    # Move each slice to whatever device the model currently lives on.
    batch = {k: v[start:start + test_batch_size].to(model.device) for k, v in inputs.items()}
    output = model(**batch)
    y_pred.extend(output.logits.argmax(dim=-1).tolist())

y_test = np.array(preprocessed_datasets["test"]["label"])

print(y_pred)
print(y_test)

[1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0]
[1 1 0 1 0 0 1 1 0 0 1 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1
 0 1 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 1 1 1 0 0 1 0
 0 1 0 1 1 0

In [19]:
from sklearn import metrics

# Rows are the true labels and columns the predicted labels, in label-id order.
metrics.confusion_matrix(y_test, y_pred)

array([[126,  24],
       [ 24, 105]])

In [20]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84       150
           1       0.81      0.81      0.81       129

    accuracy                           0.83       279
   macro avg       0.83      0.83      0.83       279
weighted avg       0.83      0.83      0.83       279



### **Test in Pipeline**

In [21]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a text-classification
# pipeline so raw Amharic text can be classified directly.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
pipe("ቆንጆ ፊልም ነው")

Device set to use cuda:0


[{'label': 'positive', 'score': 0.9071478247642517}]

In [22]:
pipe("አሪፍ ልብስ")

[{'label': 'positive', 'score': 0.8300437331199646}]

In [23]:
pipe("ጅላንፎ")

[{'label': 'negative', 'score': 0.889720618724823}]

In [24]:
pipe("እዚህ ሰፈር በማታ መጓዝ እፈራለሁ")

[{'label': 'positive', 'score': 0.7218903303146362}]

In [25]:
pipe("ዩክሬን እና ሩስያ ከባድ ውግያ ላይ ናቸው")

[{'label': 'negative', 'score': 0.6785932183265686}]

**Misclassified test examples**

In [26]:
# Indices of test examples whose predicted class differs from the reference label.
misclassified = [idx for idx in range(len(y_pred)) if y_pred[idx] != y_test[idx]]

print(misclassified)

[8, 12, 13, 24, 29, 33, 35, 36, 49, 53, 54, 59, 63, 65, 67, 72, 74, 81, 89, 93, 99, 100, 102, 103, 107, 115, 120, 125, 137, 144, 147, 164, 173, 186, 197, 201, 211, 212, 219, 220, 239, 243, 249, 250, 253, 254, 258, 270]


In [27]:
# Print each misclassified test example as "<true label id> <decoded text>".
# The text is decoded from the token ids, so it includes the special tokens.
test_split = preprocessed_datasets["test"]
for idx in misclassified:
  example = test_split[idx]
  print(example["label"], tokenizer.decode(example["input_ids"]))

0 <s> እንደሱማ ከሆነ እናንተም ቦታ ላይኖራችሁ ነው</s>
1 <s> እንዲህ አይነት ርዕስ ጽፎ ለበዐል ይጠብቁን ድንግርግር ይላል ግን አሻም በበዓልም አልቀልድም ካለ ምን ይደረግ? 😃 ስሜነህ አያሌውና ሱራፌል ሲጠይቁ እንኳን ብንሰማ ይሻለናል በውነት።</s>
0 <s> ከእንደ አንተ አይነቱ ደደብ መሀይም ጭራቅ የምንላቀቅበት ዘመን ይሁን!!! ሁሌም አብየየየየ</s>
1 <s> ዛሬ እንደ እኔ ምህረት የበዛለት ማን ነው???</s>
0 <s> ያሰብከው ሁሉ ይሳካ! በእውነትና እውቀትና በተመሰረትርው ራዕይ ፓርቲ ኢትዮጵያና ህዝቦችዋ ታፍረውና ተከውረው ይኖራሉ።</s>
0 <s> አጋንንቶች ነፍሴን ይመገባሉ። መሪዬን አገለግላለሁ ፡፡ እኔ ነፍሳትን አመጣለሁ</s>
0 <s> ይገርማል እኔም ለአንድ ወገን እየሰራ እንዳለ ስነግረው ወዲያው ብሎክ አደረገኝ</s>
1 <s> እንዲህ አይነቱን የተባረከና የተቀደሰውን ተግባር እየሰራህ በሸርና በተንኮል ላይ የተሰማሩትን አንጀታቸውን አድብናቸው:: መቼም ይህን ባዩ ግዜ ደም እንባ እንደሚያነቡ እርግጠኛ ነኝ:: የእነዚያ አምላክ ይጠብቅህ🙏🏿</s>
0 <s> በመካከላችን ማንም የለም ፣ በገዛ እጄ ከገደለው ፍጥረት ሁሉ ጋር እኖራለሁ ምንም እንኳን ከእንግዲህ ከእኛ ጋር ባይሆንም</s>
0 <s> ታሪካችን ፡ እንደገና ፡ ተበዉዞ ፡ መስራት ፡ አለበት ፡ እስካሁን ፡ ያለው ፡ የኣንድ ፡ ወገን ፡ ታሪክ ፡ ነው።</s>
0 <s> : በተስፋ የምትጠብቅ እናት በሠላም የላከችው ልጇ ሬሳው በሳጥን ተጭኖ ሲመጣላት የሚሰማትን የሀዘን ስሜት ከባድ ነው ስለዚህ እማራለሁ ብለሁ ብለው ሄደው እሬሳቸው ለቤተ...</s>
0 <s> ከሀገሬው ይልቅ ለቻይናዉያን ምቹ የሆነች ሀገር 🙃...</s>
0 <s> ተዋ😄 እኔ ለሃገሪቷ ስለትምህት ፣ ሰላም፣ ጤና ፣ መሰረ