### **Setup**

In [1]:
! pip install --quiet datasets evaluate accelerate

In [2]:
! pip uninstall -y wandb

[0m

### **Load Dataset**

In [3]:
from datasets import load_dataset

# Hub identifier of the Amharic NER corpus; only its "train" split is loaded.
DATASET_ID = "rasyosef/amharic-named-entity-recognition"

ner_dataset = load_dataset(DATASET_ID, split="train")
ner_dataset

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 3465
})

In [4]:
# Inspect the schema of the tag column to see which label names it encodes.
ner_dataset.features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-TIME', 'I-TIME', 'B-TTL', 'I-TTL'], id=None), length=-1, id=None)

In [5]:
# Tag names in id order, read from the dataset's ClassLabel feature.
categories = ner_dataset.features["ner_tags"].feature.names

# Lookup tables in both directions: tag name -> integer id and integer id -> tag name.
label2id = dict(zip(categories, range(len(categories))))
id2label = {idx: name for name, idx in label2id.items()}

print(id2label)
print(label2id)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-TIME', 8: 'I-TIME', 9: 'B-TTL', 10: 'I-TTL'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-TIME': 7, 'I-TIME': 8, 'B-TTL': 9, 'I-TTL': 10}


### **Processing the data**

In [6]:
# NOTE(review): `userdata` is not referenced in the cells shown here — confirm it is still needed.
from google.colab import userdata
from transformers import AutoTokenizer

# Checkpoint name; the same id is reused later when the model is loaded.
model_id = "Davlan/afro-xlmr-large"
tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)

# Quick look at how a raw Amharic sentence is split into subword pieces.
sample_sentence = "ከሀገራቸው ከኢትዮጵያ ከወጡ ግማሽ ምዕተ <mask> ተቆጥሯል።"
print(tokenizer.tokenize(sample_sentence))

['▁ከ', 'ሀ', 'ገ', 'ራቸው', '▁ከኢትዮጵያ', '▁ከ', 'ወጡ', '▁', 'ግማሽ', '▁', 'ምዕ', 'ተ', ' <mask>', '▁ተ', 'ቆ', 'ጥ', 'ሯል።']


In [7]:
# Check that a fast tokenizer was loaded; later cells call word_ids() on its output.
tokenizer.is_fast

True

In [8]:
# The dataset rows are already split into words, so the word list is passed
# with is_split_into_words=True instead of a raw string.
first_words = ner_dataset[0]["tokens"]
inputs = tokenizer(first_words, is_split_into_words=True)
print(inputs.tokens())

['<s>', '▁ኢ', 'ዴ', 'ፓ', '▁በየ', 'ክል', 'ሉ', '▁በሚ', 'ንቀሳቀስ', 'በት', '▁ጊዜ', '▁ሁሉ', '▁የ', 'ሀገሪቱ', 'ን', '▁አ', 'ጠቃ', 'ላይ', '▁ሕግ', 'እን', 'ዲ', 'ሁ', 'ም', '▁የ', 'አካባቢ', 'ውን', '▁ባህል', 'ና', '▁ቋንቋ', '▁አክ', 'ብሮ', '▁በ', 'አካባቢ', 'ው', '▁የሚገኙ', '▁የፖለቲካ', '▁ድርጅቶች', 'ንም', '▁አክ', 'ብሮ', 'ና', '▁መብ', 'ታቸውን', '▁', 'ጠብ', 'ቆ', '▁በ', 'ጨ', 'ዋ', 'ነት', '▁ያስተ', 'ም', 'ራል', '▁፣', '▁ይ', 'ማ', 'ራል', '"', '▁ብ', 'ለ', 'ዋል', '▁።', '</s>']


In [9]:
# For every subword token, show the index of the source word it came from
# (None marks the special tokens at the start and end of the sequence).
print(inputs.word_ids())

[None, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 9, 9, 10, 11, 11, 12, 12, 12, 13, 14, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 21, 22, 22, 22, 22, 23, 23, 23, 24, None]


In [10]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level tag ids to one tag id per subword token.

  Args:
    labels: tag ids, one per word of the original sentence.
    word_ids: for each token, the index of the word it belongs to, or None
      for a special token.

  Returns:
    A list as long as `word_ids`. Special tokens get -100, the first token of
    a word gets the word's tag, and the following tokens of the same word get
    the word's tag with a leading "B-" turned into the matching "I-" tag.
  """
  aligned = []
  previous_word = None
  for word_id in word_ids:
    if word_id is None:
      # special token: ignored by the loss, and it ends the current word
      previous_word = None
      aligned.append(-100)
      continue

    tag = labels[word_id]
    continues_word = word_id == previous_word
    previous_word = word_id
    # A token that continues the previous word must not start a new entity,
    # so a B-XXX tag is replaced by I-XXX.
    if continues_word and id2label[tag].startswith("B-"):
      tag = label2id["I-" + id2label[tag][2:]]
    aligned.append(tag)
  return aligned


In [11]:
# Try the alignment on a single example: show the subword tokens, the
# word-level tags and the token-level tags derived from them.
example = ner_dataset[2]
labels = example["ner_tags"]
inputs = tokenizer(example["tokens"], is_split_into_words=True)
word_ids = inputs.word_ids()

print(inputs.tokens())
print(labels)
print(align_labels_with_tokens(labels, word_ids))

['<s>', '▁በ', 'ባህ', 'ር', '▁', 'ዳር', '▁ዩኒቨርስቲ', 'ና', '▁በ', 'ጅ', 'ማ', '▁', 'መም', 'ህ', 'ራን', '▁ኮ', 'ሌ', 'ጅ', '▁ለ', 'ነ', 'ባር', '▁ተማሪዎች', '▁የምግብ', 'ና', '▁የመ', 'ኝ', 'ታ', '▁አገልግሎት', '▁ሊያ', 'ቆም', '▁እንደሚችል', '▁ባለፈው', '▁ዓመት', '▁የተሰጠ', 'ው', '▁ማሳ', 'ሰ', 'ቢያ', '▁ከዚህ', '▁ዓመት', '▁ጀምሮ', '▁ተግባራዊ', '▁ይሆናል', '▁በመ', 'ባሉ', '▁ተማሪዎች', '▁ከፍተኛ', '▁ስጋት', 'ና', '▁ጭ', 'ን', 'ቀት', '▁ላይ', '▁', 'መው', 'ደ', 'ቃቸው', 'ን', '▁ተጠ', 'ቆመ', '▁።', '</s>']
[3, 4, 4, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 3, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [12]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Args:
    examples: a batch with a 'tokens' entry (one word list per sentence) and
      an 'ner_tags' entry (one tag-id list per sentence).

  Returns:
    The tokenizer output for the batch with an added 'labels' entry holding,
    for each sentence, the result of align_labels_with_tokens.
  """
  encoded = tokenizer(
      examples['tokens'], truncation=True, is_split_into_words=True
  )
  # word_ids(i) gives the token -> word mapping of the i-th sentence in the batch
  encoded['labels'] = [
      align_labels_with_tokens(word_tags, encoded.word_ids(row))
      for row, word_tags in enumerate(examples['ner_tags'])
  ]
  return encoded

In [13]:
# Tokenize the whole dataset in batches; the original columns are dropped so
# that only the tokenizer outputs and the aligned labels remain.
raw_columns = ner_dataset.column_names
tokenized_datasets = ner_dataset.map(
    tokenize_and_align_labels, batched=True, remove_columns=raw_columns
)
tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3465
})

In [14]:
# Hold out 20% of the examples as a "test" split; the fixed seed keeps the split reproducible.
preprocessed_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=16)
preprocessed_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2772
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 693
    })
})

### **Finetuning**

In [15]:
# Data collator: batches examples for token classification. The demo in the
# next cell shows the labels of the shorter example being padded with -100.
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
# Collate the first two training examples and look at the resulting label tensor.
train_split = preprocessed_datasets["train"]
samples = [train_split[i] for i in range(2)]
batch = data_collator(samples)
batch["labels"]

tensor([[-100,    5,    6,    6,    6,    6,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

### **Metrics**

In [17]:
!pip install --quiet seqeval

In [18]:
import evaluate

# Load the "seqeval" metric; it is used below to score predicted tag sequences
# against reference tag sequences.
metric = evaluate.load("seqeval")

In [19]:
# Token-level label ids of one training example (ignored positions carry -100).
label_ids = preprocessed_datasets["train"][3]["labels"]
print(label_ids)

# Drop the ignored positions wherever they occur (the same `!= -100` filter that
# compute_metrics uses) instead of assuming they are only the first and last
# token, then map the remaining ids to tag names.
labels = [categories[i] for i in label_ids if i != -100]
print(labels)

# Show a single tag as the cell result. The previous `print(labels), labels[19]`
# built an accidental tuple and displayed `(None, 'O')`.
labels[19]

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


(None, 'O')

In [20]:
# Build a "prediction" by copying the reference tags and overwriting one position,
# then score it against the reference.
# NOTE(review): the output of the previous cell shows that the tag at index 19 is
# already "O" and that this example contains only "O" tags, so the prediction is
# identical to the reference — confirm this is the intended demonstration.
predictions = list(labels)
predictions[19] = "O"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 1.0}

In [21]:
import numpy as np

def compute_metrics(eval_preds):
    """Score token-classification predictions with the seqeval metric.

    Args:
        eval_preds: a pair (logits, labels). The predicted class of each token
            is the argmax of `logits` over the last axis; `labels` holds the
            reference ids, with -100 marking positions to ignore.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the metric's "overall_*" entries.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, skipping the ignored positions (-100).
    true_labels = []
    for label_row in labels:
        true_labels.append([categories[tag] for tag in label_row if tag != -100])

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = []
        for pred, tag in zip(pred_row, label_row):
            if tag != -100:
                kept.append(categories[pred])
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

### **Defining the Model**

In [22]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint with a token-classification head. The label maps are passed
# in so the model config carries the tag names; the warning printed below reports
# that the classifier weights are newly initialized.
model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Number of labels in the model config; it should equal the number of tags in id2label (11).
model.config.num_labels

11

### **Fine-tuning the model**

In [24]:
from transformers import TrainingArguments

# Hyperparameters and bookkeeping options that are handed to the Trainer below.
# NOTE(review): the first argument reads "xlm-roberta-base-finetuned-ner" although
# model_id is "Davlan/afro-xlmr-large" — confirm the name is intentional.
args = TrainingArguments(
    "xlm-roberta-base-finetuned-ner",
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    learning_rate=4e-5,
    lr_scheduler_type="linear",
    num_train_epochs=8,
    weight_decay=0.01,
    # evaluate, save and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # select the best checkpoint by the "f1" value returned from compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [25]:
from transformers import Trainer

# Wire the model, training arguments, data splits, collator and metric function
# together. The held-out "test" split serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=preprocessed_datasets["train"],
    eval_dataset=preprocessed_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers the warning shown
    # for this cell); the tokenizer is passed as `processing_class` instead.
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.247,0.119639,0.612648,0.695847,0.651603,0.964072
2,0.1052,0.103572,0.658111,0.719416,0.687399,0.967726
3,0.072,0.116639,0.662289,0.792368,0.721513,0.965605
4,0.0463,0.126028,0.698,0.783389,0.738234,0.968263
5,0.0297,0.136629,0.707661,0.787879,0.745619,0.968902
6,0.0184,0.142083,0.704457,0.815937,0.75611,0.969183
7,0.0106,0.161811,0.722782,0.804714,0.761551,0.970282
8,0.0068,0.166965,0.730533,0.800224,0.763792,0.970307


TrainOutput(global_step=1848, training_loss=0.06699519756036404, metrics={'train_runtime': 2963.2361, 'train_samples_per_second': 7.484, 'train_steps_per_second': 0.624, 'total_flos': 4849543903169448.0, 'train_loss': 0.06699519756036404, 'epoch': 8.0})

In [26]:
# Evaluate the trained model on the eval dataset given to the Trainer (the "test" split).
trainer.evaluate()

{'eval_loss': 0.16696497797966003,
 'eval_precision': 0.7305327868852459,
 'eval_recall': 0.8002244668911336,
 'eval_f1': 0.763792179967863,
 'eval_accuracy': 0.9703071497930189,
 'eval_runtime': 17.3374,
 'eval_samples_per_second': 39.971,
 'eval_steps_per_second': 3.345,
 'epoch': 8.0}

### **Testing**

In [27]:
from transformers import pipeline

# Inference pipeline around the fine-tuned model; aggregation_strategy="simple"
# makes it return grouped entities (see the 'entity_group' keys in the outputs below).
ner_pipe = pipeline(
    task="token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

Device set to use cuda:0


In [28]:
# Rebuild the sentence of one dataset example, print it together with its
# reference tag ids, then show the entities the pipeline finds in it.
ind = 3
example = ner_dataset[ind]
text = " ".join(example["tokens"])
print(text)
print(" ".join(str(tag) for tag in example["ner_tags"]))
ner_pipe(text)

የሻዕቢያው መሪ ለዚህ ማስፈራሪያቸው እንደ አብነት የተጠቀሙበት ኦጋዴንን ሲሆን የኢሕአዴግ መንግሥት ከቅኝ ግዛት ውሎቹ ላፈንግጥ ቢል ያንን ግዛት ለዘለዓለሙ ሊያጣው እንደሚችል ግልጽ ሊሆንለት ይገባል ሲሉ አስጠንቅቀዋል ።
3 0 0 0 0 0 0 5 0 3 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


[{'entity_group': 'ORG',
  'score': 0.9989479,
  'word': 'የሻዕቢያው',
  'start': 0,
  'end': 6},
 {'entity_group': 'LOC',
  'score': 0.9986769,
  'word': 'ኦጋዴንን',
  'start': 40,
  'end': 45},
 {'entity_group': 'ORG',
  'score': 0.99818915,
  'word': 'የኢሕአዴግ መንግሥት',
  'start': 50,
  'end': 62}]

In [29]:
# Rebuild the sentence of one dataset example, print it together with its
# reference tag ids, then show the entities the pipeline finds in it.
ind = 12
example = ner_dataset[ind]
text = " ".join(example["tokens"])
print(text)
print(" ".join(str(tag) for tag in example["ner_tags"]))
ner_pipe(text)

በናዝሬት አጠቃላይ ዕድሮች የመሰብሰቢያ አዳራሽ በተካሄደው በዚሁ ስብሰባ ላይ በአዳራሽ ውስጥ በመቀመጥና በመቆም ፣ ከውጭም በመስኮት ቁጥሩ እስከ 1400 የሆነ ሕዝብ መገኘቱን የገለጡት አቶ ልደቱ መታፈስ ሊኖር ይችላል" የሚል ማስፈራሪያ በከተማው ተሠራጭቶ የነበረ ቢሆንም ቁጥራቸው የበዛ ወጣቶች ጐልማሶችና አዛውንቶች በስብሰባው ላይ መካፈላቸውን ፣ ብዙውም በአዳራሽ ጥበት ምክንያትም መመለሱን አመልክተዋል ።
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


[{'entity_group': 'TTL',
  'score': 0.9998479,
  'word': 'አቶ',
  'start': 117,
  'end': 119},
 {'entity_group': 'PER',
  'score': 0.9998122,
  'word': 'ልደቱ',
  'start': 120,
  'end': 123}]