### **Setup**

In [1]:
! pip install --quiet datasets evaluate accelerate

In [2]:
! pip uninstall -y wandb

[0m

### **Load Dataset**

In [3]:
from datasets import load_dataset

# Request only the "train" split so `ner_dataset` is a single Dataset
# (with `tokens` and `ner_tags` columns) rather than a dict of splits.
ner_dataset = load_dataset("rasyosef/amharic-named-entity-recognition", split="train")
ner_dataset

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 3465
})

In [4]:
# Inspect the tag schema: `ner_tags` is a sequence of ClassLabel ids.
ner_dataset.features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-TIME', 'I-TIME', 'B-TTL', 'I-TTL'], id=None), length=-1, id=None)

In [5]:
# Tag names in ClassLabel order; a tag's position in this list is its integer id.
categories = ner_dataset.features["ner_tags"].feature.names

# Forward lookup: tag name -> integer id.
label2id = dict(zip(categories, range(len(categories))))

# Inverse lookup: integer id -> tag name.
id2label = dict(zip(label2id.values(), label2id))

print(id2label)
print(label2id)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-TIME', 8: 'I-TIME', 9: 'B-TTL', 10: 'I-TTL'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-TIME': 7, 'I-TIME': 8, 'B-TTL': 9, 'I-TTL': 10}


### **Processing the data**

In [6]:
from transformers import AutoTokenizer

# Checkpoint id reused below for both the tokenizer and the token-classification model.
model_id = "rasyosef/roberta-base-amharic"
# NOTE(review): add_prefix_space=True is presumably required because the inputs are
# passed pre-split into words (is_split_into_words=True below) — confirm for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)

# Quick look at how a raw sentence is split into subword pieces.
print(tokenizer.tokenize("ከሀገራቸው ከኢትዮጵያ ከወጡ ግማሽ ምዕተ <mask> ተቆጥሯል።"))

['▁ከ', 'ሀገራቸው', '▁ከኢትዮጵያ', '▁ከወጡ', '▁ግማሽ', '▁ምዕተ', ' <mask>', '▁ተቆጥ', 'ሯል።']


In [7]:
# The label alignment below relies on `word_ids()`, which is provided by fast tokenizers.
tokenizer.is_fast

True

In [8]:
# The dataset is already split into words, so pass the word list with
# is_split_into_words=True instead of a single raw string.
inputs = tokenizer(ner_dataset[0]["tokens"], is_split_into_words=True)
# Show the resulting subword tokens, including the special tokens at both ends.
print(inputs.tokens())

['<s>', '▁ኢዴፓ', '▁በየክልሉ', '▁በሚ', 'ንቀሳቀስ', 'በት', '▁ጊዜ', '▁ሁሉ', '▁የሀገሪቱን', '▁አጠቃላይ', '▁ሕግ', 'እንዲሁም', '▁የአካባቢውን', '▁ባህልና', '▁ቋንቋ', '▁አክብሮ', '▁በአካባቢው', '▁የሚገኙ', '▁የፖለቲካ', '▁ድርጅቶችን', 'ም', '▁አክብሮ', 'ና', '▁መብታቸውን', '▁ጠብቆ', '▁በ', 'ጨዋ', 'ነት', '▁ያስተምራል', '▁', '፣', '▁ይማራል', '"', '▁ብለዋል', '▁', '።', '</s>']


In [9]:
# For each token, the index of the source word it came from (None for special tokens).
print(inputs.word_ids())

[None, 0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 16, 16, 17, 18, 19, 19, 19, 20, 21, 21, 22, 22, 23, 24, 24, None]


In [10]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level tag ids to one tag id per token.

  Args:
    labels: tag ids, one per word.
    word_ids: for each token, the index of the word it belongs to, or None
      for a special token.

  Returns:
    A list with one entry per token: -100 for special tokens, the word's own
    tag for the first token of a word, and for every following token of the
    same word the matching I-XXX tag when the word's tag is B-XXX.

  Relies on the notebook-level `id2label` / `label2id` maps.
  """
  aligned = []
  previous_word = None
  for word_id in word_ids:
    if word_id is None:
      # special token: ignored by the loss
      aligned.append(-100)
      previous_word = None
      continue

    tag = labels[word_id]
    # Same word as the previous token: a B-XXX tag continues as I-XXX
    if word_id == previous_word and id2label[tag].startswith("B-"):
      tag = label2id["I-" + id2label[tag][2:]]
    aligned.append(tag)
    previous_word = word_id
  return aligned


In [11]:
# Sanity-check the alignment on one example: print the subword tokens, the
# original word-level tags, and the aligned token-level tags.
labels = ner_dataset[2]["ner_tags"]
inputs = tokenizer(ner_dataset[2]["tokens"], is_split_into_words=True)
word_ids = inputs.word_ids()
print(inputs.tokens())
print(labels)
# The aligned list has one entry per token, with -100 on the special tokens.
print(align_labels_with_tokens(labels, word_ids))

['<s>', '▁በባህር', '▁ዳር', '▁ዩኒቨርስቲ', 'ና', '▁በጅማ', '▁መምህራን', '▁ኮሌጅ', '▁ለነ', 'ባር', '▁ተማሪዎች', '▁የምግብ', 'ና', '▁የመኝታ', '▁አገልግሎት', '▁ሊያ', 'ቆም', '▁እንደሚችል', '▁ባለፈው', '▁ዓመት', '▁የተሰጠው', '▁ማሳሰቢያ', '▁ከዚህ', '▁ዓመት', '▁ጀምሮ', '▁ተግባራዊ', '▁ይሆናል', '▁በመ', 'ባሉ', '▁ተማሪዎች', '▁ከፍተኛ', '▁ስጋት', 'ና', '▁ጭንቀት', '▁ላይ', '▁', 'መውደቃቸው', 'ን', '▁ተጠቆመ', '▁', '።', '</s>']
[3, 4, 4, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 3, 4, 4, 4, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [12]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Args:
    examples: a batch with a 'tokens' entry (list of word lists) and a
      'ner_tags' entry (list of word-level tag id lists).

  Returns:
    The tokenizer output for the batch with an added 'labels' entry holding,
    per sentence, the tags aligned to tokens by `align_labels_with_tokens`.
  """
  tokenized_inputs = tokenizer(
      examples['tokens'], truncation=True, is_split_into_words=True
  )
  # word_ids(idx) gives the token -> word mapping for sentence `idx` of the batch.
  tokenized_inputs['labels'] = [
      align_labels_with_tokens(word_tags, tokenized_inputs.word_ids(idx))
      for idx, word_tags in enumerate(examples['ner_tags'])
  ]
  return tokenized_inputs

In [13]:
# Tokenize the whole dataset in batches. The original columns are removed so only
# the tokenizer outputs and the aligned `labels` remain.
tokenized_datasets = ner_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=ner_dataset.column_names,
)
tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3465
})

In [14]:
# Hold out 20% of the examples as the "test" split; the seed is fixed for the split.
preprocessed_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=16)
preprocessed_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2772
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 693
    })
})

### **Data Collator**

In [15]:
# Data Collator
from transformers import DataCollatorForTokenClassification

# Batches examples of different lengths; padded label positions are filled with
# -100, as the sample batch printed below shows.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
# Collate the first two training examples to inspect how the labels are padded.
batch = data_collator([preprocessed_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    5,    6,    6,    6,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

### **Metrics**

In [17]:
!pip install --quiet seqeval

In [18]:
import evaluate

# Load the seqeval metric through the `evaluate` library; used by compute_metrics below.
metric = evaluate.load("seqeval")

In [19]:
# Take one tokenized training example; its label ids carry -100 on the special tokens.
labels = preprocessed_datasets["train"][3]["labels"]
print(labels)
# Drop the first and last positions (the -100 entries) and map ids to tag names.
labels = [categories[i] for i in labels[1:-1]]
print(labels)
# Display the tag at position 19 on its own line; writing `print(labels), labels[19]`
# builds a tuple whose first element is print's None return value.
labels[19]

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


(None, 'O')

In [20]:
# Smoke-test the metric on a single sequence.
predictions = labels.copy()
# NOTE(review): labels[19] is already "O" in the output above, so this assignment
# changes nothing and predictions equal the references. The reference also contains
# no entity tags, which is likely why the entity-level scores below are 0.0 while
# accuracy is 1.0. An example that contains entities would make this check informative.
predictions[19] = "O"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 1.0}

In [21]:
import numpy as np

def compute_metrics(eval_preds):
    """Turn raw evaluation predictions into overall seqeval scores.

    `eval_preds` unpacks into (logits, label ids). The predicted id at each
    position is the argmax over the last axis of the logits. Positions whose
    label id is -100 are dropped from both references and predictions, and the
    remaining ids are mapped to tag names through `categories`.

    Returns a dict with keys "precision", "recall", "f1" and "accuracy",
    taken from the metric's "overall_*" entries.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # References: tag names for every position that is not ignored (-100).
    true_labels = []
    for label_row in labels:
        true_labels.append([categories[l] for l in label_row if l != -100])

    # Predictions: keep a predicted id only where the matching label is kept.
    true_predictions = []
    for prediction_row, label_row in zip(predictions, labels):
        kept = [categories[p] for (p, l) in zip(prediction_row, label_row) if l != -100]
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
        accuracy=scores["overall_accuracy"],
    )

### **Defining the Model**

In [22]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head. The label maps
# are stored in the model config; the classifier weights are newly initialized
# (see the warning below) and are learned during fine-tuning.
model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at rasyosef/roberta-base-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Number of labels: should match the number of tags in `categories` (11).
model.config.num_labels

11

### **Fine-tuning the model**

In [24]:
from transformers import TrainingArguments

# Training configuration: evaluate, log and checkpoint once per epoch, then
# reload the checkpoint with the best eval F1 when training finishes.
args = TrainingArguments(
    "xlm-roberta-base-finetuned-ner",  # output directory for checkpoints
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    num_train_epochs=8,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" key returned by compute_metrics
    seed=42,
    # Disable experiment-tracker integrations explicitly instead of relying on
    # wandb having been uninstalled from the environment.
    report_to="none",
)

In [25]:
from transformers import Trainer

# Fine-tune on the "train" split and evaluate on the "test" split after every epoch.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=preprocessed_datasets["train"],
    eval_dataset=preprocessed_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers the warning seen
    # in the recorded output); `processing_class` is its replacement.
    processing_class=tokenizer,
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2181,0.11002,0.609658,0.680135,0.642971,0.961615
2,0.0732,0.107563,0.713203,0.739618,0.726171,0.966668
3,0.0393,0.139403,0.687234,0.725028,0.705625,0.965989
4,0.0207,0.148391,0.699459,0.72615,0.712555,0.964971
5,0.0119,0.146824,0.741525,0.785634,0.762943,0.968704
6,0.0051,0.171483,0.732252,0.810325,0.769313,0.968855
7,0.003,0.175868,0.74739,0.803591,0.774473,0.969383
8,0.0022,0.183561,0.748697,0.805836,0.776216,0.969383


TrainOutput(global_step=1392, training_loss=0.04669905702273051, metrics={'train_runtime': 625.8569, 'train_samples_per_second': 35.433, 'train_steps_per_second': 2.224, 'total_flos': 983047760691336.0, 'train_loss': 0.04669905702273051, 'epoch': 8.0})

In [26]:
# Score the current model on the eval_dataset given to the Trainer (the "test" split).
trainer.evaluate()

{'eval_loss': 0.1835612654685974,
 'eval_precision': 0.748696558915537,
 'eval_recall': 0.8058361391694725,
 'eval_f1': 0.7762162162162162,
 'eval_accuracy': 0.9693827532898458,
 'eval_runtime': 4.2649,
 'eval_samples_per_second': 162.489,
 'eval_steps_per_second': 10.317,
 'epoch': 8.0}

### **Testing**

In [27]:
from transformers import pipeline

# Wrap the fine-tuned model for inference. With aggregation_strategy="simple" the
# results are reported per entity group (see `entity_group` in the outputs below).
ner_pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cuda:0


In [28]:
# Qualitative check on example 3: print the sentence and its gold tag ids,
# then display the entities found by the pipeline.
ind = 3
text = " ".join(ner_dataset[ind]["tokens"])
print(text)
print(*ner_dataset[ind]["ner_tags"])
ner_pipe(text)

የሻዕቢያው መሪ ለዚህ ማስፈራሪያቸው እንደ አብነት የተጠቀሙበት ኦጋዴንን ሲሆን የኢሕአዴግ መንግሥት ከቅኝ ግዛት ውሎቹ ላፈንግጥ ቢል ያንን ግዛት ለዘለዓለሙ ሊያጣው እንደሚችል ግልጽ ሊሆንለት ይገባል ሲሉ አስጠንቅቀዋል ።
3 0 0 0 0 0 0 5 0 3 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


[{'entity_group': 'ORG',
  'score': 0.99936485,
  'word': 'የሻዕቢያው',
  'start': 0,
  'end': 6},
 {'entity_group': 'LOC',
  'score': 0.9986171,
  'word': 'ኦጋዴንን',
  'start': 40,
  'end': 45},
 {'entity_group': 'ORG',
  'score': 0.99921405,
  'word': 'የኢሕአዴግ መንግሥት',
  'start': 50,
  'end': 62}]

In [29]:
# Qualitative check on example 12: print the sentence and its gold tag ids,
# then display the entities found by the pipeline.
ind = 12
text = " ".join(ner_dataset[ind]["tokens"])
print(text)
print(" ".join(map(str, ner_dataset[ind]["ner_tags"])))
ner_pipe(text)

በናዝሬት አጠቃላይ ዕድሮች የመሰብሰቢያ አዳራሽ በተካሄደው በዚሁ ስብሰባ ላይ በአዳራሽ ውስጥ በመቀመጥና በመቆም ፣ ከውጭም በመስኮት ቁጥሩ እስከ 1400 የሆነ ሕዝብ መገኘቱን የገለጡት አቶ ልደቱ መታፈስ ሊኖር ይችላል" የሚል ማስፈራሪያ በከተማው ተሠራጭቶ የነበረ ቢሆንም ቁጥራቸው የበዛ ወጣቶች ጐልማሶችና አዛውንቶች በስብሰባው ላይ መካፈላቸውን ፣ ብዙውም በአዳራሽ ጥበት ምክንያትም መመለሱን አመልክተዋል ።
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


[{'entity_group': 'TTL',
  'score': 0.9998541,
  'word': 'አቶ',
  'start': 117,
  'end': 119},
 {'entity_group': 'PER',
  'score': 0.9998771,
  'word': 'ልደቱ',
  'start': 120,
  'end': 123}]