# Task 3: Training and Evaluating an NER Model with Hugging Face Transformers

In this task, we fine-tuned a pretrained transformer model (e.g., XLM-R) for Named Entity Recognition (NER) on the Amharic dataset.

## Key Steps

- **Set up TrainingArguments:** Configure hyperparameters like learning rate, batch size, epochs, and evaluation strategy.
- **Define compute_metrics:** Use `seqeval` to calculate accuracy and F1 scores on evaluation data.
- **Initialize Trainer:** Pass the model, training arguments, datasets, tokenizer, data collator, and metric function.
- **Train the model:** Run `trainer.train()` to fine-tune.
- **Evaluate and save:** Evaluate model performance on the test set and save the fine-tuned model.

## Output

Training logs show loss, accuracy, and F1 score per epoch. Final evaluation metrics provide a quantitative measure of model performance.

---

This process enables effective fine-tuning of transformer-based NER models on custom datasets with ease.

import sys
sys.path.append('C:/Users/saron/OneDrive/Desktop/kifya/week4/ethioMart-ner')  
from utils.data_utils import parse_conll
from datasets import Dataset
import pandas as pd

tokens, tags = parse_conll("../data/labeled_data.conll")

data = [{"tokens": t, "ner_tags": l} for t, l in zip(tokens, tags)]
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2)


In [2]:
from transformers import AutoTokenizer

label_list = ['O', 'B-LOC', 'I-LOC', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

model_name = "Davlan/afro-xlmr-base"  # or "GeezTech/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [3]:
from transformers import DataCollatorForTokenClassification

# Step 3: Build label2id and id2label mappings
unique_labels = set(label for split in dataset.values() for doc in split["ner_tags"] for label in doc)
label_list = sorted(unique_labels)
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

# Step 4: Tokenization and alignment
def tokenize_and_align(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Step 5: Apply to dataset
tokenized_dataset = dataset.map(tokenize_and_align, batched=True)

# Step 6: Create the data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoModelForTokenClassification

model_name = "Davlan/xlm-roberta-base-ner-hrl"

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


config.json:   0%|          | 0.00/980 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [10]:
import sys
from transformers import TrainingArguments, Trainer
from seqeval.metrics import accuracy_score, f1_score
import logging

logging.basicConfig(level=logging.INFO)

args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=1,
    report_to=None,
    disable_tqdm=False,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[label_list[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]
    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
    }

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting training...")
sys.stdout.flush()
trainer.train()
print("Training finished!")
sys.stdout.flush()

metrics = trainer.evaluate()
print("Evaluation metrics:", metrics)
sys.stdout.flush()

trainer.save_model("../models/amharic_ner_xlmr")


Starting training...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1699,0.108169,0.981112,0.853081




Training finished!


Evaluation metrics: {'eval_loss': 0.10816903412342072, 'eval_accuracy': 0.9811117107393416, 'eval_f1': 0.8530805687203791, 'eval_runtime': 4.6795, 'eval_samples_per_second': 2.137, 'eval_steps_per_second': 0.427, 'epoch': 1.0}


In [11]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.10816903412342072, 'eval_accuracy': 0.9811117107393416, 'eval_f1': 0.8530805687203791, 'eval_runtime': 4.8258, 'eval_samples_per_second': 2.072, 'eval_steps_per_second': 0.414, 'epoch': 1.0}
