Import the packages used later and define a helper function that reads the CoNLL file

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset, load_metric
import numpy as np
import pandas as pd
import time

# Function to read CONLL file
def read_conll_file(file_path):
    """Parse a two-column, tab-separated CoNLL file into a list of sentences.

    Each non-blank line is expected to look like ``token<TAB>tag``. Blank lines
    separate sentences. Lines that do not split into exactly two tab-separated
    fields are skipped.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A list of dicts, one per sentence, each with the keys ``"tokens"`` and
        ``"ner_tags"`` holding parallel lists of strings.
    """
    sentences = []
    tokens, ner_tags = [], []

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            if not line:  # Sentence boundary
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": ner_tags})
                    tokens, ner_tags = [], []
                continue

            parts = line.split('\t')
            if len(parts) == 2:
                token, tag = parts
                tokens.append(token)
                ner_tags.append(tag)

    # A file that does not end with a blank line still has a pending sentence;
    # without this flush the last sentence would be silently dropped.
    if tokens:
        sentences.append({"tokens": tokens, "ner_tags": ner_tags})

    return sentences



Upload .conll file

In [None]:
# Colab-only step: `google.colab` is presumably unavailable outside Google Colab,
# so skip this cell when the .conll file is already on disk.
# `uploaded` keeps the return value of files.upload(); no later visible cell reads it.
from google.colab import files
uploaded = files.upload()

In [None]:
# Path of the labeled CoNLL file, relative to the working directory.
# NOTE(review): presumably this must match the name of the file uploaded above — confirm.
input_file = "amharic_ner_conll_labeled_output.conll"

## Load and Split CoNLL Dataset
- Defines a utility to load a CoNLL-formatted NER dataset and split it into train, validation, and test sets:

- Reads token-label sequences using read_conll_file

- Converts data to a DataFrame for easy splitting

- Uses train_test_split to partition the data

- Returns a Hugging Face DatasetDict ready for model training and evaluation

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split

def load_and_split_conll(file_path, val_size=0.2, test_size=0.1, random_state=42):
    """Read one CoNLL file and split its sentences into train/validation/test.

    Args:
        file_path: Path of the CoNLL file, parsed with ``read_conll_file``.
        val_size: Share of the sentences placed in the validation split.
        test_size: Share of the sentences placed in the test split.
        random_state: Seed passed to both ``train_test_split`` calls so the
            partition is reproducible.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"validation"`` and
        ``"test"``; each split has the columns ``tokens`` and ``ner_tags``.

    Raises:
        ValueError: If no sentences could be read from ``file_path``.
    """
    # Read the single CONLL file
    data = read_conll_file(file_path)
    if not data:
        raise ValueError(f"No sentences found in {file_path}")

    # Convert to pandas DataFrame for easy splitting
    df = pd.DataFrame(data)

    # First carve off validation+test together, then divide that remainder.
    holdout_size = val_size + test_size
    train_df, holdout_df = train_test_split(
        df, test_size=holdout_size, random_state=random_state
    )
    val_df, test_df = train_test_split(
        holdout_df, test_size=test_size / holdout_size, random_state=random_state
    )

    # preserve_index=False keeps the shuffled pandas index from being stored as
    # an extra '__index_level_0__' column in every split.
    return DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
        "test": Dataset.from_pandas(test_df, preserve_index=False)
    })



## compare_models(dataset) – Model Benchmarking Function

- This function benchmarks multiple multilingual transformer models for Amharic NER using a given dataset.

- Models evaluated: xlm-roberta-base, distilbert-base-multilingual-cased, and bert-base-multilingual-cased

- Dynamically extracts all unique NER labels and creates label2id/id2label mappings

- Tokenizes input tokens with proper label alignment across subword tokens using fast tokenizers

- Fine-tunes each model using Hugging Face's Trainer API with consistent hyperparameters (batch size, learning rate, epochs)

- Tracks and logs F1-score, precision, recall, accuracy, training time, and model size for each model

- Selects the best model based on validation F1-score

- Returns a results table and the top-performing model's stats

In [None]:
# Checkpoint names passed to AutoTokenizer / AutoModelForTokenClassification by
# compare_models; they are fine-tuned and evaluated one after another in this order.
MODELS_TO_COMPARE = [
    "xlm-roberta-base",
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-cased"
]

def _collect_label_list(dataset):
    """Return the sorted distinct NER tags that occur in any split of ``dataset``."""
    all_tags = set()
    for split in dataset.values():
        for tags in split["ner_tags"]:
            all_tags.update(tags)
    return sorted(all_tags)


def _ensure_seqeval_installed():
    """Install ``seqeval`` with pip only when it cannot already be imported."""
    import importlib
    import importlib.util
    import subprocess
    import sys

    if importlib.util.find_spec("seqeval") is None:
        # sys.executable targets the interpreter running this kernel.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "seqeval"])
        importlib.invalidate_caches()


def _align_labels_with_tokens(word_ids, word_labels, label2id):
    """Map word-level tags onto sub-word tokens.

    Only the first sub-word of each word receives the word's label id; special
    tokens (``word_id is None``), continuation sub-words and tags missing from
    ``label2id`` receive -100.
    """
    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx == previous_word_idx:
            label_ids.append(-100)
        else:
            label_ids.append(label2id.get(word_labels[word_idx], -100))
        previous_word_idx = word_idx
    return label_ids


def compare_models(dataset, model_names=None, max_length=128, learning_rate=2e-5,
                   batch_size=16, num_epochs=3, seed=42):
    """Fine-tune several token-classification models and rank them by F1.

    Each model is trained on ``dataset["train"]`` and evaluated on
    ``dataset["validation"]`` with the seqeval metric. A model that raises
    during training is reported (with traceback) and skipped.

    Args:
        dataset: Mapping of split name to dataset; needs the splits ``"train"``
            and ``"validation"`` and the columns ``tokens`` and ``ner_tags``.
        model_names: Checkpoint names to benchmark. Defaults to
            ``MODELS_TO_COMPARE``.
        max_length: Sequences are truncated and padded to this many tokens.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        batch_size: Per-device batch size for training and evaluation.
        num_epochs: Number of training epochs.
        seed: Seed set before each model is loaded and passed to
            ``TrainingArguments``.

    Returns:
        A tuple ``(results_df, best_model)``: a DataFrame with one row per
        successfully trained model, and the row (as a dict) with the highest
        ``f1_score``.

    Raises:
        ValueError: If no model finished training.

    Side effects:
        Sets the module-level ``label_list``, writes checkpoints and logs under
        ``./results_<model>`` and ``./logs_<model>``, and installs ``seqeval``
        with pip when it is missing.
    """
    # Kept as a module-level global for backward compatibility: the original
    # function published the label list this way.
    global label_list

    import gc
    import traceback

    import torch
    # NOTE(review): load_metric is deprecated in newer `datasets` releases;
    # migrate to the `evaluate` package once that dependency can be added.
    from datasets import load_metric
    from transformers import set_seed

    if model_names is None:
        model_names = MODELS_TO_COMPARE

    label_list = _collect_label_list(dataset)
    labels = label_list

    # Create label mappings
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {i: label for i, label in enumerate(labels)}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    _ensure_seqeval_installed()
    metric = load_metric("seqeval")

    def compute_metrics(eval_pred):
        """Turn logits and label ids into entity-level seqeval scores."""
        logits, label_ids = eval_pred
        predicted_ids = np.argmax(logits, axis=2)

        # Drop positions labelled -100 (special tokens, padding, continuation sub-words).
        true_predictions = [
            [labels[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
            for pred_row, gold_row in zip(predicted_ids, label_ids)
        ]
        true_labels = [
            [labels[gold] for gold in gold_row if gold != -100]
            for gold_row in label_ids
        ]

        scores = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": scores["overall_precision"],
            "recall": scores["overall_recall"],
            "f1": scores["overall_f1"],
            "accuracy": scores["overall_accuracy"],
        }

    results = []

    for model_name in model_names:
        model = trainer = None
        try:
            print(f"\n{'='*50}")
            print(f"Training {model_name}")
            print(f"{'='*50}")

            # Fast tokenizer is required: word_ids() is used for label alignment.
            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

            def tokenize_and_align_labels(examples):
                tokenized_inputs = tokenizer(
                    examples["tokens"],
                    truncation=True,
                    is_split_into_words=True,
                    max_length=max_length,
                    padding="max_length"
                )
                tokenized_inputs["labels"] = [
                    _align_labels_with_tokens(
                        tokenized_inputs.word_ids(batch_index=i), label_seq, label2id
                    )
                    for i, label_seq in enumerate(examples["ner_tags"])
                ]
                return tokenized_inputs

            # Tokenize dataset
            tokenized_datasets = dataset.map(
                tokenize_and_align_labels,
                batched=True,
                remove_columns=dataset["train"].column_names
            )

            # The classification head is newly initialised at random; seeding
            # first makes every candidate start from a reproducible state.
            set_seed(seed)
            model = AutoModelForTokenClassification.from_pretrained(
                model_name,
                num_labels=len(labels),
                id2label=id2label,
                label2id=label2id
            ).to(device)

            safe_name = model_name.replace('/', '-')
            training_args = TrainingArguments(
                output_dir=f"./results_{safe_name}",
                eval_strategy="epoch",
                learning_rate=learning_rate,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=num_epochs,
                weight_decay=0.01,
                save_total_limit=2,
                save_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                logging_dir=f"./logs_{safe_name}",
                logging_steps=10,
                report_to="none",
                seed=seed
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_datasets["train"],
                eval_dataset=tokenized_datasets["validation"],
                tokenizer=tokenizer,
                data_collator=DataCollatorForTokenClassification(tokenizer),
                compute_metrics=compute_metrics
            )

            # Train
            start_time = time.time()
            trainer.train()
            training_time = time.time() - start_time

            # Evaluate on the validation split
            eval_results = trainer.evaluate()

            results.append({
                "model_name": model_name,
                "f1_score": eval_results["eval_f1"],
                "precision": eval_results["eval_precision"],
                "recall": eval_results["eval_recall"],
                "accuracy": eval_results["eval_accuracy"],
                "training_time": training_time,
                "model_size": sum(param.numel() for param in model.parameters())
            })

        except Exception as e:
            # Best effort: one failing model must not abort the whole benchmark.
            print(f"Error training {model_name}: {str(e)}")
            traceback.print_exc()  # Print the full traceback for debugging

        finally:
            # Release this candidate before loading the next one so models do
            # not pile up in (GPU) memory.
            del model, trainer
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    if not results:
        raise ValueError("All models failed to train. Check error messages above.")

    # Find the best model based on F1 score
    best_model_index = int(np.argmax([row["f1_score"] for row in results]))
    best_model = results[best_model_index]

    return pd.DataFrame(results), best_model

Create dataset

In [None]:
# Use the path configured in `input_file` instead of repeating the file name literal.
dataset = load_and_split_conll(input_file, val_size=0.2, test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', '__index_level_0__'],
        num_rows: 6
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', '__index_level_0__'],
        num_rows: 2
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', '__index_level_0__'],
        num_rows: 2
    })
})

Compare Models

In [None]:


comparison_results, best_model = compare_models(dataset)
print(f"Best model: {best_model['model_name']}")
# Leave the DataFrame as the last expression so Jupyter renders it as a table
# instead of the line-wrapped plain text that print() produces.
comparison_results



Training xlm-roberta-base


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.402538,0.12,0.441176,0.188679,0.032
2,No log,2.330938,0.12,0.441176,0.188679,0.032
3,No log,2.291824,0.12,0.441176,0.188679,0.032


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))



Training distilbert-base-multilingual-cased


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.872117,0.066667,0.025641,0.037037,0.46988
2,No log,1.685045,0.090909,0.025641,0.04,0.692771
3,No log,1.599197,0.111111,0.025641,0.041667,0.704819


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))



Training bert-base-multilingual-cased


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.344515,0.0,0.0,0.0,0.716867
2,No log,1.106431,0.0,0.0,0.0,0.716867
3,No log,1.054486,0.0,0.0,0.0,0.716867


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best model: xlm-roberta-base
                           model_name  f1_score  precision    recall  \
0                    xlm-roberta-base  0.188679   0.120000  0.441176   
1  distilbert-base-multilingual-cased  0.041667   0.111111  0.025641   
2        bert-base-multilingual-cased  0.000000   0.000000  0.000000   

   accuracy  training_time  model_size  
0  0.032000     461.666065   277459977  
1  0.704819     180.002606   134741001  
2  0.716867     171.515865   177269769  


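**Best model:** xlm-roberta-base, selected on validation F1 score (0.19) despite its much lower token-level accuracy (0.03), which suggests it is the only model that attempts to identify entities.

distilbert-base-multilingual-cased and mBERT reached higher overall token-level accuracy (about 0.70–0.72) but failed to capture entities effectively (F1 ≤ 0.04). Note that these numbers come from only 6 training and 2 validation sentences, so the ranking should be treated as a smoke test of the pipeline rather than a reliable comparison.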