In [None]:
# ------------------------
# SETUP AND IMPORTS
# ------------------------
import os
import warnings

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from seqeval.metrics import classification_report, f1_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ----------------------------
# READ DATA FROM GOOGLE DRIVE
# ----------------------------

def read_conll(filepath):
    """Parse a whitespace-separated CoNLL file into parallel token/tag lists.

    Every non-blank line contributes one token (first field) and one tag
    (second field). Blank lines end the current sentence; a trailing sentence
    without a final blank line is still returned.

    Lines that carry a token but no tag are skipped with a ``UserWarning``
    naming the file and line number, instead of aborting the whole load with
    an ``IndexError``.

    Args:
        filepath: Path of the CoNLL file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` the list of tags of the
        same length.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    sentences, labels = [], []
    tokens, tags = [], []
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which would
    # otherwise be glued to the first token of the file.
    with open(filepath, encoding="utf-8-sig") as f:
        for line_number, raw_line in enumerate(f, start=1):
            fields = raw_line.split()
            if not fields:
                # Blank line: close the sentence collected so far (if any).
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            if len(fields) < 2:
                warnings.warn(
                    f"{filepath}:{line_number}: skipping line without a tag: "
                    f"{raw_line.strip()!r}"
                )
                continue
            tokens.append(fields[0])
            tags.append(fields[1])
    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)
    return sentences, labels

# Location of the labelled CoNLL file. The default is the Colab Google Drive
# mount point; set the AMHARIC_CONLL_PATH environment variable to run the
# notebook against a copy stored elsewhere.
CONLL_PATH = os.environ.get(
    "AMHARIC_CONLL_PATH", "/content/drive/MyDrive/labeled_amharic.conll"
)
# Fail fast with an actionable message rather than a bare open() error.
if not os.path.isfile(CONLL_PATH):
    raise FileNotFoundError(
        f"CoNLL file not found at {CONLL_PATH!r}. Mount Google Drive first "
        "(Colab) or point AMHARIC_CONLL_PATH at a local copy of the file."
    )
sentences, ner_tags = read_conll(CONLL_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/labeled_amharic.conll'

In [None]:
# Wrap the parallel token/tag sequences in a Dataset and hold out 20% of the
# sentences for evaluation (fixed seed so the split is reproducible).
data = dict(tokens=sentences, ner_tags=ner_tags)
dataset = Dataset.from_dict(data).train_test_split(test_size=0.2, seed=42)

# Every distinct tag seen in the corpus, in sorted order, plus the two
# lookup tables that translate between tag strings and integer class ids.
label_list = sorted(set().union(*ner_tags))
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

In [None]:
# Checkpoint is resolved by name from the Hugging Face Hub (not a local path).
# NOTE(review): "bert-base-cased" looks like an English checkpoint - confirm
# its vocabulary tokenizes Amharic text into something other than unknowns.
model_checkpoint = "bert-base-cased"

# Token-classification model sized to the tag set; the label maps are stored
# in the model config so predictions can be decoded back to tag strings.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenizer matching model_checkpoint. It is kept as a module-level name
# because tokenize_and_align_labels (defined below) reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples, label_all_tokens=False, max_length=128):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Uses the module-level ``tokenizer`` and ``label2id``. Each sentence is
    truncated/padded to ``max_length`` sub-tokens. Positions that belong to no
    word (special tokens, padding) get the label ``-100``, the value that
    ``compute_metrics`` filters out and that PyTorch's cross-entropy loss
    ignores by default.

    By default only the first sub-token of a word carries the word's tag and
    the remaining sub-tokens are set to ``-100``. Copying the tag onto every
    sub-token repeats ``B-`` tags inside a single word, which makes an
    entity-level scorer count one entity per sub-token.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of token lists) and
            ``"ner_tags"`` (list of tag-string lists of matching lengths).
        label_all_tokens: When True, every sub-token of a word receives the
            word's tag instead of only the first one.
        max_length: Sequence length used for truncation and padding.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one list of label ids per sentence.

    Raises:
        KeyError: If a tag is not present in ``label2id``.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=max_length,
    )
    labels = []
    for i, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Special token or padding: never scored.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word (or all of them when requested).
                label_ids.append(label2id[word_tags[word_idx]])
            else:
                # Continuation sub-token of the word already labelled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenize/align step to the dataset in batches; the mapped function
# adds a "labels" entry alongside the tokenizer output.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 40/40 [00:00<00:00, 721.38 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 430.30 examples/s]


In [9]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch (save_strategy mirrors eval_strategy) so that the checkpoint with the
# best "f1" can be reloaded when training finishes.
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./../data/results",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation, checkpointing and best-model selection.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
def compute_metrics(p):
    """Score predictions with the F1 metric over decoded tag sequences.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores whose class axis is axis 2; ``labels`` holds the reference
            label ids, with ``-100`` marking positions to ignore.

    Returns:
        A dict with the single key ``"f1"``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Reference tags, with ignored (-100) positions dropped.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[gold] for gold in gold_row if gold != -100])

    # Predicted tags at exactly the positions kept in the reference.
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_tags = []
        for predicted, gold in zip(predicted_row, gold_row):
            if gold != -100:
                kept_tags.append(id2label[predicted])
        true_predictions.append(kept_tags)

    return {"f1": f1_score(true_labels, true_predictions)}

In [None]:
# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Wire the model, data splits and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is the cell output.
trainer.train()

NameError: name 'training_args' is not defined

In [None]:
# Save the fine-tuned model and its tokenizer into the SAME directory so the
# result can be reloaded from a single from_pretrained() path. Previously the
# tokenizer went to a sibling directory ("../../data/amharic-ner-model").
# NOTE(review): the training output_dir in this notebook is relative to
# "../data" while this path uses "../../data" - confirm which root is intended.
model_output_dir = "../../data/models/amharic-ner-model"
trainer.save_model(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

NameError: name 'trainer' is not defined

In [None]:
# --------------------------
# TASK 4 - COMPARING MODELS
# --------------------------

model_checkpoints_to_compare = [
    "bert-base-cased", # Already trained this one, but including for completeness
    "xlm-roberta-base",
    "distilbert-base-uncased", # Note: DistilBERT might not be ideal for multilingual tasks without a multilingual version
    "bert-base-multilingual-cased",
]

# Evaluation metrics of each fine-tuned checkpoint, keyed by checkpoint name.
# Without this the loop trains every model but leaves nothing to compare.
comparison_results = {}

for model_checkpoint in model_checkpoints_to_compare:
    print(f"Fine-tuning model: {model_checkpoint}")

    # Hub names may contain "/", which must not end up in directory names.
    checkpoint_slug = model_checkpoint.replace('/', '-')

    # Load tokenizer and model.
    # `tokenizer` is intentionally rebound at module level:
    # tokenize_and_align_labels reads it as a global.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
    )

    # The collator must be rebuilt from THIS checkpoint's tokenizer; reusing the
    # one created for an earlier model would apply that model's padding setup.
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Re-tokenize and align labels with the current tokenizer.
    tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{checkpoint_slug}", # Save results in a model-specific directory
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5, # You might want to adjust the number of epochs
        weight_decay=0.01,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none" # Avoid logging to wandb for each individual model if not needed
    )

    # Create Trainer
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Record held-out metrics for the model restored at the end of training
    # (load_best_model_at_end=True), so the checkpoints can be compared.
    comparison_results[model_checkpoint] = trainer.evaluate()

    # Save the fine-tuned model and tokenizer
    trainer.save_model(f"./fine-tuned-models/{checkpoint_slug}")
    tokenizer.save_pretrained(f"./fine-tuned-models/{checkpoint_slug}")

    print(f"Finished fine-tuning {model_checkpoint}\n")

# One row per checkpoint, one column per evaluation metric.
comparison_df = pd.DataFrame.from_dict(comparison_results, orient="index")
comparison_df

Fine-tuning model: bert-base-cased


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 40/40 [00:00<00:00, 356.03 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 243.15 examples/s]


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`