In [None]:
!pip install transformers datasets torch evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downlo

In [None]:
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

def load_conll_file(file_path):
    """Read a CoNLL-style file into parallel lists of tokens and labels.

    Each non-blank line holds one token followed by one or more
    whitespace-separated columns; the first column is taken as the token and
    the last column as its label, so both two-column files (token, label) and
    wider layouts (e.g. token, POS, chunk, label) are accepted. One or more
    blank lines separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read. A leading byte-order
            mark, if present, is ignored.

    Returns:
        A tuple ``(sentences, labels)`` where ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` is the list of label
        strings aligned with it.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. The
            message includes the file path and the 1-based line number.
    """
    sentences, labels = [], []
    current_sentence, current_labels = [], []
    # 'utf-8-sig' decodes plain UTF-8 as well, but strips a BOM so that it does
    # not end up glued to the first token.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_number, raw_line in enumerate(f, start=1):
            fields = raw_line.split()
            if not fields:
                # Blank (or whitespace-only) line: close the sentence in progress.
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence, current_labels = [], []
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{file_path}: line {line_number}: expected at least 2 "
                    f"whitespace-separated columns (token and label), "
                    f"got {len(fields)}: {raw_line.rstrip()!r}"
                )
            current_sentence.append(fields[0])
            current_labels.append(fields[-1])
    # The file may end without a trailing blank line.
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)
    return sentences, labels

# Parse the annotated corpus into per-sentence token lists and tag lists
sentences, labels = load_conll_file('/content/amharic_ner_conll_100.txt')

# Tag vocabulary: every distinct tag string, in sorted order, gets an integer id
unique_labels = sorted({tag for sentence_tags in labels for tag in sentence_tags})
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

# Wrap the tokens and the integer-encoded tags in a Hugging Face Dataset
data = {
    'tokens': sentences,
    'ner_tags': [
        list(map(label2id.__getitem__, sentence_tags))
        for sentence_tags in labels
    ],
}
dataset = Dataset.from_dict(data)

# Hold out 20% of the sentences for validation; the seed fixes the shuffle
splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = splits['train'], splits['test']

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import torch
import evaluate
import time

# Encoders to compare: short display name -> Hugging Face Hub checkpoint id.
# Insertion order is the order in which the models are fine-tuned.
models = dict([
    ('xlm-roberta', 'xlm-roberta-base'),
    ('distilbert', 'distilbert-base-multilingual-cased'),
    ('mbert', 'bert-base-multilingual-cased'),
])

# Hyperparameters shared by every fine-tuning run
training_args = TrainingArguments(
    # --- output / logging ---
    output_dir='/content/results',
    logging_steps=10,
    report_to="none",  # Disable wandb logging
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- evaluation and checkpointing run on the same 50-step cadence ---
    eval_strategy='steps',
    eval_steps=50,
    save_strategy='steps',
    save_steps=50,
    save_total_limit=1,
    # --- after training, restore the checkpoint with the best 'f1' ---
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

In [None]:
from seqeval.metrics import classification_report
import numpy as np

# seqeval scorer, loaded through the `evaluate` library
metric = evaluate.load('seqeval')

def compute_metrics(eval_pred):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            position is the argmax of ``logits`` over its last axis; positions
            whose gold label is -100 are excluded from scoring.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall', taken
        from the metric's 'overall_*' entries.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def _to_tags(id_row, gold_row):
        # Map ids to tag strings, keeping only positions with a real gold label.
        return [id2label[i] for i, gold in zip(id_row, gold_row) if gold != -100]

    true_labels = [_to_tags(gold_row, gold_row) for gold_row in labels]
    pred_labels = [
        _to_tags(pred_row, gold_row)
        for pred_row, gold_row in zip(predictions, labels)
    ]

    scores = metric.compute(predictions=pred_labels, references=true_labels)
    reported = (
        ('accuracy', 'overall_accuracy'),
        ('f1', 'overall_f1'),
        ('precision', 'overall_precision'),
        ('recall', 'overall_recall'),
    )
    return {name: scores[key] for name, key in reported}

# Needed for dynamic per-batch padding of inputs and labels (see the loop below).
from transformers import DataCollatorForTokenClassification

# Store results
results = {}

# Loss weight given to every entity tag; the outside tag 'O' keeps weight 1.0.
ENTITY_CLASS_WEIGHT = 5.0


class WeightedLossTrainer(Trainer):
    """Trainer that uses class-weighted cross-entropy as its loss.

    The stock Trainer takes the loss computed inside the model's forward pass,
    which has no notion of class weights; assigning a `class_weights` attribute
    on the model does not change that. Overriding `compute_loss` is what makes
    the weights take effect.

    Args:
        class_weights: 1-D float tensor with one weight per label id, or None
            for an unweighted loss. All other arguments are forwarded to
            `Trainer`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted token-level loss (and the model outputs if requested)."""
        # Remove the labels so the model does not compute its own unweighted loss.
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits
        weights = self.class_weights
        if weights is not None:
            weights = weights.to(device=logits.device, dtype=logits.dtype)
        # Positions labelled -100 (special tokens, sub-word continuations,
        # padding) do not contribute to the loss.
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights, ignore_index=-100)
        loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        return (loss, outputs) if return_outputs else loss


# NOTE(review): every run shares training_args.output_dir, so intermediate
# checkpoints of successive models are written to the same directory. The final
# models are saved to separate per-model directories below.
for model_name, model_path in models.items():
    print(f"\n🚀 Fine-tuning {model_name}...")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(
        model_path,
        num_labels=len(unique_labels),
        id2label=id2label,
        label2id=label2id
    )

    # Tokenize dataset
    def tokenize_and_align_labels(examples):
        """Tokenize pre-split words and align the word-level tags to sub-word tokens.

        Only the first sub-word token of each word keeps the word's tag; special
        tokens and the remaining sub-word pieces get -100 so that they are
        ignored by the loss and by `compute_metrics`.

        Args:
            examples: Batch with a 'tokens' column (list of word lists) and a
                'ner_tags' column (list of tag-id lists).

        Returns:
            The tokenizer output with an added 'labels' entry.
        """
        # No padding here: the data collator pads each training batch on the
        # fly, which stays correct when the dataset spans several map batches.
        tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    # Special token such as the start/end markers
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    # First sub-word piece of a new word
                    label_ids.append(label[word_idx])
                else:
                    # Continuation piece of the same word
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs['labels'] = labels
        return tokenized_inputs

    tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
    tokenized_val = val_dataset.map(tokenize_and_align_labels, batched=True)

    # Pads input ids, attention mask and labels (with -100) to the longest
    # sequence of each batch.
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Class weights derived from the actual label set, so the vector always has
    # len(unique_labels) entries and the low weight lands on 'O' regardless of
    # where sorting placed it. Assumes the outside tag is spelled 'O'.
    class_weights = torch.tensor(
        [1.0 if label == 'O' else ENTITY_CLASS_WEIGHT for label in unique_labels]
    )

    # Initialize trainer
    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        class_weights=class_weights
    )

    # Train and measure time. The clock starts here so that model download and
    # tokenization are not counted as training time.
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    # Evaluate
    eval_results = trainer.evaluate()
    inference_start = time.time()
    trainer.predict(tokenized_val)
    inference_time = time.time() - inference_start

    # Store results
    results[model_name] = {
        'accuracy': eval_results['eval_accuracy'],
        'f1': eval_results['eval_f1'],
        'precision': eval_results['eval_precision'],
        'recall': eval_results['eval_recall'],
        'training_time': training_time,
        'inference_time': inference_time
    }

    # Save model, plus its tokenizer so the directory can be loaded on its own
    save_dir = f'/content/{model_name}_ner_model'
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"✅ {model_name} training complete! Model saved.")


🚀 Fine-tuning xlm-roberta...


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.8022,0.622653,0.825243,0.648649,1.0,0.48


  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


✅ xlm-roberta training complete! Model saved.

🚀 Fine-tuning distilbert...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.627,0.648427,0.834951,0.604651,0.722222,0.52


✅ distilbert training complete! Model saved.

🚀 Fine-tuning mbert...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.5947,0.626395,0.883495,0.734694,0.75,0.72


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.5947,0.626395,0.883495,0.734694,0.75,0.72


✅ mbert training complete! Model saved.


In [None]:
import pandas as pd

# Create comparison table: one row per fine-tuned model, in the order trained.
# Each pair is (column header in the table, key in the per-model results dict).
_column_sources = [
    ('Accuracy', 'accuracy'),
    ('F1', 'f1'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('Training Time (s)', 'training_time'),
    ('Inference Time (s)', 'inference_time'),
]
comparison = pd.DataFrame(
    [
        {'Model': name, **{column: scores[key] for column, key in _column_sources}}
        for name, scores in results.items()
    ],
    columns=['Model'] + [column for column, _key in _column_sources],
)

print("\n📊 Model Comparison:")
print(comparison)

# Select best model: the row with the highest F1
best_row_label = comparison['F1'].idxmax()
best_model = comparison.loc[best_row_label]
print(f"\n🏆 Best Model: {best_model['Model']}")
print(f"F1: {best_model['F1']:.4f}, Accuracy: {best_model['Accuracy']:.4f}, "
      f"Training Time: {best_model['Training Time (s)']:.2f}s, "
      f"Inference Time: {best_model['Inference Time (s)']:.2f}s")


📊 Model Comparison:
         Model  Accuracy        F1  Precision  Recall  Training Time (s)  \
0  xlm-roberta  0.825243  0.648649   1.000000    0.48        1058.347344   
1   distilbert  0.834951  0.604651   0.722222    0.52         511.651932   
2        mbert  0.883495  0.734694   0.750000    0.72         863.214552   

   Inference Time (s)  
0            2.883138  
1            1.035541  
2            1.981572  

🏆 Best Model: mbert
F1: 0.7347, Accuracy: 0.8835, Training Time: 863.21s, Inference Time: 1.98s
