In [18]:
import os
import time
import json
import pandas as pd
from transformers import pipeline

print("--- [MODEL COMPARISON NOTEBOOK] ---")

# --- 1. Configuration & Setup
PROJECT_ROOT = os.getcwd()

# --- Define paths to your four fine-tuned models and test files---
MODEL_A_PATH = os.path.join(PROJECT_ROOT, "models", "ethio-ner-xlm-roberta-base", "best-model")
MODEL_B_PATH = os.path.join(PROJECT_ROOT, "models", "ethio-ner-distilbert-base-multilingual-cased", "best-model")
MODEL_C_PATH = os.path.join(PROJECT_ROOT, "models", "ethio-ner-bert-tiny-amharic", "best-model")
MODEL_D_PATH = os.path.join(PROJECT_ROOT, "models", "ethio-ner-afroxlmr-large-ner-masakhaner-1.0_2.0", "best-model")
TEST_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "data_splits", "test.conll")

print(f"[*] Project Root: {PROJECT_ROOT}")
print(f"\n[*] Model A (Generalist): {MODEL_A_PATH}")
print(f"[*] Model B (Speed Focus): {MODEL_B_PATH}")
print(f"[*] Model C (Monolingual Specialist): {MODEL_C_PATH}")
print(f"[*] Model D (NER Specialist): {MODEL_D_PATH}")

--- [MODEL COMPARISON NOTEBOOK] ---
[*] Project Root: /content/EthioMart-Amharic-NER-Project

[*] Model A (Generalist): /content/EthioMart-Amharic-NER-Project/models/ethio-ner-xlm-roberta-base/best-model
[*] Model B (Speed Focus): /content/EthioMart-Amharic-NER-Project/models/ethio-ner-distilbert-base-multilingual-cased/best-model
[*] Model C (Monolingual Specialist): /content/EthioMart-Amharic-NER-Project/models/ethio-ner-bert-tiny-amharic/best-model
[*] Model D (NER Specialist): /content/EthioMart-Amharic-NER-Project/models/ethio-ner-afroxlmr-large-ner-masakhaner-1.0_2.0/best-model


In [19]:
print("\n[*] Loading NER pipelines...")
# Use a try-except block to handle environments with/without a GPU
try:
    pipe_a = pipeline("ner", model=MODEL_A_PATH, device=0, aggregation_strategy="simple")
    pipe_b = pipeline("ner", model=MODEL_B_PATH, device=0, aggregation_strategy="simple")
    pipe_c = pipeline("ner", model=MODEL_C_PATH, device=0, aggregation_strategy="simple")
    pipe_d = pipeline("ner", model=MODEL_D_PATH, device=0, aggregation_strategy="simple")
    print("[+] Pipelines loaded successfully onto GPU.")
except Exception as e:
    print(f"[!] GPU not available or error occurred: {e}. Loading models on CPU.")
    pipe_a = pipeline("ner", model=MODEL_A_PATH, aggregation_strategy="simple")
    pipe_b = pipeline("ner", model=MODEL_B_PATH, aggregation_strategy="simple")
    pipe_c = pipeline("ner", model=MODEL_C_PATH, aggregation_strategy="simple")
    pipe_d = pipeline("ner", model=MODEL_D_PATH, aggregation_strategy="simple")

# Load the raw text sentences from the test file for the speed benchmark
def load_test_sentences(file_path):
    """Loads just the text of sentences from a CoNLL file."""
    sentences = []
    with open(file_path, "r", encoding="utf-8") as f:
        current_sentence = []
        for line in f:
            line = line.strip()
            if line and " " in line:
                current_sentence.append(line.split()[0])
            elif current_sentence:
                sentences.append(" ".join(current_sentence))
                current_sentence = []
        if current_sentence: # Add the last sentence if file doesn't end with newline
             sentences.append(" ".join(current_sentence))
    return sentences

test_sentences = load_test_sentences(TEST_DATA_PATH)
if not test_sentences:
    raise ValueError("No sentences found in the test file. Cannot run speed test.")

print(f"\n[*] Loaded {len(test_sentences)} sentences for speed benchmark.")


[*] Loading NER pipelines...


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


[+] Pipelines loaded successfully onto GPU.

[*] Loaded 3 sentences for speed benchmark.


In [20]:
# --- Run Inference Speed Benchmark ---
def benchmark_model(pipe, model_name):
    """Measures the inference speed of a given pipeline."""
    print(f"\n[*] Benchmarking {model_name}...")
    start_time = time.time()
    for sentence in test_sentences:
        pipe(sentence)
    end_time = time.time()
    total_time = end_time - start_time
    avg_time = total_time / len(test_sentences)
    print(f"[+] {model_name} completed in {total_time:.4f}s. Average: {avg_time:.4f}s/sentence.")
    return avg_time

benchmark_model(pipe_a, "Model A (XLM-Roberta)")
benchmark_model(pipe_b, "Model B (DistilBERT)")
benchmark_model(pipe_c, "Model C (Amharic-BERT)")
benchmark_model(pipe_d, "Model D (MasakhaNER-Large)")


[*] Benchmarking Model A (XLM-Roberta)...
[+] Model A (XLM-Roberta) completed in 0.5034s. Average: 0.1678s/sentence.

[*] Benchmarking Model B (DistilBERT)...
[+] Model B (DistilBERT) completed in 0.0381s. Average: 0.0127s/sentence.

[*] Benchmarking Model C (Amharic-BERT)...
[+] Model C (Amharic-BERT) completed in 0.0269s. Average: 0.0090s/sentence.

[*] Benchmarking Model D (MasakhaNER-Large)...
[+] Model D (MasakhaNER-Large) completed in 0.0941s. Average: 0.0314s/sentence.


0.03138240178426107