In [None]:
import os
import time
import json
import pandas as pd
from transformers import pipeline

# Resolve the project layout relative to this file's directory.
# `__file__` exists when the code runs as a script but is not defined inside a
# Jupyter kernel, so fall back to the current working directory there.
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # NOTE(review): assumes the kernel's working directory is the notebook's
    # own folder (one level below the project root) — confirm.
    SCRIPT_DIR = os.getcwd()
PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)

# Output directories of the two fine-tuned models being compared.
MODEL_A_OUTPUT_DIR = os.path.join(PROJECT_ROOT, "models", "ethio-ner-xlm-roberta-base")
MODEL_B_OUTPUT_DIR = os.path.join(PROJECT_ROOT, "models", "ethio-ner-afro-xlm-roberta-base")

# The "best-model" sub-directory of each output directory is what gets loaded.
MODEL_A_PATH = os.path.join(MODEL_A_OUTPUT_DIR, "best-model")
MODEL_B_PATH = os.path.join(MODEL_B_OUTPUT_DIR, "best-model")

# CoNLL-formatted test split used for the speed benchmark.
TEST_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "data_splits", "test.conll")

In [None]:
# Banner identifying which two checkpoints are being compared.
print(
    "--- [MODEL COMPARISON] ---",
    f"[*] Model A: {MODEL_A_PATH}",
    f"[*] Model B: {MODEL_B_PATH}",
    "--------------------------\n",
    sep="\n",
)


def _load_ner_pipelines(**device_kwargs):
    """Build the Model A and Model B NER pipelines with identical settings.

    Any keyword arguments (e.g. ``device=0``) are forwarded to both
    ``pipeline`` calls; Model A is constructed first, then Model B.
    """
    return [
        pipeline("ner", model=model_path, aggregation_strategy="simple", **device_kwargs)
        for model_path in (MODEL_A_PATH, MODEL_B_PATH)
    ]


print("[*] Loading NER pipelines...")
# First attempt targets device 0; any failure triggers a reload of both
# pipelines without an explicit device.
try:
    pipe_a, pipe_b = _load_ner_pipelines(device=0)
    print("[+] Pipelines loaded successfully onto GPU.")
except Exception as load_error:
    print(f"[!] GPU not available or error occurred: {load_error}. Loading models on CPU.")
    pipe_a, pipe_b = _load_ner_pipelines()

In [None]:
def load_test_sentences(file_path):
    """Load the sentences of a CoNLL file as space-joined token strings.

    A token line must hold at least two whitespace-separated fields; only the
    first field (the token) is kept. Any other line — blank or single-field —
    closes the sentence currently being built.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of strings, one per sentence, with tokens joined by single
        spaces. Empty if the file contains no token lines.
    """
    sentences = []
    current_sentence = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # split() with no argument separates on any whitespace, so both
            # tab- and space-delimited CoNLL files are recognised.
            fields = line.split()
            if len(fields) >= 2:
                current_sentence.append(fields[0])
            elif current_sentence:
                sentences.append(" ".join(current_sentence))
                current_sentence = []
    # Flush the last sentence when the file does not end with a separator line.
    if current_sentence:
        sentences.append(" ".join(current_sentence))
    return sentences

# Read the benchmark corpus; an empty corpus would make the per-sentence
# average undefined, so stop early in that case.
test_sentences = load_test_sentences(TEST_DATA_PATH)
if len(test_sentences) == 0:
    raise ValueError("No sentences found in the test file. Cannot run speed test.")

print(f"\n[*] Loaded {len(test_sentences)} sentences for speed benchmark.")

# --- Inference Speed Benchmark ---
def _benchmark_pipeline(ner_pipe, sentences):
    """Time one pass of `ner_pipe` over `sentences`, one call per sentence.

    Args:
        ner_pipe: Callable invoked once per sentence with the sentence string.
        sentences: Non-empty list of sentence strings.

    Returns:
        A ``(total_seconds, average_seconds_per_sentence)`` tuple.
    """
    # Untimed warm-up call so any one-time first-call overhead is not charged
    # to the measurement (and does not bias whichever model runs first).
    ner_pipe(sentences[0])

    # perf_counter is a monotonic, high-resolution clock intended for timing
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    for sentence in sentences:
        ner_pipe(sentence)
    total = time.perf_counter() - start
    return total, total / len(sentences)


print("[*] Benchmarking Model A (xlm-roberta-base)...")
total_time_a, avg_time_a = _benchmark_pipeline(pipe_a, test_sentences)

print("[*] Benchmarking Model B (afro-xlm-roberta-base)...")
total_time_b, avg_time_b = _benchmark_pipeline(pipe_b, test_sentences)

In [None]:
print("\n[*] Assembling comparison data...")
# NOTE(review): these scores are hard-coded 0.0 placeholders, so the table
# below reports zeros until real test-set metrics are filled in.
metrics_a = {"f1": 0.0, "precision": 0.0, "recall": 0.0} # For xlm-roberta-base
metrics_b = {"f1": 0.0, "precision": 0.0, "recall": 0.0} # For afro-xlm-roberta-base

# One row per model; list positions line up across all columns (A first, B second).
comparison_data = {
    "Model": ["XLM-Roberta-Base (Generalist)", "Afro-XLM-R-Base (Specialist)"],
    "F1-Score (Test Set)": [metrics_a['f1'], metrics_b['f1']],
    "Precision": [metrics_a['precision'], metrics_b['precision']],
    "Recall": [metrics_a['recall'], metrics_b['recall']],
    "Avg. Inference Time (s/sentence)": [avg_time_a, avg_time_b]
}

df_comparison = pd.DataFrame(comparison_data)

# Format the output for better viewing (applies to pandas' own DataFrame reprs).
pd.set_option('display.float_format', '{:.4f}'.format)

print("\n\n--- EthioMart NER Model Comparison ---")
# to_markdown renders through tabulate and does not use display.float_format,
# so the 4-decimal precision is passed explicitly.
print(df_comparison.to_markdown(index=False, floatfmt=".4f"))