In [3]:
import torch
from torch.utils.data import DataLoader, SequentialSampler
from transformers import (
    AutoTokenizer,
    CamembertForTokenClassification,
    DataCollatorForTokenClassification
)
from datasets import load_dataset

import os

# -------------------------
# Reuse tokenize_and_align_labels function
# -------------------------
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize pre-split sentences and attach one UPOS label per sub-token.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and an "upos" entry (one list of label ids per
            sentence, indexed by word position).
        tokenizer: Callable invoked with the word lists,
            ``is_split_into_words=True`` and ``truncation=True``; its result
            must expose ``word_ids(batch_index=...)`` and accept item
            assignment.

    Returns:
        The tokenizer's result with an added "labels" entry. Positions whose
        word id is ``None`` get -100; every other position gets the label of
        the word it came from.
    """
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True
    )

    def _labels_for(sentence_idx):
        # A word id of None marks a position with no source word; -100 is
        # the sentinel the evaluation loop filters out.
        return [
            -100 if word_idx is None else examples["upos"][sentence_idx][word_idx]
            for word_idx in encoding.word_ids(batch_index=sentence_idx)
        ]

    encoding["labels"] = [
        _labels_for(sentence_idx)
        for sentence_idx, _ in enumerate(examples["tokens"])
    ]
    return encoding

def evaluate_model_on_dataset(
    model_dir,           # e.g. "./my_camembert_pos_model_fr_gsd"
    ud_subset_name,      # e.g. "fr_gsd", "fr_ftb", etc.
    batch_size=16
):
    """Evaluate a saved token-classification model on a UD test split.

    Loads the tokenizer and model from ``model_dir``, loads the
    ``universal_dependencies`` configuration ``ud_subset_name``, tokenizes
    its test split with ``tokenize_and_align_labels`` and prints/returns the
    loss and accuracy.

    Args:
        model_dir: Directory holding the saved model and tokenizer.
        ud_subset_name: Configuration name passed to ``load_dataset``.
        batch_size: Number of sentences per evaluation batch.

    Returns:
        A tuple ``(avg_test_loss, test_accuracy)``. The loss is the mean of
        the per-batch losses (NaN if the test split yields no batches). The
        accuracy is computed over positions whose label is not -100 (0.0 if
        there are none).
    """

    print(f"\n=== Evaluating model at: {model_dir} on dataset: {ud_subset_name} ===")

    # 1) Load the dataset
    dataset = load_dataset(
        "universal_dependencies",
        ud_subset_name,
        trust_remote_code=True
    )

    # 2) Load the tokenizer and model from disk
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = CamembertForTokenClassification.from_pretrained(model_dir)
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # 3) Tokenize only the test split: train/validation are never read here,
    #    so mapping the whole DatasetDict would be wasted work.
    def my_map_fn(examples):
        return tokenize_and_align_labels(examples, tokenizer)

    test_dataset = dataset["test"].map(
        my_map_fn,
        batched=True
    )

    # Expose only the model inputs, as PyTorch tensors
    test_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"]
    )

    # 4) Create a DataLoader for the test set
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

    test_dataloader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size,
        collate_fn=data_collator
    )

    # 5) Loop over test set to compute loss & accuracy
    test_loss = 0.0
    test_correct = 0
    test_total = 0

    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            # Accumulate the per-batch loss; it is averaged over batches below
            test_loss += outputs.loss.item()

            # Compute predictions
            preds = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]

            # Only consider positions where label != -100
            mask = labels != -100
            test_correct += (preds[mask] == labels[mask]).sum().item()
            test_total += mask.sum().item()

    # Guard both ratios: an empty test split must not raise ZeroDivisionError.
    num_batches = len(test_dataloader)
    avg_test_loss = test_loss / num_batches if num_batches > 0 else float("nan")
    test_accuracy = test_correct / test_total if test_total > 0 else 0.0

    print(f"Test Loss: {avg_test_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")

    return avg_test_loss, test_accuracy

# -------------------------
# Example usage:
# -------------------------
if __name__ == "__main__":
    # Each entry pairs a saved model directory with the UD configuration
    # whose test split it is evaluated on.
    models_and_datasets = [
        ("./my_camembert_pos_model_fr_gsd", "fr_gsd"),
        ("./my_camembert_pos_model_fr_ftb", "fr_ftb"),
        ("./my_camembert_pos_model_fr_partut", "fr_partut"),
        ("./my_camembert_pos_model_squioa", "fr_sequoia"),
        ("./my_camembert_pos_model_fr_spoken", "fr_spoken"),
    ]

    # Evaluate the pairs one after another, in the order listed.
    for model_dir, ud_subset in models_and_datasets:
        evaluate_model_on_dataset(model_dir=model_dir, ud_subset_name=ud_subset)



=== Evaluating model at: ./my_camembert_pos_model_fr_gsd on dataset: fr_gsd ===


Using the latest cached version of the module from C:\Users\rlarabi\.cache\huggingface\modules\datasets_modules\datasets\universal_dependencies\1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7 (last modified on Sun Dec 22 01:48:43 2024) since it couldn't be found locally at universal_dependencies, or remotely on the Hugging Face Hub.


Map:   0%|          | 0/14449 [00:00<?, ? examples/s]

Map:   0%|          | 0/1476 [00:00<?, ? examples/s]

Map:   0%|          | 0/416 [00:00<?, ? examples/s]

Test Loss: 0.0793 | Test Accuracy: 0.9786

=== Evaluating model at: ./my_camembert_pos_model_fr_ftb on dataset: fr_ftb ===


Using the latest cached version of the module from C:\Users\rlarabi\.cache\huggingface\modules\datasets_modules\datasets\universal_dependencies\1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7 (last modified on Sun Dec 22 01:48:43 2024) since it couldn't be found locally at universal_dependencies, or remotely on the Hugging Face Hub.


Map:   0%|          | 0/14759 [00:00<?, ? examples/s]

Map:   0%|          | 0/1235 [00:00<?, ? examples/s]

Map:   0%|          | 0/2541 [00:00<?, ? examples/s]

Test Loss: 2.1138 | Test Accuracy: 0.2826

=== Evaluating model at: ./my_camembert_pos_model_fr_partut on dataset: fr_partut ===


Using the latest cached version of the module from C:\Users\rlarabi\.cache\huggingface\modules\datasets_modules\datasets\universal_dependencies\1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7 (last modified on Sun Dec 22 01:48:43 2024) since it couldn't be found locally at universal_dependencies, or remotely on the Hugging Face Hub.


Map:   0%|          | 0/803 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Test Loss: 0.8896 | Test Accuracy: 0.9689

=== Evaluating model at: ./my_camembert_pos_model_squioa on dataset: fr_sequoia ===


Using the latest cached version of the module from C:\Users\rlarabi\.cache\huggingface\modules\datasets_modules\datasets\universal_dependencies\1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7 (last modified on Sun Dec 22 01:48:43 2024) since it couldn't be found locally at universal_dependencies, or remotely on the Hugging Face Hub.


Map:   0%|          | 0/2231 [00:00<?, ? examples/s]

Map:   0%|          | 0/412 [00:00<?, ? examples/s]

Map:   0%|          | 0/456 [00:00<?, ? examples/s]

Test Loss: 0.4097 | Test Accuracy: 0.9837

=== Evaluating model at: ./my_camembert_pos_model_fr_spoken on dataset: fr_spoken ===


Using the latest cached version of the module from C:\Users\rlarabi\.cache\huggingface\modules\datasets_modules\datasets\universal_dependencies\1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7 (last modified on Sun Dec 22 01:48:43 2024) since it couldn't be found locally at universal_dependencies, or remotely on the Hugging Face Hub.


Map:   0%|          | 0/1167 [00:00<?, ? examples/s]

Map:   0%|          | 0/909 [00:00<?, ? examples/s]

Map:   0%|          | 0/730 [00:00<?, ? examples/s]

Test Loss: 0.7796 | Test Accuracy: 0.9674
