In [25]:
import pandas as pd
import torch
import numpy as np
import json

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
from seqeval.metrics import classification_report, f1_score

In [None]:
# Load the saved model and its tokenizer from MODEL_DIR

MODEL_DIR = "snips_ner_model_full"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)
model.eval()  # switch the module to evaluation mode

# Label maps
# JSON object keys are always strings, so ids are cast back to int on load;
# label2id is simply the inverse mapping.

with open("ner_id2label.json") as f:
    id2label = dict(
        (int(raw_id), label_name) for raw_id, label_name in json.load(f).items()
    )
label2id = dict(zip(id2label.values(), id2label.keys()))

In [None]:
# Load the token-level test split into a DataFrame
# NOTE(review): pandas' default NA parsing turns literal cells such as
# "null", "NA" or "None" into NaN -- confirm no token in this file is one of
# those strings (or pass keep_default_na=False) before trusting the tokens.

TEST_CSV_PATH = "dataset/snips_ner_test.csv"
df = pd.read_csv(TEST_CSV_PATH)

# Group into sentences

def group_sentences(df):
    """Collapse a one-row-per-token frame into one record per sentence.

    Rows are grouped on the ``sentence_id`` column and ordered by ``token_id``
    within each group. Groups are emitted in the order ``DataFrame.groupby``
    yields them (ascending ``sentence_id`` with pandas' default ``sort=True``).

    Args:
        df: DataFrame with ``sentence_id``, ``token_id``, ``token`` and
            ``slot`` columns.

    Returns:
        A list of ``{"tokens": [...], "slots": [...]}`` dicts, one per
        sentence, with both lists in ``token_id`` order.
    """
    def _to_record(rows):
        # Order the sentence's rows by token position before extracting columns.
        ordered = rows.sort_values("token_id")
        return {
            "tokens": ordered["token"].tolist(),
            "slots": ordered["slot"].tolist(),
        }

    return [_to_record(rows) for _, rows in df.groupby("sentence_id")]

# One {"tokens": [...], "slots": [...]} record per sentence_id in the test CSV
test_sentences = group_sentences(df)

In [None]:
# Build encodings (re-tokenize so we get word_ids())

def build_encodings(sentences, tokenizer, label2id, max_len=64):
    """Tokenize pre-split sentences and align slot labels to subword positions.

    The tokenizer is called on the word lists with ``is_split_into_words=True``,
    padding to ``max_length`` and truncation at ``max_len``. For every position
    of every encoded sentence a label id is produced:

    * positions whose word id is ``None`` get ``-100``;
    * the first subword of a word gets the word's own slot label;
    * further subwords of the same word get the same label, except that a
      ``B-xxx`` label is rewritten to ``I-xxx`` (same subword logic as training).

    Args:
        sentences: list of dicts with ``"tokens"`` and ``"slots"`` lists.
        tokenizer: callable returning an encoding object that exposes
            ``word_ids(batch_index=...)``.
        label2id: mapping from slot label string to integer id. Must contain
            the ``I-xxx`` form of any ``B-xxx`` label that can land on a
            continuation subword, otherwise the lookup raises ``KeyError``.
        max_len: value passed to the tokenizer as ``max_length``.

    Returns:
        ``(encodings, labels)`` where ``labels`` is a ``torch.Tensor`` built
        from one row of label ids per sentence.
    """
    word_batches = []
    slot_batches = []
    for sentence in sentences:
        word_batches.append(sentence["tokens"])
        slot_batches.append(sentence["slots"])

    encodings = tokenizer(
        word_batches,
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_attention_mask=True,
    )

    def _align(word_ids, slots):
        # Map one sentence's word ids onto label ids, position by position.
        aligned = []
        last_word = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned.append(-100)
                continue
            tag = slots[word_idx]
            # SAME SUBWORD LOGIC AS TRAINING: a continuation piece of a
            # B-tagged word is labelled with the matching I- tag.
            if word_idx == last_word and tag.startswith("B-"):
                tag = "I-" + tag[2:]
            aligned.append(label2id[tag])
            last_word = word_idx
        return aligned

    label_rows = [
        _align(encodings.word_ids(batch_index=row), slots)
        for row, slots in enumerate(slot_batches)
    ]
    return encodings, torch.tensor(label_rows)

# Tokenize the test sentences and align their slot labels to subword positions
# (uses build_encodings' default max_len of 64)
test_encodings, test_labels = build_encodings(test_sentences, tokenizer, label2id)

In [None]:
# Build Dataset

class NerDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with aligned label rows.

    ``encodings`` must expose ``.items()`` yielding ``(name, per-example
    sequence)`` pairs; ``labels`` must expose ``.shape`` and be indexable by
    example (it is a ``torch.Tensor`` where this file constructs it).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the first dimension of ``labels``."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return one example as a dict of tensors plus its ``"labels"`` row."""
        example = {}
        for field, per_example_values in self.encodings.items():
            example[field] = torch.tensor(per_example_values[idx])
        example["labels"] = self.labels[idx]
        return example

# Wrap the encodings and aligned labels so they can be indexed per example
test_dataset = NerDataset(test_encodings, test_labels)

In [None]:
# Decode predictions back to labels

def decode_predictions(encodings, labels, pred_ids, id2label):
    """Turn gold and predicted label ids into per-sentence label-string lists.

    A position is kept only when its word id is not ``None`` and its gold label
    is not ``-100``. Continuation subwords are therefore included whenever they
    carry a real gold label, so the returned sequences can be subword-level
    rather than word-level.

    Args:
        encodings: object exposing ``word_ids(batch_index=...)``.
        labels: 2-D array-like of gold label ids whose elements support
            ``.item()`` (e.g. a ``torch.Tensor``).
        pred_ids: 2-D array-like of predicted label ids, indexed the same way
            as ``labels``.
        id2label: mapping from integer id to label string.

    Returns:
        ``(y_true, y_pred)``: two lists of lists of label strings, one inner
        list per sentence, aligned position by position.
    """
    y_true, y_pred = [], []

    for row in range(len(labels)):
        gold_ids = labels[row]
        guess_ids = pred_ids[row]

        # Positions that belong to a real word and carry a real gold label.
        kept = [
            pos
            for pos, word_idx in enumerate(encodings.word_ids(batch_index=row))
            if word_idx is not None and gold_ids[pos].item() != -100
        ]

        y_true.append([id2label[int(gold_ids[pos])] for pos in kept])
        y_pred.append([id2label[int(guess_ids[pos])] for pos in kept])

    return y_true, y_pred

In [None]:
# Run predictions

trainer = Trainer(model=model)

# predict() output exposes the model scores under .predictions; the argmax
# over the last axis picks one label id per position.
preds = trainer.predict(test_dataset)
logits = preds.predictions
pred_ids = np.argmax(logits, axis=-1)

# Map ids back to label strings, skipping positions without a gold label
y_true, y_pred = decode_predictions(test_encodings, test_labels, pred_ids, id2label)

# Final SEQEVAL evaluation

print("\n===== SEQEVAL NER REPORT =====")
report = classification_report(y_true, y_pred)
print(report)
entity_f1 = f1_score(y_true, y_pred)
print("Entity-Level F1:", entity_f1)




===== SEQEVAL NER REPORT =====
                            precision    recall  f1-score   support

                     album       0.00      0.00      0.00        13
                    artist       0.93      0.95      0.94       109
               best_rating       1.00      1.00      1.00        51
                      city       0.93      0.92      0.92        71
     condition_description       0.96      1.00      0.98        22
     condition_temperature       1.00      1.00      1.00        21
                   country       0.98      0.95      0.97        44
                   cuisine       1.00      0.91      0.95        11
          current_location       1.00      0.94      0.97        17
               entity_name       0.65      0.72      0.68        18
                  facility       1.00      1.00      1.00         7
                     genre       0.29      0.67      0.40         3
            geographic_poi       1.00      1.00      1.00        16
             lo