In [1]:
%pip install transformers datasets seqeval torch evaluate


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import datasets
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
import torch

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Load the CoNLL-2003 dataset; every later cell reads its splits from `raw_datasets`.
raw_datasets = load_dataset("conll2003", trust_remote_code=True)
# Inspect features
print(raw_datasets["train"].features)
# Feature columns: 'id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'


{'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None), 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}


In [4]:
# Tag names of the `ner_tags` feature; later cells index this list with integer label ids
# to turn them back into tag strings.
label_list = raw_datasets["train"].features["ner_tags"].feature.names
# Size of the classification head requested when the model is created.
num_labels = len(label_list)
print(label_list)  # e.g. ['O', 'B-PER', 'I-PER', …]


['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [5]:
model_name = "roberta-base"
# add_prefix_space=True because the tokenizer is later called with
# is_split_into_words=True (RoBERTa's tokenizer expects this flag for pre-split input).
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Record the tag names in the model config so the checkpoint saved later
# carries the CoNLL tag names for each class id and stays usable without
# this notebook's `label_list`.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level NER tags.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of integer tag ids per
            sentence, parallel to the words).

    Returns:
        The output of the global ``tokenizer`` with an added ``"labels"``
        entry. Per sentence, the first sub-token of each word carries that
        word's tag id; special tokens and all further sub-tokens of a word
        carry -100 so the loss ignores them.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    def _labels_for(word_tags, token_word_ids):
        # Pair each sub-token's word index with that of the sub-token before it
        # (None for the very first position) to detect where a new word starts.
        preceding = [None] + token_word_ids[:-1]
        return [
            word_tags[current]
            if current is not None and current != before
            else -100
            for current, before in zip(token_word_ids, preceding)
        ]

    encoded["labels"] = [
        _labels_for(word_tags, list(encoded.word_ids(batch_index=row)))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Apply to all splits
# batched=True hands tokenize_and_align_labels a batch of examples per call.
# remove_columns drops the original columns, leaving what the function returns
# (the tokenizer outputs plus "labels").
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)


Map: 100%|██████████| 3250/3250 [00:00<00:00, 16383.02 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 16383.02 examples/s]


In [7]:
# Batch collator for token classification, built from the tokenizer and handed to the Trainer later.
data_collator = DataCollatorForTokenClassification(tokenizer)


In [8]:
# seqeval metric object; compute_metrics calls metric.compute() on tag-string sequences.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Convert raw predictions into overall seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. The arg-max over axis 2 of
            ``predictions`` is taken as the predicted label id for each
            position; ``labels`` holds the gold ids, with -100 marking
            positions to skip.

    Returns:
        Dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``
        taken from the ``overall_*`` entries of ``metric.compute``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Gold tag strings, skipping the ignored (-100) positions.
    reference_tags = []
    for gold_row in gold_ids:
        reference_tags.append(
            [label_list[gold] for gold in gold_row if gold != -100]
        )

    # Predicted tag strings at exactly the positions that have a gold label.
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = []
        for predicted, gold in zip(predicted_row, gold_row):
            if gold != -100:
                kept.append(label_list[predicted])
        predicted_tags.append(kept)

    report = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {
        "precision": report["overall_precision"],
        "recall": report["overall_recall"],
        "f1": report["overall_f1"],
        "accuracy": report["overall_accuracy"],
    }


In [10]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./ner-model",
    logging_dir="./logs",
    logging_steps=50,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint on the same per-epoch schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the best checkpoint when training ends, ranked by the "f1"
    # value that compute_metrics reports.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


In [11]:
# Wire the model, data splits, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers the warning
    # shown under this cell); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune; with the arguments above, evaluation runs after every epoch.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0473,0.047193,0.927635,0.940592,0.934069,0.988922
2,0.0206,0.037413,0.94985,0.956244,0.953036,0.991725
3,0.0134,0.037072,0.953718,0.960619,0.957156,0.992465


TrainOutput(global_step=2634, training_loss=0.042035400494084545, metrics={'train_runtime': 362.7534, 'train_samples_per_second': 116.12, 'train_steps_per_second': 7.261, 'total_flos': 1022948654606748.0, 'train_loss': 0.042035400494084545, 'epoch': 3.0})

In [12]:
# Re-run evaluation on the Trainer's eval_dataset (the validation split) and show the scores.
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.03707153722643852, 'eval_precision': 0.953717627401838, 'eval_recall': 0.9606193200942443, 'eval_f1': 0.9571560325312317, 'eval_accuracy': 0.9924652466804252, 'eval_runtime': 6.6059, 'eval_samples_per_second': 491.988, 'eval_steps_per_second': 30.882, 'epoch': 3.0}


In [13]:
# Save the fine-tuned model and tokenizer
# Both are written to the same directory so they can be reloaded together.
model_save_path = "./fine-tuned-ner-model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

Model and tokenizer saved to ./fine-tuned-ner-model


In [15]:
def ner_predict(sentence: str):
    """Tag each whitespace-separated word of ``sentence`` with an NER label.

    The sentence is split on whitespace and run through the notebook's global
    ``tokenizer`` and ``model``. Each word receives the label predicted for
    its first sub-token, looked up in the global ``label_list``.

    Args:
        sentence: Raw text. Words are delimited by whitespace only, so
            punctuation must already be separated (e.g. ``"Hawaii ."``).

    Returns:
        A list of ``(word, label)`` tuples in the original word order. An
        empty or whitespace-only sentence yields ``[]``. Words that have no
        sub-token in the encoded input (for example because the input was
        truncated) fall back to the label ``"O"``.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to tag; skip the tokenizer and the model entirely.
        return []

    encoded = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,  # keep over-long inputs within the model's length limit
        return_tensors="pt",
    )

    # word_ids() lives on the encoding object, so read it before the tensors
    # are copied into a plain dict.
    word_ids = encoded.word_ids()
    model_inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Run in eval mode so dropout cannot perturb the predictions, then restore
    # whatever mode the model was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**model_inputs).logits
    finally:
        if was_training:
            model.train()

    # Single-sentence batch: take row 0 of the per-token arg-max.
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()

    # Keep the prediction of the first sub-token of every word; special
    # tokens have word index None and are skipped.
    word_labels = {}
    for token_pos, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx not in word_labels:
            word_labels[word_idx] = label_list[predicted_ids[token_pos]]

    return [(word, word_labels.get(i, "O")) for i, word in enumerate(tokens)]

# Smoke test. The final "." is separated by a space because ner_predict splits on whitespace only.
test_sentence = "Barack Obama was born in Hawaii ."
predicted_entities = ner_predict(test_sentence)
print(f"Sentence: {test_sentence}")
print(f"Predicted entities: {predicted_entities}")
# Expected output format: e.g. [('Barack', 'B-PER'), ('Obama', 'I-PER'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('Hawaii', 'B-LOC'), ('.', 'O')]

Sentence: Barack Obama was born in Hawaii .
Predicted entities: [('Barack', 'B-PER'), ('Obama', 'I-PER'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('Hawaii', 'B-LOC'), ('.', 'O')]
