# Mini Project — Fine-Tuning NER on a Custom Small Dataset

# IMPORTS

In [1]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    pipeline
)

# Custom NER Dataset


In [2]:
dataset = [
    {
        "tokens": ["Iron", "Man", "is", "played", "by", "Robert", "Downey", "Jr."],
        "ner_tags": [
            "B-CHARACTER", "I-CHARACTER", "O", "O", "O",
            "B-ACTOR", "I-ACTOR", "I-ACTOR"
        ]
    },
    {
        "tokens": ["Batman", "appears", "in", "The", "Dark", "Knight"],
        "ner_tags": [
            "B-CHARACTER", "O", "O",
            "B-MOVIE", "I-MOVIE", "I-MOVIE"
        ]
    }
]

# Label Mapping

In [3]:
label_list = [
    "O",
    "B-CHARACTER", "I-CHARACTER",
    "B-ACTOR", "I-ACTOR",
    "B-MOVIE", "I-MOVIE"
]

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# Load Tokenizer & Model

In [4]:
model_name = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokenization + Label Alignment

In [5]:
def tokenize_and_align_labels(example):
    tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True
    )

    labels = []
    word_ids = tokenized.word_ids()
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(label2id[example["ner_tags"][word_idx]])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized["labels"] = labels
    return tokenized

# Convert to HF Dataset

In [6]:
raw_dataset = Dataset.from_list(dataset)

tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=False
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

# Data Collator (CRITICAL)

In [7]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer
)


# Training Arguments

In [8]:
training_args = TrainingArguments(
    output_dir="./ner_model",
    per_device_train_batch_size=8,
    num_train_epochs=5,
    logging_steps=5,
    save_strategy="no",
    report_to="none"   # disables wandb
)


# Trainer

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)


# Train

In [11]:
trainer.train()



Step,Training Loss
5,1.3236


TrainOutput(global_step=5, training_loss=1.3235636711120606, metrics={'train_runtime': 5.5083, 'train_samples_per_second': 1.815, 'train_steps_per_second': 0.908, 'total_flos': 61244195760.0, 'train_loss': 1.3235636711120606, 'epoch': 5.0})

# Inference Test

In [12]:
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

text = "Iron Man is played by Robert Downey Jr."
results = ner_pipeline(text)

for r in results:
    print(f"{r['word']} → {r['entity_group']}")

Device set to use cpu


Iron Man → CHARACTER
Robert Down → ACTOR
Jr → ACTOR
