# Mini Project — Fine-Tuning NER on a Custom Small Dataset

## Create a Small Custom Dataset

In [1]:
dataset = [
    {
        "tokens": ["Iron", "Man", "is", "played", "by", "Robert", "Downey", "Jr."],
        "ner_tags": ["B-CHARACTER", "I-CHARACTER", "O", "O", "O", "B-ACTOR", "I-ACTOR", "I-ACTOR"]
    },
    {
        "tokens": ["Batman", "appears", "in", "The", "Dark", "Knight"],
        "ner_tags": ["B-CHARACTER", "O", "O", "B-MOVIE", "I-MOVIE", "I-MOVIE"]
    }
]


In [9]:
from datasets import Dataset

raw_dataset = Dataset.from_list(dataset)


In [10]:
tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=False
)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

# Convert Labels to IDs

In [2]:
label_list = [
    "O",
    "B-CHARACTER", "I-CHARACTER",
    "B-ACTOR", "I-ACTOR",
    "B-MOVIE", "I-MOVIE"
]

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# Load Pretrained Model & Tokenizer

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer
)


# Tokenization + Label Alignment (Most Important Part)

In [5]:
def tokenize_and_align_labels(example):
    tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True
    )

    labels = []
    word_ids = tokenized.word_ids()

    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)   # ignored in loss
        elif word_idx != previous_word_idx:
            labels.append(label2id[example["ner_tags"][word_idx]])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized["labels"] = labels
    return tokenized


# Training Setup (Trainer API)

In [6]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
     output_dir="./ner_model",
    per_device_train_batch_size=8,
    num_train_epochs=5,
    logging_steps=10,
    save_strategy="no"

)

# Train the Model

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()


Step,Training Loss


TrainOutput(global_step=5, training_loss=1.2880943298339844, metrics={'train_runtime': 18.7012, 'train_samples_per_second': 0.535, 'train_steps_per_second': 0.267, 'total_flos': 61244195760.0, 'train_loss': 1.2880943298339844, 'epoch': 5.0})

# Final Output

In [14]:
from transformers import pipeline

ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

text = "Iron Man is played by Robert Downey Jr."

ner_pipeline(text)


Device set to use cpu


[{'entity_group': 'CHARACTER',
  'score': np.float32(0.28912336),
  'word': 'Iron',
  'start': 0,
  'end': 4},
 {'entity_group': 'ACTOR',
  'score': np.float32(0.44155085),
  'word': 'Robert Downey Jr',
  'start': 22,
  'end': 38}]