# Token Classification

In [1]:
#!pip install transformers datasets seqeval wandb

In [None]:
import numpy as np
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer,
                          DataCollatorForTokenClassification)
from transformers import pipeline
from seqeval.metrics import classification_report

### Example of Token Classification

In [72]:
example_pipeline = pipeline("token-classification", model="dbmdz/bert-large-cased-finetuned-conll03-english")

example_text = "Elon Musk is the CEO of Tesla, which is headquartered in Palo Alto."

output = example_pipeline(example_text)

print("\nToken Classification Output:")
for entity in output:
    print(entity)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



Token Classification Output:
{'entity': 'I-PER', 'score': 0.9996063, 'index': 1, 'word': 'El', 'start': 0, 'end': 2}
{'entity': 'I-PER', 'score': 0.9991404, 'index': 2, 'word': '##on', 'start': 2, 'end': 4}
{'entity': 'I-PER', 'score': 0.9993499, 'index': 3, 'word': 'Mu', 'start': 5, 'end': 7}
{'entity': 'I-PER', 'score': 0.99846965, 'index': 4, 'word': '##sk', 'start': 7, 'end': 9}
{'entity': 'I-ORG', 'score': 0.9976349, 'index': 9, 'word': 'Te', 'start': 24, 'end': 26}
{'entity': 'I-ORG', 'score': 0.9928018, 'index': 10, 'word': '##sla', 'start': 26, 'end': 29}
{'entity': 'I-LOC', 'score': 0.9963245, 'index': 16, 'word': 'Pa', 'start': 57, 'end': 59}
{'entity': 'I-LOC', 'score': 0.9900282, 'index': 17, 'word': '##lo', 'start': 59, 'end': 61}
{'entity': 'I-LOC', 'score': 0.9958049, 'index': 18, 'word': 'Alto', 'start': 62, 'end': 66}


We can use aggregation_strategy="simple" to merge subword tokens into whole words.

In [73]:
example_pipeline = pipeline("token-classification", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

output = example_pipeline(example_text)

print("\nToken Classification Output:")
for entity in output:
    print(entity)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



Token Classification Output:
{'entity_group': 'PER', 'score': 0.9991415, 'word': 'Elon Musk', 'start': 0, 'end': 9}
{'entity_group': 'ORG', 'score': 0.99521834, 'word': 'Tesla', 'start': 24, 'end': 29}
{'entity_group': 'LOC', 'score': 0.9940526, 'word': 'Palo Alto', 'start': 57, 'end': 66}


### Full Token Classification Workflow

1- Dataset Preparation

In [74]:
dataset = load_dataset("conll2003")
label_list = dataset["train"].features["ner_tags"].feature.names
print("Label list:", dataset["train"].features["ner_tags"].feature)

Label list: ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)


2- Tokenizer Initialization

In [75]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

3- Data Preprocessing

In [76]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word = None
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # Special tokens get -100 label
            elif word_id != previous_word:
                label_ids.append(label[word_id])  # First token of the word gets the label
            else:
                label_ids.append(label[word_id])  # Subsequent subwords get the same label
            previous_word = word_id
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

4- Model Loading

In [77]:
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label_list), id2label=id2label, label2id=label2id,)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5- Data Collation & Training Configuration

In [78]:
data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

6- Evaluation Metrics

In [79]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    pred_labels = [[id2label[p] for (p, l) in zip(prediction, label) if l != -100]
                   for prediction, label in zip(predictions, labels)]
    report = classification_report(true_labels, pred_labels, output_dict=True)
    return {
        "f1": report["micro avg"]["f1-score"],
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
    }

7- Model Training & Evaluation

In [80]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate(eval_dataset=tokenized_datasets["test"])

Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.0804,0.065056,0.926906,0.924877,0.928943
2,0.036,0.078396,0.935175,0.937833,0.932532
3,0.0151,0.066233,0.94586,0.944211,0.947515


{'eval_loss': 0.17521657049655914,
 'eval_f1': 0.8811644779134907,
 'eval_precision': 0.8723520320029093,
 'eval_recall': 0.8901567863438167,
 'eval_runtime': 9.3875,
 'eval_samples_per_second': 367.83,
 'eval_steps_per_second': 46.019,
 'epoch': 3.0}

8- Model Saving & Inference

In [84]:
model_path = "./ner_trained_model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

trained_model = AutoModelForTokenClassification.from_pretrained(model_path)
trained_tokenizer = AutoTokenizer.from_pretrained(model_path)

ner_pipeline = pipeline("token-classification", model=trained_model, tokenizer=trained_tokenizer)

output = ner_pipeline("Elon Musk is the CEO of Tesla, which is headquartered in Palo Alto.")
print("\nToken Classification Output:")
for entity in output:
    print(entity)

Device set to use cuda:0



Token Classification Output:
{'entity': 'B-PER', 'score': 0.95738465, 'index': 1, 'word': 'El', 'start': 0, 'end': 2}
{'entity': 'B-PER', 'score': 0.98268753, 'index': 2, 'word': '##on', 'start': 2, 'end': 4}
{'entity': 'I-PER', 'score': 0.9949065, 'index': 3, 'word': 'Mu', 'start': 5, 'end': 7}
{'entity': 'I-PER', 'score': 0.99306446, 'index': 4, 'word': '##sk', 'start': 7, 'end': 9}
{'entity': 'B-ORG', 'score': 0.9387017, 'index': 9, 'word': 'Te', 'start': 24, 'end': 26}
{'entity': 'B-ORG', 'score': 0.87641644, 'index': 10, 'word': '##sla', 'start': 26, 'end': 29}
{'entity': 'B-LOC', 'score': 0.9974934, 'index': 16, 'word': 'Pa', 'start': 57, 'end': 59}
{'entity': 'B-LOC', 'score': 0.9968714, 'index': 17, 'word': '##lo', 'start': 59, 'end': 61}
{'entity': 'I-LOC', 'score': 0.9965964, 'index': 18, 'word': 'Alto', 'start': 62, 'end': 66}
