<a href="https://colab.research.google.com/github/rickygrosvenor-pramanick/finetuning/blob/main/complete-finetuning/supervised_finetuning_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
!curl -L -o conll2003.zip https://data.deepai.org/conll2003.zip
!unzip -q conll2003.zip -d conll2003
!ls conll2003

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  959k  100  959k    0     0  3961k      0 --:--:-- --:--:-- --:--:-- 3966k
metadata  test.txt  train.txt  valid.txt


In [3]:
import numpy as np
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments
)
from seqeval.metrics import classification_report

MODEL_NAME = "bert-base-uncased"
OUTPUT_DIR = "./bert-ner"

In [4]:
from datasets import Dataset, DatasetDict

def read_conll(path):
    tokens_list, tags_list = [], []
    tokens, tags = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    tokens_list.append(tokens)
                    tags_list.append(tags)
                    tokens, tags = [], []
            else:
                parts = line.split()
                # SKIP the doc‑start markers
                if parts[0] == "-DOCSTART-":
                    continue
                tokens.append(parts[0])
                tags.append(parts[-1])
        if tokens:                          # catch last sentence
            tokens_list.append(tokens)
            tags_list.append(tags)
    return tokens_list, tags_list

# point these at your actual Colab paths
train_tokens, train_tags = read_conll("conll2003/train.txt")
val_tokens,   val_tags   = read_conll("conll2003/valid.txt")
test_tokens,  test_tags  = read_conll("conll2003/test.txt")

train_ds = Dataset.from_dict({"tokens": train_tokens, "ner_tags": train_tags})
val_ds   = Dataset.from_dict({"tokens": val_tokens,   "ner_tags": val_tags})
test_ds  = Dataset.from_dict({"tokens": test_tokens,  "ner_tags": test_tags})

raw_datasets = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds
})


In [5]:
train_ds[1]

{'tokens': ['Peter', 'Blackburn'], 'ner_tags': ['B-PER', 'I-PER']}

In [6]:
# Build a Label Map
from collections import Counter

# flatten all tags to get the unique set in train
all_tags = [tag for seq in raw_datasets["train"]["ner_tags"] for tag in seq]
unique_tags = list(Counter(all_tags).keys())

# mapping each tag to a particular class (number)
label2id = {tag: i for i, tag in enumerate(unique_tags)}
id2label = {i: tag for tag, i in label2id.items()}
num_labels = len(unique_tags)

print("Tags:", unique_tags)
print("Num labels:", num_labels)

Tags: ['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']
Num labels: 9


In [7]:
from transformers import BertTokenizerFast, BertForTokenClassification

MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model     = BertForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def tokenize_and_align_labels(examples):
    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128
    )
    aligned_labels = []
    for i, label_seq in enumerate(examples["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label_seq[word_idx]])
            else:
                # you can also use -100 here to ignore subtokens
                label_ids.append(label2id[label_seq[word_idx]])
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)
    tokenized["labels"] = aligned_labels
    return tokenized

# apply to all splits
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "ner_tags"]
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [9]:
ex = tokenized_datasets["train"][0]
words = tokenizer.convert_ids_to_tokens(ex["input_ids"])
labels = [id2label[l] if l != -100 else "-" for l in ex["labels"]]
print(list(zip(words, labels)))

[('[CLS]', '-'), ('eu', 'B-ORG'), ('rejects', 'O'), ('german', 'B-MISC'), ('call', 'O'), ('to', 'O'), ('boycott', 'O'), ('british', 'B-MISC'), ('lamb', 'O'), ('.', 'O'), ('[SEP]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-'), ('[PAD]', '-')

In [16]:
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
import numpy as np
from seqeval.metrics import classification_report

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="./bert-ner",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    no_cuda=False,
    report_to="none" # Disable Weights & Biases logging
)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids
    true_preds, true_labels = [], []
    for pred_seq, label_seq in zip(preds, labels):
        seq_preds, seq_labels = [], []
        for p_id, l_id in zip(pred_seq, label_seq):
            if l_id != -100:
                seq_preds.append(id2label[p_id])
                seq_labels.append(id2label[l_id])
        true_preds.append(seq_preds)
        true_labels.append(seq_labels)
    report = classification_report(true_labels, true_preds, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall":    report["weighted avg"]["recall"],
        "f1":        report["weighted avg"]["f1-score"]
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [17]:
trainer.train()
metrics = trainer.evaluate()
print(metrics)

Step,Training Loss
10,1.0708
20,0.5276
30,0.3884
40,0.2986
50,0.2675
60,0.2921
70,0.2023
80,0.161
90,0.1821
100,0.1358


{'eval_loss': 0.05781187862157822, 'eval_precision': 0.9421438045442502, 'eval_recall': 0.9492948287441235, 'eval_f1': 0.9456556187039571, 'eval_runtime': 25.5184, 'eval_samples_per_second': 127.359, 'eval_steps_per_second': 7.994, 'epoch': 3.0}


In [18]:
import torch

# 1. Prep device and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# 2. Tokenize and keep the BatchEncoding
sentence = ["Barack", "Obama", "visited", "Toronto", "."]
encoding = tokenizer(
    sentence,
    is_split_into_words=True,
    return_tensors="pt",
    padding=True
)

# 3. Extract word_ids *before* converting to plain dict
word_ids = encoding.word_ids(batch_index=0)

# 4. Move everything to the GPU
encoding = {k: v.to(device) for k, v in encoding.items()}

# 5. Forward pass
with torch.no_grad():
    logits = model(**encoding).logits

# 6. Get predicted class IDs
pred_ids = logits.argmax(-1).squeeze().tolist()

# 7. Map back to (word, tag)
results = []
for idx, wid in enumerate(word_ids):
    if wid is not None:
        results.append(( sentence[wid], id2label[pred_ids[idx]] ))

print(results)
# → [('Barack','B-PER'), ('Obama','I-PER'), ('visited','O'), …]


[('Barack', 'B-PER'), ('Obama', 'I-PER'), ('visited', 'O'), ('Toronto', 'B-LOC'), ('.', 'O')]
