# Token classification (PyTorch)

Install the Datasets, Evaluate, Transformers and Accelerate libraries to run this notebook.

In [1]:
!pip install -q datasets evaluate transformers[sentencepiece]
!pip install -q accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/536.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m266.2/536.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

---

To run the training on a TPU, you will need to uncomment and run the following line:

In [2]:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

If running this notebook in Colab, please ensure that your Hugging Face `HF_TOKEN` is added to your Colab secrets.

Alternatively, please login to Hugging Face by running the following cell.

In [3]:
# !huggingface-cli login

In [4]:
import os
import random
import numpy as np
import torch

def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [5]:
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

---

We shall use the CoNLL-2003 (Conference on Computational Natural Language Learning) dataset.

Dataset page: https://huggingface.co/datasets/conll2003

In [6]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")
raw_datasets

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [7]:
example0 = raw_datasets['train'][0]
for k, v in example0.items():
    print(f"{k}: {v}")

id: 0
tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
pos_tags: [22, 42, 16, 21, 35, 37, 16, 21, 7]
chunk_tags: [11, 21, 11, 12, 21, 22, 11, 12, 0]
ner_tags: [3, 0, 7, 0, 0, 0, 7, 0, 0]


In [8]:
len(example0['tokens']), len(example0['pos_tags']), len(example0['chunk_tags']), len(example0['ner_tags'])

(9, 9, 9, 9)

In [9]:
features = raw_datasets['train'].features
features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [10]:
ner_feature = features['ner_tags']
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

**Note:** `ner_feature` is a `Sequence` object with a `ClassLabel` object inside it. In other words, it's a sequence of class labels.

In [11]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [12]:
words = example0['tokens']
labels = example0['ner_tags']
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


In [13]:
example4 = raw_datasets['train'][4]
for k, v in example4.items():
    print(f"{k}: {v}")

id: 4
tokens: ['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']
pos_tags: [22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]
chunk_tags: [11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 12, 21, 13, 11, 12, 21, 22, 11, 13, 11, 1, 13, 11, 17, 11, 12, 12, 21, 1, 0]
ner_tags: [5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]


In [14]:
words = example4['tokens']
labels = example4['ner_tags']
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
B-LOC   O  O              O  O   B-ORG    I-ORG O  O          O         B-PER  I-PER     O    O  O         O         O      O   O         O    O         O     O    B-LOC   O     O   O          O      O   O       O 


In [15]:
raw_datasets = raw_datasets.remove_columns(['id', 'pos_tags', 'chunk_tags'])
raw_datasets = raw_datasets.rename_column('tokens', 'words')
raw_datasets = raw_datasets.rename_column('ner_tags', 'labels')
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['words', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['words', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['words', 'labels'],
        num_rows: 3453
    })
})

In [16]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.is_fast

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

True

In [17]:
tokenizer.model_max_length

512

In [18]:
example0 = raw_datasets['train'][0]
for k, v in example0.items():
    print(f"{k}: {v}")

words: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
labels: [3, 0, 7, 0, 0, 0, 7, 0, 0]


In [19]:
inputs = tokenizer(example0['words'], is_split_into_words=True)
inputs

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
type(inputs)

**Note:** An example in a Hugging Face `Dataset` (whether raw or tokenized) is a dict. But two things are `BatchEncoding` objects:

- The return value of any fast tokenizer (as we see above).
- A batch obtained from a data loader.

In [21]:
print(inputs.tokens())

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']


Alt:

In [22]:
print(tokenizer.convert_ids_to_tokens(inputs['input_ids']))

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']


In [23]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

Some context to understand the function below:

In [24]:
len(example0['labels']), len(inputs.word_ids())

(9, 12)

Each label in `raw_datasets` corresponds to a word, whereas each word ID corresponds to a subword.

In [25]:
# Reminder:
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

Notice that the `'B-XXX'` tokens have odd indices.

In [26]:
def align_labels_with_tokens(labels, word_ids):
    """
    This function returns labels corresponding to each subword (for a single example).
    Args:
        - labels: The labels corresponding to each word.
        - word_ids: The word IDs corresponding to each subword.
    """
    new_labels = []
    current_word_id = None
    for word_id in word_ids:
        # This loop goes over each word ID within an example.
        if word_id is None:
            # Special token...
            new_labels.append(-100)
        else:
            if word_id != current_word_id:
                # Start of a new word...
                label = labels[word_id]
                new_labels.append(label)
            else:
                # Continuation of the same word...
                label = labels[word_id]
                if label % 2 == 1:
                    # If the label is 'B-XXX', then change it to 'I-XXX':
                    label += 1
                new_labels.append(label)
        current_word_id = word_id
    return new_labels

**Note:** An entity in `raw_datasets` may be comprised of a `'B-XXX'` word & zero or more `'I-XXX'` words. The `if label % 2 == 1:` block above is ensuring that the labels of tokens that do not begin words are `'I-XXX'` (even if the labels of the associated words are `'B-XXX'`).

In [27]:
print(example0['words'])
print(example0['labels'])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]


In [28]:
# Test:
print(inputs.tokens())
print(align_labels_with_tokens(example0['labels'], inputs.word_ids()))

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


Let's see what a `BatchEncoding` object's `word_ids(idx)` method call returns.

In [29]:
examples = raw_datasets['train'][:2]
for k, v in examples.items():
    print(f"{k}: {v}")

words: [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn']]
labels: [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]


In [30]:
encoding = tokenizer(examples['words'], truncation=True, max_length=tokenizer.model_max_length, is_split_into_words=True)
encoding

{'input_ids': [[101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], [101, 1943, 14428, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]]}

In [31]:
encoding.word_ids(0)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [32]:
encoding.word_ids(1)

[None, 0, 1, None]

In [33]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['words'], truncation=True, max_length=tokenizer.model_max_length, is_split_into_words=True)
    all_labels = examples['labels']
    new_labels = []
    for i, labels in enumerate(all_labels):
        # This loop goes over each of the examples.
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs['labels'] = new_labels
    return tokenized_inputs

In [34]:
# Test:
tokenize_and_align_labels(examples)

{'input_ids': [[101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], [101, 1943, 14428, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100], [-100, 1, 2, -100]]}

In [35]:
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True, remove_columns=['words'])
tokenized_datasets

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3453
    })
})

In [36]:
example0 = tokenized_datasets['train'][0]
for k, v in example0.items():
    print(f"{k}: {v}")

labels: [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
input_ids: [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102]
token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


---

In [37]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [38]:
[tokenized_datasets['train'][i] for i in range(2)]

[{'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100],
  'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'labels': [-100, 1, 2, -100],
  'input_ids': [101, 1943, 14428, 102],
  'token_type_ids': [0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1]}]

In [39]:
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [40]:
batch['labels']

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [41]:
tokenizer.batch_decode(batch['input_ids'])

['[CLS] EU rejects German call to boycott British lamb. [SEP]',
 '[CLS] Peter Blackburn [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

In [42]:
for i in range(2):
    print(tokenized_datasets['train'][i]['labels'])

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]


In [43]:
!pip install -q seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [44]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [45]:
labels = raw_datasets['train'][0]['labels']
labels = [label_names[l] for l in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [46]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

Notice that the `predictions` and `references` arguments are both lists of lists. In other words, they are batches.

In [47]:
import numpy as np

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove special tokens and convert to string labels:
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels] # Here, `labels` refers to a batch of labels.
    true_predictions = [
        [label_names[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        'precision': all_metrics['overall_precision'],
        'recall': all_metrics['overall_recall'],
        'f1': all_metrics['overall_f1'],
        'accuracy': all_metrics['overall_accuracy']
    }

**Note:**

- Shape of logits: (batch size, sequence length, number of classes).
- Shape of predictions: (batch size, sequence length).
- Shape of labels: (batch size, sequence length).

For the sake of understanding, let's say we have a batch size of 2, and our arrays look like the following:

In [48]:
predictions = np.array(
    [[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
     [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]]
)
labels = np.array(
    [[-100,    3,    0,    0,    0,    0,    0,    7,    0,    0,    0, -100],
     [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]]
)

Then:

In [49]:
for prediction, label in zip(predictions, labels):
    print("First level zip:")
    print(prediction)
    print(label)
    print("Second level zip:")
    for p, l in zip(prediction, label):
        print(f"{p}, {l}")
    print("---")

First level zip:
[-100    3    0    7    0    0    0    7    0    0    0 -100]
[-100    3    0    0    0    0    0    7    0    0    0 -100]
Second level zip:
-100, -100
3, 3
0, 0
7, 0
0, 0
0, 0
0, 0
7, 7
0, 0
0, 0
0, 0
-100, -100
---
First level zip:
[-100    1    2 -100 -100 -100 -100 -100 -100 -100 -100 -100]
[-100    1    2 -100 -100 -100 -100 -100 -100 -100 -100 -100]
Second level zip:
-100, -100
1, 1
2, 2
-100, -100
-100, -100
-100, -100
-100, -100
-100, -100
-100, -100
-100, -100
-100, -100
-100, -100
---


This explains the nested list comprehension used to derive `true_predictions`.

**Note:** The above function computes the metrics at a token level (not at a word level), leaving out special tokens. (However, the example above the `compute_metrics` function definition computes the metrics at a word level!) In most projects, it will probably make more sense to compute the metrics at a word level. This is one more reason to assign `-100` to subsequent subword tokens that do not begin words! (Doing so will filter them out from the metric computation.)

In [50]:
id2label = {id: label for id, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [51]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)
model

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [52]:
# Just for comparison:
from transformers import AutoModelForSequenceClassification

model2 = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=9)
model2

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

The only visible difference between the two models is the `BertPooler` inside `BertModel` in `model2`. However, there may be some other differences in the `forward` methods of the models.

In [53]:
model.config.num_labels

9

In [54]:
!pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.5/258.5 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [55]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [56]:
import wandb
import os

os.environ['WANDB_PROJECT'] = "bert-base-cased-finetuned-conll2003-ner"

In [57]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="bert-base-cased-finetuned-conll2003-ner",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",
    logging_strategy="steps",
    logging_steps=1,
    push_to_hub=True
)

**Note:** Since `evaluation_strategy` is set to `"epoch"`, the `Trainer` will log the validation set metrics at the end of each epoch. (It won't have access to the validation set metrics before the epoch is completed.) However, it will log the training loss at the end of each step. If we want to log the training loss at the end of each epoch, we can specify `logging_strategy="epoch"` and get rid of the `logging_steps=1` argument.

From the <a href="https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/trainer#transformers.TrainingArguments.logging_steps" target="_blank">documentation</a> of `TrainingArguments`:

- **logging_steps** (`int` or `float`, *optional*, defaults to `500`) — Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in range [0,1). If smaller than 1, will be interpreted as ratio of total training steps.

In [58]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [59]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msadhaklal[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0125,0.072899,0.909523,0.933861,0.921531,0.980985
2,0.0001,0.055781,0.926529,0.94867,0.937469,0.986239
3,0.0001,0.057843,0.936558,0.951531,0.943985,0.986696


TrainOutput(global_step=5268, training_loss=0.06704907628345602, metrics={'train_runtime': 596.568, 'train_samples_per_second': 70.609, 'train_steps_per_second': 8.831, 'total_flos': 921792849708600.0, 'train_loss': 0.06704907628345602, 'epoch': 3.0})

In [60]:
wandb.finish()

VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▇█
eval/f1,▁▆█
eval/loss,█▁▂
eval/precision,▁▅█
eval/recall,▁▇█
eval/runtime,▁▇█
eval/samples_per_second,█▂▁
eval/steps_per_second,█▂▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.9867
eval/f1,0.94399
eval/loss,0.05784
eval/precision,0.93656
eval/recall,0.95153
eval/runtime,10.3099
eval/samples_per_second,315.232
eval/steps_per_second,39.477
train/epoch,3.0
train/global_step,5268.0


In [61]:
trainer.push_to_hub(commit_message="Training complete")

CommitInfo(commit_url='https://huggingface.co/sadhaklal/bert-base-cased-finetuned-conll2003-ner/commit/79131fea4864f1d2faf8220921bd78751dd3a35a', commit_message='Training complete', commit_description='', oid='79131fea4864f1d2faf8220921bd78751dd3a35a', pr_url=None, pr_revision=None, pr_num=None)

---

In [62]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    # Remove special tokens and convert to string labels:
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_predictions, true_labels

**Questions:**

1. Since we use the `torch.no_grad()` context manager for model evaluation, is there a need to call the `detach` method?
2. Similarly, are there specific reasons for calling the `cpu`, `clone` and `numpy` methods? Or is the notebook author just being extra careful?

In [63]:
from accelerate import Accelerator
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

def training_function():
    accelerator = Accelerator()
    repo_name = "bert-base-cased-finetuned-conll2003-ner-v2"
    wandb_config = {
        'batch_size': 8,
        'learning_rate': 2e-5,
        'num_epochs': 3,
        'lr_scheduler_type': "linear",
        'num_warmup_steps': 0
    }
    wandb.init(
        project=repo_name,
        config=wandb_config,
        notes="Logging min of `train_loss` & `eval_loss`, and max of `precision`, `recall`, `F1` and `accuracy`."
    )
    wandb.define_metric("train_loss", summary="min")
    wandb.define_metric("eval_loss", summary="min")
    wandb.define_metric("precision", summary="max")
    wandb.define_metric("recall", summary="max")
    wandb.define_metric("F1", summary="max")
    wandb.define_metric("accuracy", summary="max")

    train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=wandb_config['batch_size'], collate_fn=data_collator)
    eval_dataloader = DataLoader(tokenized_datasets['validation'], shuffle=False, batch_size=wandb_config['batch_size'], collate_fn=data_collator)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)
    model.to(accelerator.device) # Optional.
    optimizer = AdamW(model.parameters(), lr=wandb_config['learning_rate'])
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

    num_update_steps_per_epoch = len(train_dataloader) # Note: We should ALWAYS do this length calculation after the `accelerator.prepare()` method call.
    num_training_steps = wandb_config['num_epochs'] * num_update_steps_per_epoch
    lr_scheduler = get_scheduler(
        wandb_config['lr_scheduler_type'],
        optimizer=optimizer,
        num_warmup_steps=wandb_config['num_warmup_steps'],
        num_training_steps=num_training_steps
    )

    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(wandb_config['num_epochs']):
        model.train()
        train_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(accelerator.device) for k, v in batch.items()} # Optional.
            output = model(**batch)
            loss = output.loss
            train_loss += loss.item()
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
        train_loss /= len(tokenized_datasets['train'])
        train_loss = round(train_loss, 4)

        model.eval()
        eval_loss = 0
        metric = evaluate.load("seqeval")
        for batch in eval_dataloader:
            batch = {k: v.to(accelerator.device) for k, v in batch.items()} # Optional.
            with torch.no_grad():
                output = model(**batch)
            loss = output.loss
            eval_loss += loss.item()
            predictions = output.logits.argmax(dim=-1)
            labels = batch['labels']

            predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
            labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

            predictions_gathered = accelerator.gather(predictions)
            labels_gathered = accelerator.gather(labels)

            true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
            metric.add_batch(predictions=true_predictions, references=true_labels)
        eval_loss /= len(tokenized_datasets['validation'])
        eval_loss = round(eval_loss, 4)
        results = metric.compute()
        precision = results['overall_precision']
        recall = results['overall_recall']
        f1 = results['overall_f1']
        accuracy = results['overall_accuracy']
        print(f"Epoch: {epoch}, Training Loss: {train_loss}, Evaluation Loss: {eval_loss}, Precision: {precision}, Recall: {recall}, F1: {f1}, Accuracy: {accuracy}")

        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            wandb.log({'train_loss': train_loss, 'eval_loss': eval_loss, 'precision': precision, 'recall': recall, 'F1': f1, 'accuracy': accuracy})

            print("Pushing model...")
            unwrapped_model.push_to_hub(repo_name, commit_message=f"epoch: {epoch}, f1: {f1}, accuracy: {accuracy}")
        print("---")
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        wandb.finish()
        print("---")
        print("Pushing tokenizer...")
        tokenizer.push_to_hub(repo_name, commit_message="pushing tokenizer")
    print("Done!")

In [64]:
from accelerate import notebook_launcher

notebook_launcher(training_function)

Launching training on one GPU.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112721622221973, max=1.0…

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5268 [00:00<?, ?it/s]

Epoch: 0, Training Loss: 0.0175, Evaluation Loss: 0.0105, Precision: 0.9060700773153479, Recall: 0.9269606193200942, F1: 0.9163963064636886, Accuracy: 0.9790133631600636
Pushing model...


model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

---
Epoch: 1, Training Loss: 0.0051, Evaluation Loss: 0.0066, Precision: 0.9211822660098522, Recall: 0.9441265567149109, F1: 0.9325132978723404, Accuracy: 0.9857685288750221
Pushing model...


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

---
Epoch: 2, Training Loss: 0.0026, Evaluation Loss: 0.0068, Precision: 0.9242772667542707, Recall: 0.9469875462807136, F1: 0.9354945968412303, Accuracy: 0.9858421145581916
Pushing model...


model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

---


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
F1,▁▇█
accuracy,▁██
eval_loss,█▁▁
precision,▁▇█
recall,▁▇█
train_loss,█▂▁


---
Pushing tokenizer...
Done!


---

In [65]:
from transformers import pipeline

model_checkpoint = "sadhaklal/bert-base-cased-finetuned-conll2003-ner-v2"
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [66]:
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

[{'entity_group': 'PER',
  'score': 0.9942076,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9840309,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9981262,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]