## install dependencies

In [14]:
%pip install seqeval
%pip install transformers
%pip install datasets
%pip install numpy
%pip install evaluate

/usr/bin/fish: /home/r4jmund/soft/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /usr/bin/fish)


Note: you may need to restart the kernel to use updated packages.
/usr/bin/fish: /home/r4jmund/soft/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /usr/bin/fish)
Note: you may need to restart the kernel to use updated packages.
/usr/bin/fish: /home/r4jmund/soft/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /usr/bin/fish)
Note: you may need to restart the kernel to use updated packages.
/usr/bin/fish: /home/r4jmund/soft/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /usr/bin/fish)
Note: you may need to restart the kernel to use updated packages.
/usr/bin/fish: /home/r4jmund/soft/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /usr/bin/fish)
Note: you may need to restart the kernel to use updated packages.


## Basic imports and parameters

In [15]:
import tensorflow as tf
from pathlib import Path

START_MODEL_CHECKPOINT = "bert-base-cased"
TRAIN_BATCH_SIZE = 10
TRAIN_EPOCHS = 10

BEST_MODEL_CHECKPOINT = Path('./model/best_model.h5')

# Parameters for learning rate - we use linear decay for learning rate
INITIAL_LEARNING_RATE = 1e-5
WEIGHT_DECAY_RATE = 0.01

## GPU config for optimal memory usage

In [16]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)

## Loading dataset

In [17]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

Found cached dataset conll2003 (/home/r4jmund/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 805.46it/s]


### Dataset preview

In [18]:
raw_datasets, raw_datasets["train"][0]["tokens"], raw_datasets["train"][0]["ner_tags"]

(DatasetDict({
     train: Dataset({
         features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
         num_rows: 14041
     })
     validation: Dataset({
         features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
         num_rows: 3250
     })
     test: Dataset({
         features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
         num_rows: 3453
     })
 }),
 ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 [3, 0, 7, 0, 0, 0, 7, 0, 0])

### Print label names

In [19]:
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
ner_feature, label_names

(Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None),
 ['O',
  'B-PER',
  'I-PER',
  'B-ORG',
  'I-ORG',
  'B-LOC',
  'I-LOC',
  'B-MISC',
  'I-MISC'])

### Dataset preview - first record with full label names

In [20]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


## Creating tokenizer

In [21]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(START_MODEL_CHECKPOINT)

### Tokenizer - test

In [22]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

### Aligning labels with tokens - after tokenizer

We use tokenizer for sub-word tokenization, but that means we need to align NER labels accordingly. This is what function below does - it takes labels and word_ids, which is list of word IDs as an argument. word_ids can be obtained from the Dataset class using its method word_ids.

In [23]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

### Aligning labels test

As we can see, after calling align_labels_with_tokens, new labels are aligned with the tokenized inputs.

In [24]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


## Tokenizing and aligning labels on the whole dataset

Here we tokenize and align whole label using previously instantiated tokenizer. We do that using map function for efficient batched data processing.
We automatically truncate the input to the maximum model input size, but padding is being done during training.

In [25]:
def tokenize_and_align_labels(dataset):
    tokenized_inputs = tokenizer(
        dataset["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = dataset["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [26]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Loading cached processed dataset at /home/r4jmund/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-26d057f0f1662018.arrow
Loading cached processed dataset at /home/r4jmund/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-c87b2ef1e5f67e1d.arrow
Loading cached processed dataset at /home/r4jmund/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-ba39b941f4bb174f.arrow


## Instantiating DataCollator for dynamic input padding

In [27]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)

In [28]:
# DataCollator test on the dataset first two rows.

print ("First two rows before padding:")
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

print ("First two rows after padding:")
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
print(batch["labels"])
print ("Works as intended :)")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


First two rows before padding:
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]
First two rows after padding:
tf.Tensor(
[[-100    3    0    7    0    0    0    7    0    0    0 -100]
 [-100    1    2 -100 -100 -100 -100 -100 -100 -100 -100 -100]], shape=(2, 12), dtype=int64)
Works as intended :)


## Final data preparation - creating tf.data.Dataset from the underlying Dataset

Here we cast the Dataset type to the one accepted by Keras. We also specify our data collator for dynamic padding and select batch_size.

In [29]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=TRAIN_BATCH_SIZE,
)

In [30]:
# Dictionaries for easy id2label and label2id conversion
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
id2label, label2id

({0: 'O',
  1: 'B-PER',
  2: 'I-PER',
  3: 'B-ORG',
  4: 'I-ORG',
  5: 'B-LOC',
  6: 'I-LOC',
  7: 'B-MISC',
  8: 'I-MISC'},
 {'O': 0,
  'B-PER': 1,
  'I-PER': 2,
  'B-ORG': 3,
  'I-ORG': 4,
  'B-LOC': 5,
  'I-LOC': 6,
  'B-MISC': 7,
  'I-MISC': 8})

## Model instantiation

In [31]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    START_MODEL_CHECKPOINT,
    id2label=id2label,
    label2id=label2id,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
model.config.num_labels
# Should be 9 - our model has to have the same number of labels as dataset for optimal results

9

## Creating optimizer for training, compiling model

In [33]:
from transformers import create_optimizer

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_steps = len(tf_train_dataset) * TRAIN_EPOCHS

optimizer, schedule = create_optimizer(
    init_lr=INITIAL_LEARNING_RATE,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=WEIGHT_DECAY_RATE,
)
model.compile(optimizer=optimizer, metrics=["accuracy"])

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce GTX 1660 Ti with Max-Q Design, compute capability 7.5


2023-07-30 01:50:31.602152: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


## Defining metrics calculation function

In [34]:
import numpy as np
import evaluate

metric = evaluate.load("seqeval")
def calculate_metrics(model, dataset):

    all_predictions = []
    all_labels = []
    for batch in dataset:
        logits = model.predict_on_batch(batch)["logits"]
        labels = batch["labels"]
        predictions = np.argmax(logits, axis=-1)
        for prediction, label in zip(predictions, labels):
            for predicted_idx, label_idx in zip(prediction, label):
                if label_idx == -100:
                    continue
                all_predictions.append(label_names[predicted_idx])
                all_labels.append(label_names[label_idx])
    return metric.compute(predictions=[all_predictions], references=[all_labels])

## Creating callback for saving the best model

In [35]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=BEST_MODEL_CHECKPOINT,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)


### Evaluating model performance with randomly initialized head - just for comparison as a starting point

In [22]:
initial_metrics = calculate_metrics(model, tf_eval_dataset)
initial_metrics

2023-07-30 01:26:19.182400: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype int64 and shape [12,48]
	 [[{{node Placeholder/_3}}]]
2023-07-30 01:26:23.215393: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype int64 and shape [12,52]
	 [[{{node Placeholder/_3}}]]
2023-07-30 01:26:25.523315: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype int64 and shape [12,49

{'LOC': {'precision': 0.0121667098110277,
  'recall': 0.05117038649972782,
  'f1': 0.019659102792010878,
  'number': 1837},
 'MISC': {'precision': 0.00683311432325887,
  'recall': 0.05639913232104121,
  'f1': 0.012189404594467886,
  'number': 922},
 'ORG': {'precision': 0.0006925207756232687,
  'recall': 0.0037285607755406414,
  'f1': 0.0011680878402055834,
  'number': 1341},
 'PER': {'precision': 0.0053711253478289,
  'recall': 0.04505971769815418,
  'f1': 0.009598149754264238,
  'number': 1842},
 'overall_precision': 0.006156436633428925,
 'overall_recall': 0.03938067990575564,
 'overall_f1': 0.010648221883461128,
 'overall_accuracy': 0.2086889974686525}

## Fitting the model

In [36]:
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=TRAIN_EPOCHS,
    callbacks=[model_checkpoint_callback]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe174067b50>

## Evaluating model performance after fine-tuning

In [37]:
metrics_after_fit = calculate_metrics(model, tf_eval_dataset)
metrics_after_fit

2023-07-30 02:34:06.311679: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype int64 and shape [10,48]
	 [[{{node Placeholder/_3}}]]
2023-07-30 02:34:07.863037: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype int64 and shape [10,37]
	 [[{{node Placeholder/_3}}]]
2023-07-30 02:34:10.285147: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype int64 and shape [10,52

{'LOC': {'precision': 0.9631635969664138,
  'recall': 0.9678824169842134,
  'f1': 0.9655172413793104,
  'number': 1837},
 'MISC': {'precision': 0.8700623700623701,
  'recall': 0.9078091106290672,
  'f1': 0.8885350318471338,
  'number': 922},
 'ORG': {'precision': 0.9053168244719592,
  'recall': 0.9269202087994034,
  'f1': 0.9159911569638909,
  'number': 1341},
 'PER': {'precision': 0.9610458911419424,
  'recall': 0.9777415852334419,
  'f1': 0.9693218514531755,
  'number': 1842},
 'overall_precision': 0.934599504541701,
 'overall_recall': 0.9523729384045776,
 'overall_f1': 0.9434025172959907,
 'overall_accuracy': 0.9865632542532525}

## Model test - random ten test dataset predictions

In [38]:
from transformers import pipeline
import random

model.load_weights(BEST_MODEL_CHECKPOINT)
token_classifier = pipeline(
    "token-classification", model=model, aggregation_strategy="simple", tokenizer=tokenizer
)

max_length = len(raw_datasets['test'])


for i in range(10):
    id = random.randint(0, max_length)
    drawn_data = raw_datasets["test"][id]
    # print(drawn_data)
    print ("ID: ", drawn_data["id"], "Input: ", drawn_data["tokens"])
    print ("Expected: ", [id2label[x] for x in drawn_data["ner_tags"]])
    print ("Output:", token_classifier(drawn_data['tokens']))
    print("_______________________________________________________________")


ID:  2972 Input:  ['Crystal', 'Palace', '21', '9', '8', '4', '46', '22', '35']
Expected:  ['B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Output: [[{'entity_group': 'ORG', 'score': 0.77750933, 'word': 'Crystal', 'start': 0, 'end': 7}], [{'entity_group': 'LOC', 'score': 0.5660404, 'word': 'Palace', 'start': 0, 'end': 6}], [], [], [], [], [], [], []]
_______________________________________________________________
ID:  1222 Input:  ['The', 'weather', 'in', 'the', 'capital', 'Port', 'Louis', 'was', 'heavily', 'cloudy', 'on', 'Friday', 'afternoon', 'with', 'occasional', 'showers', '.']
Expected:  ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Output: [[], [], [], [], [], [{'entity_group': 'LOC', 'score': 0.98062485, 'word': 'Port', 'start': 0, 'end': 4}], [{'entity_group': 'PER', 'score': 0.9869804, 'word': 'Louis', 'start': 0, 'end': 5}], [], [], [], [], [], [], [], [], [], []]
____________________________________________________________