<a href="https://colab.research.google.com/github/srvmishra/Language-Models/blob/main/CONLL_2003_NER_BERT_Base_Cased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install seqeval
!pip install evaluate
!pip install datasets



### Imports

In [2]:
import numpy as np
import pandas as pd
import markdown
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

from huggingface_hub import notebook_login

### Download and Tokenize Dataset

In [3]:
# Load the CoNLL-2003 dataset; the bare expression below displays its splits and columns.
token_dataset = load_dataset('conll2003')
token_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

For token classification tasks, the document is split into words and each word has its own label. The words still have to go through sub-word tokenization. Any dataset presented in this format can be used for token-level classification.

In [4]:
# Show the words of the first training sentence followed by its three tag-id sequences.
first_example = token_dataset['train'][0]
for column in ('tokens', 'pos_tags', 'ner_tags', 'chunk_tags'):
    print(first_example[column])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[22, 42, 16, 21, 35, 37, 16, 21, 7]
[3, 0, 7, 0, 0, 0, 7, 0, 0]
[11, 21, 11, 12, 21, 22, 11, 12, 0]


In [5]:
# Display the feature schema of the training split (column types and tag vocabularies).
token_dataset['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [6]:
# List the NER tag names; a tag's position in this list is its class id.
print(token_dataset['train'].features['ner_tags'].feature.names)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [7]:
# First training sentence and its three parallel word-level tag-id sequences.
sample = token_dataset['train'][0]
words = sample['tokens']
pos_labels = sample['pos_tags']
ner_labels = sample['ner_tags']
chunk_labels = sample['chunk_tags']

# Tag vocabularies: the position of a name in each list is its class id.
tag_features = token_dataset['train'].features
pos_names = tag_features['pos_tags'].feature.names
ner_names = tag_features['ner_tags'].feature.names
chunk_names = tag_features['chunk_tags'].feature.names

# Build four text rows (words, NER, POS, chunk) in which every column is padded
# to the width of its longest entry plus one separating space.
rows = [[], [], [], []]
for word, pos_label, ner_label, chunk_label in zip(words, pos_labels, ner_labels, chunk_labels):
    column = (word, ner_names[ner_label], pos_names[pos_label], chunk_names[chunk_label])
    width = max(map(len, column)) + 1
    for row, cell in zip(rows, column):
        row.append(cell.ljust(width))

line1, line2, line3, line4 = ("".join(row) for row in rows)
print(line1, line2, line3, line4, sep="\n")

EU    rejects German call to   boycott British lamb . 
B-ORG O       B-MISC O    O    O       B-MISC  O    O 
NNP   VBZ     JJ     NN   TO   VB      JJ      NN   . 
B-NP  B-VP    B-NP   I-NP B-VP I-VP    B-NP    I-NP O 


In [8]:
# Checkpoint name shared by the tokenizer here and the model created later.
model_ckpt = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# The cells below call word_ids() on the tokenizer output, so check that a fast tokenizer was loaded.
print(tokenizer.is_fast)

True


In [9]:
# The sentence is already a list of words, so pass is_split_into_words=True
# instead of handing the tokenizer a raw string.
model_inputs = tokenizer(token_dataset['train'][0]['tokens'], is_split_into_words=True)
model_inputs

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Since each word can be split into multiple sub word tokens, and there are special tokens, we must align each token with its label.

In [10]:
# Decoded text, including the special tokens the tokenizer added.
print(tokenizer.decode(model_inputs['input_ids']))
# Sub-word tokens: a single word may map to several of them.
print(model_inputs.tokens())
# For each token, the index of the word it came from (None for special tokens).
print(model_inputs.word_ids())

[CLS] EU rejects German call to boycott British lamb. [SEP]
['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]


In [11]:
def align_labels_with_tokens(labels_list, word_ids, typ='pos'):
  """Expand word-level label ids into one label id per sub-word token.

  Args:
    labels_list: label ids, one per word of the original sentence.
    word_ids: for every token, the index of the word it was produced from,
      or None for special tokens (the output of `word_ids()` on a tokenized
      example).
    typ: kind of tags in `labels_list`. With 'pos', every token of a word
      repeats the word's label. With any other value (e.g. 'ner', 'chunk'),
      a B- label on a continuation token is replaced by the matching I-
      label. This assumes B- tags have odd ids and the matching I- tag is
      the next id, as in the CoNLL-2003 `ner_tags` and `chunk_tags` lists.

  Returns:
    A list with one label id per entry of `word_ids`. Special tokens get
    -100 so that they can be ignored downstream.
  """
  new_labels = []
  current_id = None

  for word_id in word_ids:
    if word_id is None:
      # special token ([CLS], [SEP], ...): no word, hence no real label
      new_labels.append(-100)
    elif word_id != current_id:
      # first token of a new word keeps the word's label unchanged
      current_id = word_id
      new_labels.append(labels_list[word_id])
    else:
      # continuation token of the same word
      label = labels_list[word_id]
      # we only need to change B- to I- for NER and CHUNK labels, not for POS labels
      if typ != 'pos' and label % 2 == 1:
        label += 1
      new_labels.append(label)

  return new_labels

In [12]:
# Align each tag layer of the sample sentence with its sub-word tokens.
# Only the NER and chunk layers convert B- to I- on continuation tokens.
for heading, tag_ids, tag_kind in (('NER Ids ', ner_labels, 'ner'),
                                   ('CHUNK Ids ', chunk_labels, 'chunk'),
                                   ('POS Ids ', pos_labels, 'pos')):
  print(heading, align_labels_with_tokens(tag_ids, model_inputs.word_ids(), typ=tag_kind))

NER Ids  [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
CHUNK Ids  [-100, 11, 21, 11, 12, 21, 22, 11, 12, 12, 0, -100]
POS Ids  [-100, 22, 42, 16, 21, 35, 37, 16, 21, 21, 7, -100]


In [13]:
def tokenize_and_align_labels(examples, typ='ner'):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Args:
    examples: batch holding a 'tokens' column (one list of words per
      sentence) and a '<typ>_tags' column with the word-level label ids.
    typ: which tag column to use as labels: 'ner', 'pos' or 'chunk'.
      Defaults to 'ner', so existing calls that pass only `examples` are
      unaffected. Other values can be supplied through `fn_kwargs` when the
      function is used with `Dataset.map`.

  Returns:
    The tokenizer output for the batch with an extra 'labels' entry that
    holds, for each sentence, one label id per token (see
    `align_labels_with_tokens`).
  """
  tokenized_examples = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

  # labels are present only in the original batch, not in the tokenizer
  # output, so they are read from `examples`
  word_labels = examples[f'{typ}_tags']

  tokenized_examples['labels'] = [
      align_labels_with_tokens(label, tokenized_examples.word_ids(i), typ=typ)
      for i, label in enumerate(word_labels)
  ]
  return tokenized_examples

In [14]:
# Columns of the raw dataset; they are dropped once the model inputs and labels exist.
raw_columns = token_dataset['train'].column_names

# Tokenize every split in batches and add the aligned 'labels' column.
tokenized_dataset = token_dataset.map(tokenize_and_align_labels, batched=True)
print('New Columns: ', tokenized_dataset['train'].column_names)

# Keep only what the tokenizer and the alignment step produced.
tokenized_dataset = tokenized_dataset.remove_columns(raw_columns)
print('After Removing: ', tokenized_dataset['train'].column_names)

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

New Columns:  ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels']
After Removing:  ['input_ids', 'token_type_ids', 'attention_mask', 'labels']


For token classification, inputs and labels must be padded in the same way, so we use the `DataCollatorForTokenClassification` collator. By default it pads the labels with -100, the value that is ignored when the loss is computed.

In [15]:
# Collator used by the Trainer to build batches, padding inputs and labels together.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

### Define Metrics

In [16]:
# Load the seqeval metric through the evaluate library; compute_metrics below calls it.
metrics = evaluate.load('seqeval')
# Label names used to turn class ids back into tag strings.
# change for the task
names = ner_names

def compute_metrics(eval_preds):
  """Score token-level predictions with the seqeval metric.

  Args:
    eval_preds: pair `(logits, labels)` of numpy arrays. Positions whose
      label is -100 (special tokens and padding) are excluded from scoring.

  Returns:
    Dict with the overall 'precision', 'recall', 'f1' and 'accuracy'.
  """
  logits, labels = eval_preds
  # most likely class id per token
  preds = logits.argmax(axis=-1)

  # reference tags, skipping the -100 positions
  gold_tags = []
  for label_row in labels:
    gold_tags.append([names[tag_id] for tag_id in label_row if tag_id != -100])

  # predicted tags, kept only where the reference label is not -100
  pred_tags = []
  for pred_row, label_row in zip(preds, labels):
    pred_tags.append([names[pred_id]
                      for pred_id, tag_id in zip(pred_row, label_row)
                      if tag_id != -100])

  perfs = metrics.compute(predictions=pred_tags, references=gold_tags)
  return {key: perfs[f"overall_{key}"] for key in ("precision", "recall", "f1", "accuracy")}


### Create Model and label to id mappings

In [17]:
# Tag name -> class id, following the order of the dataset's NER tag list.
label2id = dict(zip(ner_names, range(len(ner_names))))
# Inverse lookup, class id -> tag name; both maps are passed to the model below.
id2label = {index: name for name, index in label2id.items()}

In [18]:
# Load the pretrained checkpoint with a token-classification head whose
# labels follow the id2label / label2id maps defined above.
model = AutoModelForTokenClassification.from_pretrained(model_ckpt,
                                                        id2label=id2label,
                                                        label2id=label2id)
# Sanity check: should equal the number of NER tag names.
print(model.config.num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


9


### Define Trainer and Fine-tune Model

In [19]:
# Log in to the Hugging Face Hub; required because training below uses push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
# Per-device batch size, used for both training and evaluation.
batch_size = 64
# Number of full batches in one pass over the training split; used as the
# logging interval so the training loss is reported roughly once per epoch.
num_steps = len(tokenized_dataset['train']) // batch_size
# Output directory for checkpoints.
model_name = 'srvmishra832/CoNLL2003_NER_BERT_Base_Cased'
training_arguments = TrainingArguments(
    output_dir=model_name,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    learning_rate=2e-5,
    logging_steps=num_steps,
    # evaluate and save a checkpoint at the end of every epoch
    evaluation_strategy='epoch',
    save_strategy='epoch',
    log_level='error',
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
)



In [21]:
# Fine-tune on the train split and evaluate on the validation split with the
# seqeval-based compute_metrics.
trainer = Trainer(model=model, args=training_arguments,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset=tokenized_dataset['validation'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  # `tokenizer=` is deprecated in Trainer.__init__ in recent
                  # transformers releases; `processing_class=` replaces it
                  processing_class=tokenizer)
trainer.train()
# Upload the final model to the Hub.
trainer.push_to_hub()

  trainer = Trainer(model=model, args=training_arguments,
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msrvmishra832[0m ([33msrvmishra832-indian-institute-of-science-bangalore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.298,0.07917,0.846263,0.899529,0.872084,0.976924
2,0.0647,0.06172,0.908838,0.936217,0.922324,0.983016
3,0.0394,0.057444,0.920742,0.944295,0.93237,0.984562
4,0.0286,0.055915,0.919495,0.94379,0.931484,0.985548
5,0.0222,0.057117,0.924951,0.945809,0.935264,0.985459


events.out.tfevents.1742886298.0874f1d8944b.8661.0:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/srvmishra832/CoNLL2003_NER_BERT_Base_Cased/commit/65ba45f4f784c5c7abab4fc0c666f53fb528caa9', commit_message='End of training', commit_description='', oid='65ba45f4f784c5c7abab4fc0c666f53fb528caa9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/srvmishra832/CoNLL2003_NER_BERT_Base_Cased', endpoint='https://huggingface.co', repo_type='model', repo_id='srvmishra832/CoNLL2003_NER_BERT_Base_Cased'), pr_revision=None, pr_num=None)

### Predict on the Test Set and Print the Metrics

In [23]:
# Run the fine-tuned model on the held-out test split; .metrics holds the test_* scores.
test_predictions = trainer.predict(tokenized_dataset['test'])
# One row per metric (orient='index'), printed as a Markdown table.
metrics_df = pd.DataFrame.from_dict(test_predictions.metrics, orient='index')
# NOTE(review): to_markdown() presumably needs the optional `tabulate` package (not the `markdown` module) — confirm it is installed.
print(metrics_df.to_markdown())

|                         |          0 |
|:------------------------|-----------:|
| test_loss               |   0.155152 |
| test_precision          |   0.870738 |
| test_recall             |   0.908817 |
| test_f1                 |   0.88937  |
| test_accuracy           |   0.97047  |
| test_runtime            |  12.6569   |
| test_samples_per_second | 272.815    |
| test_steps_per_second   |   4.266    |
