# Named Entity Recognition (NER)

In [1]:
#@ DOWNLOAD REQUIRED LIBRARIES AND DEPENDENCIES
%%bash
pip install transformers[torch] -q
pip install datasets tokenizers -q
pip install seqeval -q

In [2]:
#@ INSTALLING THE REQUIRED LIBRARIES AND DEPENDENCIES
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import BertTokenizerFast

import numpy as np
import datasets

In [3]:
#@ LOADING THE HUGGING FACE DATASETS
conll_data = datasets.load_dataset("conll2003")
conll_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
conll_data.shape                 # checking shape of the data

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [5]:
# let's explore the train dataset
print(conll_data["train"][0])
print("\n The NER tags used in this datasets are: \n", conll_data["train"].features["ner_tags"])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

 The NER tags used in this datasets are: 
 Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)


In [6]:
print(conll_data["train"].description)                 # description of the dataset

The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on
four types of named entities: persons, locations, organizations and names of miscellaneous entities that do
not belong to the previous three groups.

The CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on
a separate line and there is an empty line after each sentence. The first item on each line is a word, the second
a part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags
and the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only
if two phrases of the same type immediately follow each other, the first word of the second phrase will have tag
B-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2
tagging scheme, whereas the original dataset uses IOB1.

For 

In [7]:
#@ Loading the BERT Tokenizer for tokenization
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

**Note:**

- Transformers are often pretrained with subword tokenizers, meaning that even if your inputs have been split into words already, each of those words could be split again by the tokenizer.

- This means that we need to do some processing on our labels as the input ids returned by the tokenizer are longer than the lists of labels our dataset contain.

- This is happening, first because some special tokens might be added (we can a `[CLS]` and a `[SEP]` above) and then because of those possible splits of words in multiple tokens:

In [8]:
#@ Inspecting output of variables
conll_data["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [9]:
example_text = conll_data["train"][0]
tokenized_input = tokenizer(example_text["tokens"], is_split_into_words = True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()

print(word_ids)
print(tokenized_input)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]
{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
tokens

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

- The input ids returned by the tokenizer are longer than the lists of labels our dataset contains

In [12]:
def tokenize_and_align_labels(examples, label_all_tokens = True):
    """
    Tokenizes a list of examples and aligns corresponding labels to tokenized inputs.

    Args:
        examples (dict): A dictionary containing the input tokens and corresponding labels.

        label_all_tokens (bool): If True, assigns labels to all tokens in the input.
        If False, assigns labels only to the first token of each word.

    Returns:
        dict: A dictionary containing the tokenized inputs with aligned labels.
        """

    tokenized_inputs = tokenizer(examples["tokens"], truncation = True, is_split_into_words = True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index = i)
        previous_word_idx = None

        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [13]:
conll_data['train'][4:5]

{'id': ['4'],
 'tokens': [['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
   'Werner',
   'Zwingmann',
   'said',
   'on',
   'Wednesday',
   'consumers',
   'should',
   'buy',
   'sheepmeat',
   'from',
   'countries',
   'other',
   'than',
   'Britain',
   'until',
   'the',
   'scientific',
   'advice',
   'was',
   'clearer',
   '.']],
 'pos_tags': [[22,
   27,
   21,
   35,
   12,
   22,
   22,
   27,
   16,
   21,
   22,
   22,
   38,
   15,
   22,
   24,
   20,
   37,
   21,
   15,
   24,
   16,
   15,
   22,
   15,
   12,
   16,
   21,
   38,
   17,
   7]],
 'chunk_tags': [[11,
   11,
   12,
   13,
   11,
   12,
   12,
   11,
   12,
   12,
   12,
   12,
   21,
   13,
   11,
   12,
   21,
   22,
   11,
   13,
   11,
   1,
   13,
   11,
   17,
   11,
   12,
   12,
   21,
   1,
   0]],
 'ner_tags': [[5,
   0,
   0,
   0,
   0,
   3,
   4,
   0,
   0,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,


In [16]:
#@ Let's apply the function to the whole dataset
tokenized_datasets = conll_data.map(tokenize_and_align_labels, batched = True)
tokenized_datasets['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

In [17]:
#@ DEFINING THE MODEL
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels = 9)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
#@ DEFINING THE TRAINING ARGUMENTS
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    "test-ner",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 3,
    weight_decay = 0.01,
)

In [19]:
#@ INSTANTIATING THE DATA COLLATOR AND EVALUATION METRICS
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = datasets.load_metric("seqeval")

  metric = datasets.load_metric("seqeval")


In [20]:
#@ Let's test the metric on example
example = conll_data['train'][0]
label_list = conll_data["train"].features["ner_tags"].feature.names

In [21]:
for i in example["ner_tags"]:
    print(i)

3
0
7
0
0
0
7
0
0


In [22]:
labels = [label_list[i] for i in example["ner_tags"]]

In [23]:
# computing the metrics
metric.compute(predictions = [labels], references = [labels])

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [24]:
import numpy as np
from typing import List, Tuple

def compute_metrics(eval_preds):
    """
    Computes evaluation metrics for named entity recognition (NER) tasks based on predicted logits and true labels.

    Args:
        eval_preds (tuple): A tuple containing two lists: predicted logits and true labels.
            - Predicted logits: A list of lists of float values representing the model's predictions.
            - True labels: A list of lists of integers representing the ground truth labels.

    Returns:
        dict: A dictionary containing computed metrics, including precision, recall, F1 score, and accuracy.
    """
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)

    # Filter out labels with the special value -100
    predictions = [
        [label_list[eval_pred] for eval_pred, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
        [label_list[l] for eval_pred, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    results = metric.compute(predictions=predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [25]:
#@ INSTANTIATING THE TRAINER
trainer = Trainer(
    model,
    args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train();

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2214,0.065681,0.91631,0.933326,0.92474,0.98216
2,0.0449,0.056375,0.931271,0.939814,0.935523,0.985162
3,0.0266,0.058532,0.93447,0.944401,0.939409,0.985702


In [26]:
#@ SAVING THE MODEL AND TOKENIZER
model.save_pretrained("/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/ner_model")
tokenizer.save_pretrained("/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/tokenizer")

('/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/tokenizer/vocab.txt',
 '/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/tokenizer/tokenizer.json')

In [27]:
id2label = {
    str(i): label for i, label in enumerate(label_list)
}

print(id2label)

label2id = {
    label: str(i) for i, label in enumerate(label_list)
}
print(label2id)

{'0': 'O', '1': 'B-PER', '2': 'I-PER', '3': 'B-ORG', '4': 'I-ORG', '5': 'B-LOC', '6': 'I-LOC', '7': 'B-MISC', '8': 'I-MISC'}
{'O': '0', 'B-PER': '1', 'I-PER': '2', 'B-ORG': '3', 'I-ORG': '4', 'B-LOC': '5', 'I-LOC': '6', 'B-MISC': '7', 'I-MISC': '8'}


In [28]:
#@ LOADING THE MODEL
import json
config = json.load(open("/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/ner_model/config.json", "w"))

In [29]:
#@ LOADING THE FINETUNED CLASSIFICATION MODEL
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/Projects/Fine Tuning BERT for NER/ner_model")

In [30]:
#@ Let's do some prediction
from transformers import pipeline
nlp = pipeline("ner",
               model = model_fine_tuned,
               tokenizer = tokenizer,
               )

example = "Sundar Pichai is CEO of Google"
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.99862564, 'index': 1, 'word': 'sun', 'start': 0, 'end': 3}, {'entity': 'B-PER', 'score': 0.99862754, 'index': 2, 'word': '##dar', 'start': 3, 'end': 6}, {'entity': 'I-PER', 'score': 0.99837035, 'index': 3, 'word': 'pic', 'start': 7, 'end': 10}, {'entity': 'I-PER', 'score': 0.99800485, 'index': 4, 'word': '##hai', 'start': 10, 'end': 13}, {'entity': 'B-ORG', 'score': 0.98913145, 'index': 8, 'word': 'google', 'start': 24, 'end': 30}]


**The END**