<a href="https://colab.research.google.com/github/sudheer-muthyala/Transformers_HuggingFace_Tensorflow/blob/main/transformers_name_entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install evaluate
!pip install seqeval

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from datasets import load_dataset
from transformers import RobertaTokenizerFast, DataCollatorForTokenClassification, TFRobertaForTokenClassification, create_optimizer
import evaluate

In [None]:
# Load the CoNLL-2003 dataset through the `datasets` library.
dataset = load_dataset("conll2003")

In [None]:
# Peek at one training example: word tokens plus integer POS, chunk and NER tags.
dataset["train"][20]

{'id': '20',
 'tokens': ['Rare',
  'Hendrix',
  'song',
  'draft',
  'sells',
  'for',
  'almost',
  '$',
  '17,000',
  '.'],
 'pos_tags': [22, 22, 21, 21, 42, 15, 30, 3, 11, 7],
 'chunk_tags': [11, 12, 12, 12, 21, 13, 11, 12, 12, 0],
 'ner_tags': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]}

In [None]:
# Checkpoint name used for both the tokenizer and the model.
model_id = "roberta-base"
# add_prefix_space=True: presumably required because inputs are fed as pre-split words
# (is_split_into_words=True in the tokenizer calls) — confirm against the tokenizer docs.
tokenizer = RobertaTokenizerFast.from_pretrained(model_id, add_prefix_space=True)

In [None]:
# Sanity check: tokenize one pre-split example and inspect the resulting encoding.
tk_check = tokenizer(dataset["train"][20]["tokens"], is_split_into_words=True)
tk_check

{'input_ids': [0, 28751, 16544, 15072, 2214, 2479, 7683, 13, 818, 68, 601, 6, 151, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# For every sub-token, the index of the word it came from; None marks the
# special tokens at both ends of the sequence.
tk_check.word_ids()

[None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 9, None]

In [None]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level NER labels to one label per sub-token.

  Args:
    labels: sequence of integer tag ids, one per word.
    word_ids: sequence with one entry per sub-token; each entry is the index
      of the source word, or None for a special token.

  Returns:
    A list with one label per entry of `word_ids`:
      * -100 for special tokens (word id None),
      * the word's own label for the first sub-token of a word,
      * for later sub-tokens of the same word, the word's label with odd ids
        bumped by one (in this notebook's tag scheme that turns B-XXX into
        I-XXX); even ids are kept unchanged.
  """
  new_labels = []
  previous_word = None
  for word in word_ids:
    if word is None:
      # Special token: give it the ignore label.
      new_labels.append(-100)
    elif word != previous_word:
      # First sub-token of a new word keeps the word's label.
      new_labels.append(labels[word])
    else:
      # Continuation of the same word: an odd id becomes the following even id.
      label = labels[word]
      if label % 2 == 1:
        label += 1
      new_labels.append(label)
    previous_word = word

  return new_labels

In [None]:
def tokenize_preprocess(dataset):
  """Tokenize one pre-split example and attach labels aligned to its sub-tokens.

  `dataset` is indexed with "tokens" and "ner_tags"; the returned encoding is the
  tokenizer output with an extra "labels" entry.
  """
  encoding = tokenizer(dataset["tokens"], is_split_into_words=True, truncation=True)
  aligned_labels = align_labels_with_tokens(dataset["ner_tags"], encoding.word_ids())
  encoding["labels"] = aligned_labels
  return encoding

# Raw columns are dropped once the tokenizer outputs and aligned labels exist.
unused_columns = ["id", "tokens", "pos_tags", "chunk_tags", "ner_tags"]
tokenized_dataset = dataset.map(
    tokenize_preprocess,
    remove_columns=unused_columns,
)

In [None]:
# Check which columns and how many rows each split has after preprocessing.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [None]:
BATCH_SIZE = 16
# Collator passed as collate_fn when batching; configured to return TensorFlow tensors.
collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")


def make_tf_dataset(split, shuffle):
  """Build a batched tf.data.Dataset from one split of `tokenized_dataset`.

  Args:
    split: split name, e.g. "train", "validation" or "test".
    shuffle: whether to shuffle the examples.

  Returns:
    The dataset produced by `to_tf_dataset`, with "input_ids" and
    "attention_mask" as inputs and "labels" as the label column.
  """
  return tokenized_dataset[split].to_tf_dataset(columns=["input_ids", "attention_mask"],
                                                label_cols=["labels"],
                                                shuffle=shuffle,
                                                collate_fn=collator,
                                                batch_size=BATCH_SIZE)


# Only the training split is shuffled. Keeping validation/test in a fixed order makes
# evaluation reproducible and makes `test_dataset.take(1)` return the same batch each time.
train_dataset = make_tf_dataset("train", shuffle=True)
validation_dataset = make_tf_dataset("validation", shuffle=False)
test_dataset = make_tf_dataset("test", shuffle=False)

In [None]:
EPOCHS = 2
# len() on the split gives the row count without materialising a whole column.
num_train_examples = len(tokenized_dataset["train"])
# Ceiling division: the last, smaller batch is kept (to_tf_dataset does not drop the
# remainder by default), so it counts as an optimizer step too.
batches_per_epoch = (num_train_examples + BATCH_SIZE - 1) // BATCH_SIZE
training_steps = batches_per_epoch * EPOCHS

# The learning-rate schedule is sized to the total number of optimizer steps; no warmup.
optimizer_, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=training_steps)

In [None]:
# Derive the tag set from the dataset instead of hard-coding the number of classes,
# and store the id <-> name mappings in the model config.
label_names = dataset["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = TFRobertaForTokenClassification.from_pretrained(model_id,
                                                        num_labels=len(label_names),
                                                        id2label=id2label,
                                                        label2id=label2id)
model.summary()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForTokenClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_roberta_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  124055040 
 r)                                                              
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  6921      
                                                                 
Total params: 124061961 (473.26 MB)
Trainable params: 124061961 (473.26 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# No explicit loss is passed; presumably the model falls back to its own built-in
# loss — confirm against the transformers Keras documentation.
model.compile(optimizer=optimizer_) #metrics=tf.keras.metrics.SparseCategoricalAccuracy())

In [None]:
# Fine-tune for EPOCHS epochs, with the validation split supplied as validation data.
history = model.fit(train_dataset,
                    validation_data=validation_dataset,
                    epochs=EPOCHS)

Epoch 1/2


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/2


In [None]:
# Class id -> tag name, used to decode both predictions and reference labels.
ind_to_label = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG',
                5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
all_predictions = []
all_labels = []

# Only the first batch of the test set is decoded here (take(1)).
for batch, batch_labels in test_dataset.take(1):
  pred = model.predict(batch)
  predictions = tf.argmax(pred.logits, axis=-1).numpy()
  labels = batch_labels.numpy()

  # Positions labelled -100 are ignored. Boolean masking walks the batch row by row,
  # so the kept tokens stay in sentence order.
  keep = labels != -100
  all_predictions.extend(ind_to_label[int(idx)] for idx in predictions[keep])
  all_labels.extend(ind_to_label[int(idx)] for idx in labels[keep])



In [None]:
# Eyeball the decoded predictions next to the reference tags.
print(all_predictions)
print(all_labels)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'B-MISC', 'I-MISC', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [None]:
# Load the seqeval metric through the `evaluate` library.
metric=evaluate.load("seqeval")

In [None]:
# All collected tokens are passed as one single sequence (hence the extra list wrapping).
metric.compute(predictions=[all_predictions], references=[all_labels])

{'LOC': {'precision': 0.8,
  'recall': 0.8,
  'f1': 0.8000000000000002,
  'number': 5},
 'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5},
 'ORG': {'precision': 0.9230769230769231,
  'recall': 1.0,
  'f1': 0.9600000000000001,
  'number': 12},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 4},
 'overall_precision': 0.9259259259259259,
 'overall_recall': 0.9615384615384616,
 'overall_f1': 0.9433962264150944,
 'overall_accuracy': 0.9919678714859438}

In [None]:
# Display the id -> tag mapping used to decode predictions.
ind_to_label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [None]:
# Take a single batch from the test set for a qualitative look at the predictions.
test = test_dataset.take(1)

In [None]:
# Print every sub-token of the batch next to its predicted tag, one sentence per line.
for batch, batch_labels in test:
  # Turn ids back into sub-token strings, dropping special tokens.
  tokens = [tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
            for ids in batch['input_ids'].numpy()]
  pred = model.predict(batch).logits
  prediction = tf.argmax(pred, axis=-1).numpy()
  for sentence_tokens, sentence_prediction in zip(tokens, prediction):
    # Predictions still include the leading special token that was removed from the
    # tokens, so they are shifted by one; zip stops at the last real token.
    for token, label_id in zip(sentence_tokens, sentence_prediction[1:]):
      print(f"{token.replace('Ġ', '')}-->{ind_to_label[label_id]}", end='  ')
    print()

WAR-->B-LOC  SA-->I-LOC  W-->I-LOC  1996-->O  --->O  12-->O  --->O  06-->O  
H-->B-ORG  apo-->I-ORG  el-->I-ORG  Tel-->I-ORG  Aviv-->I-ORG  1-->O  Bet-->B-ORG  ar-->I-ORG  Jerusalem-->I-ORG  4-->O  
Squad-->O  :-->O  
10-->O  .-->O  S-->B-PER  vet-->I-PER  l-->I-PER  ana-->I-PER  Glad-->I-PER  ish-->I-PER  iva-->I-PER  (-->O  Russia-->B-LOC  )-->O  137-->O  
BO-->B-PER  BS-->I-PER  LE-->I-PER  IGH-->I-PER  --->O  SH-->O  IM-->O  ER-->O  P-->O  IL-->O  OTS-->O  USA-->B-MISC  III-->I-MISC  TO-->O  SUR-->O  PR-->O  ISE-->O  WIN-->O  .-->O  
The-->O  UK-->B-ORG  Department-->I-ORG  of-->I-ORG  Transport-->I-ORG  on-->O  Friday-->O  said-->O  that-->O  the-->O  latest-->O  round-->O  of-->O  "-->O  open-->O  skies-->O  "-->O  talks-->O  with-->O  the-->O  U-->B-LOC  .-->I-LOC  S-->I-LOC  .-->I-LOC  had-->O  ended-->O  with-->O  no-->O  deal-->O  on-->O  liberal-->O  ising-->O  the-->O  trans-->O  atlantic-->O  flight-->O  market-->O  and-->O  no-->O  date-->O  set-->O  for-->O  when-->O  ta