<a href="https://colab.research.google.com/github/sjpark0605/NLP-FYP/blob/main/NER_Training_Loop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
%%capture
!pip install datasets evaluate transformers[sentencepiece] seqeval accelerate

In [7]:
# Imports for Data Processing
import pandas as pd
import torch
from datasets import load_from_disk

In [8]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive/')

dataset_dir = '/content/drive/MyDrive/COMP0029/datasets/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [9]:
device = torch.device('cpu')

if torch.cuda.is_available():
  device = torch.device('cuda')

In [10]:
corpus_datasets = load_from_disk(dataset_dir + 'english-recipe-ner')

In [11]:
ner_feature = corpus_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
pure_label_names = list(set(label.replace("-B", "").replace("-I", "") for label in label_names))

In [12]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
def align_labels_with_tokens(labels, word_ids):
  new_labels = []
  current_word = None
  for word_id in word_ids:
    if word_id != current_word:
      # Start of a new word!
      current_word = word_id
      label = -100 if word_id is None else labels[word_id]
      new_labels.append(label)
    elif word_id is None:
      # Special token
      new_labels.append(-100)
    else:
      # Same word as previous token
      label = labels[word_id]
      # If the label is XXX-B we change it to XXX-I
      if label % 2 == 0 and label != 20:
        new_labels.append(-100)
        # label += 1
        # new_labels.append(label)

  return new_labels

In [14]:
def tokenize_and_align_labels(examples):
  tokenized_inputs = tokenizer(
      examples["tokens"], truncation=True, is_split_into_words=True, max_length=75
  )
  all_labels = examples["ner_tags"]
  new_labels = []
  for i, labels in enumerate(all_labels):
    word_ids = tokenized_inputs.word_ids(i)
    new_labels.append(align_labels_with_tokens(labels, word_ids))

  tokenized_inputs["labels"] = new_labels
  return tokenized_inputs

In [15]:
tokenized_datasets = corpus_datasets.map(
  tokenize_and_align_labels,
  batched=True,
  remove_columns=corpus_datasets["train"].column_names,
)

Map:   0%|          | 0/2201 [00:00<?, ? examples/s]

Map:   0%|          | 0/551 [00:00<?, ? examples/s]

In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [17]:
import evaluate

metric = evaluate.load("seqeval")

In [18]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [19]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
  tokenized_datasets["train"],
  shuffle=True,
  collate_fn=data_collator,
  batch_size=32,
)
eval_dataloader = DataLoader(
  tokenized_datasets["valid"], 
  collate_fn=data_collator, 
  batch_size=32,
)

In [20]:
from transformers import AutoModelForTokenClassification

ner_model = AutoModelForTokenClassification.from_pretrained(
  model_checkpoint,
  id2label=id2label,
  label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [21]:
from torch.optim import AdamW

optimizer = AdamW(ner_model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.1)

In [22]:
from accelerate import Accelerator

accelerator = Accelerator()
ner_model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
  ner_model, optimizer, train_dataloader, eval_dataloader
)

In [23]:
print(len(train_dataloader))

69


In [24]:
from transformers import get_scheduler

num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=0,
  num_training_steps=num_training_steps,
)

In [25]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_predictions, true_labels

In [26]:
from tqdm.auto import tqdm
from torch import nn
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    
    # Training
    train_loss_val = 0

    ner_model.train()
    for batch in train_dataloader:
        labels = batch.get("labels")
        outputs = ner_model(**batch)

        logits = outputs.get("logits")
        loss = outputs.loss

        train_loss_val += loss.item()

        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    print(f"Training Loss: {train_loss_val / len(train_dataloader)}")

    # Evaluation
    eval_loss_val = 0

    ner_model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = ner_model(**batch)
            
        eval_loss_val += outputs.get("loss").item()

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)


    print(f"Validation Loss: {eval_loss_val / len(eval_dataloader)}")

    results = metric.compute(suffix=True)
    print(
        f"epoch {epoch}:",
        {
            f"overall_{key}": results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
        "\n"
    )

  0%|          | 0/690 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training Loss: 1.2436255346173826
Validation Loss: 0.6996073739396201
epoch 0: {'overall_precision': 0.7094953519256308, 'overall_recall': 0.7161528150134048, 'overall_f1': 0.7128085390260173, 'overall_accuracy': 0.8083067092651757} 



  _warn_prf(average, modifier, msg_start, len(result))


Training Loss: 0.6326342561970586
Validation Loss: 0.5499346339040332
epoch 1: {'overall_precision': 0.7500837520938023, 'overall_recall': 0.7503351206434317, 'overall_f1': 0.7502094153124478, 'overall_accuracy': 0.8289403620873269} 

Training Loss: 0.496767227632412
Validation Loss: 0.5364173038138284
epoch 2: {'overall_precision': 0.7631055076310551, 'overall_recall': 0.7707774798927614, 'overall_f1': 0.7669223074358119, 'overall_accuracy': 0.8423855165069223} 

Training Loss: 0.4121871845445771
Validation Loss: 0.5144125736422009
epoch 3: {'overall_precision': 0.7659993436166721, 'overall_recall': 0.782171581769437, 'overall_f1': 0.7740009948598906, 'overall_accuracy': 0.847444089456869} 

Training Loss: 0.3426238829674928
Validation Loss: 0.5206184784571329
epoch 4: {'overall_precision': 0.7893681043129388, 'overall_recall': 0.7912198391420912, 'overall_f1': 0.7902928870292886, 'overall_accuracy': 0.8571618743343983} 

Training Loss: 0.29551003635793494
Validation Loss: 0.513020538

In [27]:
# Evaluation Metric of Final Iteration
for label in pure_label_names:
    if label != "O" and label != "At":
      print(
          f"{label}:",
          {
              key: results[label][key]
              for key in ["precision", "recall", "f1"]
          },
      )

Q: {'precision': 0.6020408163265306, 'recall': 0.6413043478260869, 'f1': 0.6210526315789474}
Af: {'precision': 0.525, 'recall': 0.3387096774193548, 'f1': 0.4117647058823529}
F: {'precision': 0.7987486965589156, 'recall': 0.8254310344827587, 'f1': 0.8118706942236356}
St: {'precision': 0.7025641025641025, 'recall': 0.7061855670103093, 'f1': 0.7043701799485862}
Sf: {'precision': 0.5924170616113744, 'recall': 0.6377551020408163, 'f1': 0.6142506142506142}
Ac: {'precision': 0.8809523809523809, 'recall': 0.8683673469387755, 'f1': 0.8746145940390544}
Ac2: {'precision': 0.2765957446808511, 'recall': 0.4482758620689655, 'f1': 0.3421052631578947}
T: {'precision': 0.8563685636856369, 'recall': 0.8494623655913979, 'f1': 0.8529014844804319}
D: {'precision': 0.7214285714285714, 'recall': 0.7890625, 'f1': 0.753731343283582}
