In [None]:
!pip install -qqq transformers datasets evaluate seqeval accelerate

# Model Training and Evaluation using PyTorch

In [None]:
# Load Data
from datasets import load_dataset

# Fetch the `wnut_17` dataset and show its splits / columns.
wnut = load_dataset(path="wnut_17")

print(wnut)

In [None]:
# `ner_tags` stores integer ids; the feature metadata of the train split holds
# the matching tag names, so position i of this list is the name of tag id i.
ner_tag_feature = wnut["train"].features["ner_tags"]
label_list = ner_tag_feature.feature.names

label_list

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Build both label mappings from the dataset's own tag names (`label_list`)
# instead of a hand-written copy, so the mappings and the number of output
# classes can never drift from the ids stored in `ner_tags`.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

# Token-classification head sized to the number of tags; the mappings are
# passed to `from_pretrained` so they are stored on the model config.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Check Model: print the module tree to inspect the loaded architecture
print(model)

In [None]:
# List every parameter with its trainable flag and shape, one block per
# parameter, separated by a ruler line.
separator = "=" * 30
for param_name, tensor in model.named_parameters():
    print(
        f"Parameter name: {param_name}",
        f"Requires gradients: {tensor.requires_grad}",
        f"Parameter shape: {tensor.shape}",
        separator,
        sep="\n",
    )

In [None]:
# # Optional: choose which layers to fine-tune and freeze the rest, by setting `requires_grad` to True for the former and False for the latter
# layers_to_fine_tune = ['classifier.weight','classifier.bias']

# # Freeze layers
# for name, param in model.named_parameters():
#     if not any(layer_name in name for layer_name in layers_to_fine_tune):
#         param.requires_grad = False

# # Unfreeze Fine-tune layers
# for name, param in model.named_parameters():
#     if any(layer_name in name for layer_name in layers_to_fine_tune):
#         param.requires_grad = True

In [None]:
# Define preprocess function
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and expand word-level tags to token level.

    `examples` is a batch with a "tokens" entry (one list of words per example)
    and a "ner_tags" entry (one tag id per word). The words are passed to the
    module-level `tokenizer`; for each example a label list is built with one
    entry per produced token:

    * tokens whose word id is None get -100,
    * the first token of each word gets that word's tag,
    * every further token of the same word gets -100.

    The tokenizer output is returned with the label lists stored under "labels".
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        last_word = None
        row_labels = []
        for word_id in encoded.word_ids(batch_index=row):
            # Only the first token of a word carries the word's tag.
            starts_new_word = word_id is not None and word_id != last_word
            row_labels.append(word_tags[word_id] if starts_new_word else -100)
            last_word = word_id
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [None]:
# Run the tokenize-and-align step over the whole dataset, a batch of examples at a time
tokenized_wnut = wnut.map(
    tokenize_and_align_labels,
    batched=True,
)

In [None]:
print(tokenized_wnut)

# Index the row first: `split[0]["col"]` reads a single example, whereas
# `split["col"][0]` materialises the entire column just to read one item.
first_train_example = tokenized_wnut["train"][0]
print(len(first_train_example["input_ids"]))
print(len(first_train_example["labels"]))

In [None]:
# Create a batch of examples, with dynamic padding. Use the appropriate collator function
from transformers import DataCollatorForTokenClassification

# Collator that turns a list of tokenized examples into one batch for the DataLoader
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Drop the raw columns that are not passed to the model, then have the
# remaining columns returned as torch tensors.
tokenized_wnut_trch = tokenized_wnut.remove_columns(
    ["id", "tokens", "ner_tags"]
)
tokenized_wnut_trch.set_format(type="torch")

tokenized_wnut_trch

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    tokenized_wnut_trch["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

# Evaluation gains nothing from shuffling; a fixed order keeps the per-batch
# validation metrics reproducible between runs.
val_dataloader = DataLoader(
    tokenized_wnut_trch["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [None]:
import pprint
# Pull a single batch to check the number of batches and the tensor shapes
batch = next(iter(train_dataloader))

print(len(train_dataloader))
print(
    f"input_ids batch shape: {batch.input_ids.shape}",
    f"attention_mask batch shape: {batch.attention_mask.shape}",
    f"labels batch shape: {batch.labels.shape}",
    sep="\n",
)

In [None]:
import evaluate

# Metric object used by `compute_metrics` (its `compute` method is called there)
seqeval = evaluate.load("seqeval")

In [None]:
import numpy as np
import torch
from torch.optim import AdamW
from transformers import get_scheduler

# Define Optimiser
# `transformers.AdamW` was deprecated in favour of the PyTorch implementation
# and is not available in recent transformers releases, so with the unpinned
# install above it must come from `torch.optim`.
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

# Define Metric Function
def compute_metrics(predictions, labels):
    """Score token-level predictions with the module-level `seqeval` metric.

    Args:
        predictions: predicted label ids, one sequence per example (a 2-D
            tensor/array or a list of lists).
        labels: reference label ids laid out like `predictions`; positions
            whose label is -100 are ignored.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        `overall_*` entries of the seqeval result.
    """
    # Convert tensors/arrays to plain Python lists once, instead of pulling
    # every element out of a (possibly GPU-resident) tensor one at a time.
    if hasattr(predictions, "tolist"):
        predictions = predictions.tolist()
    if hasattr(labels, "tolist"):
        labels = labels.tolist()

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Keep only the positions that carry a real label, then map ids to names.
        kept = [(p, l) for (p, l) in zip(prediction, label) if l != -100]
        true_predictions.append([label_list[p] for (p, _) in kept])
        true_labels.append([label_list[l] for (_, l) in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Best-so-far bookkeeping, updated by the training loop
best_loss = float("inf")
best_checkpoint_path = None

# Per-step statistics
train_loss, train_metrics, test_metrics = [], [], []

## Place training on a GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# Linear learning-rate schedule without warm-up, spanning every optimizer step
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Set up a list to store checkpoints
checkpoint_paths = []

model.train()

# Training loop
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        # Set Gradients to 0
        optimizer.zero_grad()

        # Perform a forward model pass
        ## Put the batch onto the training device
        batch = {k: v.to(device) for (k, v) in batch.items()}

        ## Forward Pass
        outputs = model(**batch)

        # Compute Loss
        loss = outputs.loss
        # Plain Python float for logging/checkpointing, so nothing stored
        # below keeps a device tensor (and its autograd metadata) alive.
        loss_value = loss.item()

        # Compute Metric
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels = batch["labels"]

        metrics = compute_metrics(predictions=predictions, labels=labels)

        # Store Metrics
        train_loss.append(loss_value)
        train_metrics.append(metrics)

        # Backward pass to update parameters
        loss.backward()

        # Optimizer step
        optimizer.step()

        # Update Learning Rate
        lr_scheduler.step()

        # Print Progress
        print(f"epoch {epoch} batch_number {i} loss {loss_value} metrics {metrics}")

    # Save a checkpoint at the end of every epoch.
    # NOTE: 'loss' and 'metrics' describe the LAST batch of the epoch only.
    # They are stored as built-in floats so the file contains nothing but
    # tensors and basic Python types and can be read back by `torch.load`
    # with its restrictive `weights_only` setting.
    # The dict is deliberately not called `checkpoint`, which already names
    # the pretrained model identifier.
    checkpoint_state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss_value,
        'metrics': {key: float(value) for key, value in metrics.items()},
        # Add other relevant information if needed
    }
    checkpoint_path = f'checkpoint_epoch_{epoch}_batch_{i}.bin'
    torch.save(checkpoint_state, checkpoint_path)
    checkpoint_paths.append(checkpoint_path)

    # Update best_loss and best_checkpoint_path if needed
    if loss_value < best_loss:
        best_loss = loss_value
        best_checkpoint_path = checkpoint_path




In [None]:
# Load the best model checkpoint
if best_checkpoint_path is None:
    # Happens when the training cell has not run, or when no epoch produced a
    # loss below the initial `inf` (e.g. the loss was NaN).
    raise RuntimeError("No checkpoint was selected; run the training loop first.")

# `map_location` puts the tensors on the device the model currently lives on.
# `weights_only=False` is needed because the checkpoint also stores non-tensor
# objects (loss, metrics). That setting unpickles arbitrary objects, which is
# acceptable only because the file was written by this notebook; never use it
# on a checkpoint from an untrusted source.
best_checkpoint = torch.load(
    best_checkpoint_path, map_location=device, weights_only=False
)

model.load_state_dict(best_checkpoint['model_state_dict'])
optimizer.load_state_dict(best_checkpoint['optimizer_state_dict'])

best_epoch = best_checkpoint['epoch']
best_loss = best_checkpoint['loss']
best_metrics = best_checkpoint['metrics']


In [None]:
# Epoch, loss and metrics recorded in the selected checkpoint, one per line
print(best_epoch, best_loss, best_metrics, sep="\n")

In [None]:
model.eval()

eval_metrics = []
# One pass over the validation set is enough: the weights do not change
# during evaluation, so repeating the pass once per "epoch" only multiplied
# the runtime and the number of rows collected in `eval_metrics`.
for i, batch in enumerate(val_dataloader):
    # Perform a forward model pass
    ## Put the batch onto the same device as the model
    batch = {k: v.to(device) for (k, v) in batch.items()}

    ## Forward Pass - no_grad because parameters are not updated in validation
    with torch.no_grad():
        outputs = model(**batch)

    # Compute Metric
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    labels = batch["labels"]

    metrics = compute_metrics(predictions=predictions, labels=labels)

    # Store Metrics
    eval_metrics.append(metrics)

    # Print Progress
    print(f"batch_number {i} metrics {metrics}")


In [None]:
import pandas as pd

# One row per evaluated batch, one column per metric
eval_df = pd.DataFrame(data=eval_metrics)

eval_df

# Saving and Loading Tokenizer and Model

In [None]:
# Suggested from Docs: https://huggingface.co/transformers/v1.2.0/serialization.html
# Save Tokenizer and Model
import os

output_dir = "./token_classification_wnut/"
os.makedirs(output_dir, exist_ok=True)

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned

# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
WEIGHTS_NAME = "pytorch_model.bin"
CONFIG_NAME = "config.json"
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# Save through `model_to_save`, not `model`: a wrapper's state_dict keys carry
# a `module.` prefix and the wrapper itself does not expose `.config`.
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
#tokenizer.save_vocabulary(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Load Model back from the directory written by the save step
pretrained_loaded_model = AutoModelForTokenClassification.from_pretrained(
    "token_classification_wnut"
)

In [None]:
# Using raw Pytorch state dict + model architecture method
# Write the file inside the export directory: that is the path the reload
# step reads ('./token_classification_wnut/token_classification_wnut_model.bin').
# Saving to the working directory made that load fail with FileNotFoundError.
os.makedirs('./token_classification_wnut', exist_ok=True)
torch.save(model.state_dict(), './token_classification_wnut/token_classification_wnut_model.bin')

In [None]:
# Rebuild the architecture from the base checkpoint, then overwrite its
# weights with the saved state dict and switch to evaluation mode.
checkpoint = "distilbert-base-uncased"
loaded_model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=13,
    id2label=id2label,
    label2id=label2id,
)

saved_weights = torch.load('./token_classification_wnut/token_classification_wnut_model.bin')
loaded_model.load_state_dict(saved_weights)
loaded_model.eval()

# Inference

In [None]:
# Inference
text = "Chuck Norris starred in the TV Show - Walker Texas Ranger"

## Tokenize the sentence into PyTorch tensors
inputs = tokenizer(text, return_tensors="pt")

## Run the reloaded model without gradient tracking and keep its logits
with torch.no_grad():
    inference_outputs = pretrained_loaded_model(**inputs)
logits = inference_outputs.logits

In [None]:
# Highest-scoring class id per token, then translate the ids of the first
# (only) sequence into tag names via the model config.
predictions = logits.argmax(dim=2)
label_lookup = pretrained_loaded_model.config.id2label
predicted_token_class = [label_lookup[class_id] for class_id in predictions[0].tolist()]

print(predictions)
print(predicted_token_class)

In [None]:
# Pair every token with its predicted tag, in sentence order. A list of pairs
# is used instead of a dict because a dict keyed by token text silently drops
# all but the last occurrence of a repeated token.
list(zip(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]), predicted_token_class))