<a href="https://colab.research.google.com/github/qmedina/Hack-Harvard2022/blob/main/F24_Project_2_Bert_QA_Stencil_Quetz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install datasets==3.1.0
#load_dataset sometimes hangs on a higher version
!pip install transformers torch evaluate tqdm



# Preprocessing

In [14]:
# from google.colab import drive
# drive.mount("/content/drive", force_remount=True)

In [15]:
from datasets import load_dataset

import torch
import numpy as np
import random

# we set up some seeds so that we can reproduce results
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [16]:
"""
Some options for BERT model that can be run in colab:

"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",

"""

'\nSome options for BERT model that can be run in colab:\n\n"distilbert-base-uncased",\n"distilbert-base-uncased-distilled-squad",\n"distilbert-base-cased",\n"distilbert-base-cased-distilled-squad",\n\n'

In [17]:
# Change train.json / dev.json to the appropriate filepaths =====
# FOLDER = "/content/drive/My Drive/Bert QA Data"
FOLDER = ""
data_files = {"train": f"{FOLDER}/all_train.json", "dev": f"{FOLDER}/all_dev.json"}
dataset = load_dataset('json', data_files=data_files)

In [18]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['name', 'id', 'questions', 'answers', 'has_correct_context', 'contexts'],
        num_rows: 27866
    })
    dev: Dataset({
        features: ['name', 'id', 'questions', 'answers', 'has_correct_context', 'contexts'],
        num_rows: 1743
    })
})


In [19]:
from transformers import DistilBertModel, DistilBertTokenizerFast, get_scheduler
from tqdm.auto import tqdm
from evaluate import load as load_metric
from collections import Counter
from sklearn.utils import shuffle

In [20]:
def load_model():
  '''Load the DistilBERT model and tokenizer.'''

  # First define a custom model to work with question answering
  class QAModel(torch.nn.Module):
    def __init__(self):
      super(QAModel, self).__init__() # initialize parent class
      self.model = DistilBertModel.from_pretrained("distilbert-base-uncased-distilled-squad")
      self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, 2) # linear layer to find the start and end logits for the span
      self.type_outputs = torch.nn.Linear(self.model.config.hidden_size, 2) # linear layer to find the type of answer (no answer or short answer)

    def forward(self, input_ids, attention_mask):
      outputs = self.model(input_ids, attention_mask) # pass input data through model
      outputs = outputs.last_hidden_state # take the last hidden state
      logits = self.qa_outputs(outputs)
      type_logits = self.type_outputs(outputs[:, 0, :]) # [CLS] token for type classification
      \
      # split the logits into start and end logits
      start_logits, end_logits = logits.split(1, dim=-1)
      start_logits = start_logits.squeeze(-1)
      end_logits = end_logits.squeeze(-1)

      return start_logits, end_logits, type_logits

  # initialize and return the custom model and the pretrained tokenizer
  model = QAModel()
  tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased-distilled-squad")
  return model, tokenizer

In [21]:
def load_data():
  '''Assign the train and validation splits from the dataset we loaded above.'''
  # access the train and validation portions of the dataset
  train = dataset["train"]
  validation = dataset["dev"]

  # shuffle the datasets since they are grouped by no answer and short answer
  # but keep the same seed to reproduce results
  train = train.shuffle(seed=seed)
  validation = validation.shuffle(seed=seed)

  return train, validation

In [22]:
class QADataset(torch.utils.data.Dataset):
    """ Custom dataset class for Question-Answering task. """
    def __init__(self, contexts, questions, answers, tokenizer : DistilBertTokenizerFast):
        """Initializes the QADataset with the provided contexts, questions, and answers."""
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_len = tokenizer.model_max_length

    def __len__(self) -> int:
        """
        Returns the number of examples in the dataset.

        Returns:
        -------
        int
            Number of examples in the dataset
        """
        return len(self.contexts)

    def __getitem__(self, index : int) -> dict[str, torch.Tensor]:
        """
        Retrieve a preprocessed data item from the dataset at the specified index.
        This is called when iterating through a QADataset

        Parameters:
        ----------
        index : int
            The index of the data item to retrieve.

        Returns:
        -------
        dict[str, Any]
            A dictionary containing the preprocessed data for the given index.
            The dictionary includes the following keys:
            - "input_ids": Encoded input IDs for the tweet.
            - "attention_mask": Attention mask for the tweet.
            - "start_positions": Token index of the start of the answer span.
            - "end_positions": Token index of the end of the answer span.
            - "answer_type": A label indicating whether there is an answer or not (1 for answer, 0 for no answer).
        """

        # get the question, context, and answer
        question = self.questions[index][0]["input_text"]
        context = self.contexts[index]
        answer = self.answers[index][0]

        # tokenize the question and context
        encoded = self.tokenizer(
          question,
          context,
          truncation=True,
          max_length=self.max_len,
          padding="max_length",
          return_tensors="pt",
          return_offsets_mapping=True,
        )

        # get the start and end positions for the answer
        start_char, end_char = answer["span_start"], answer["span_end"]

        # If no answer, start and end indices point to [CLS] token at i = 0
        if answer["input_text"] == "no_answer":
          start = 0
          end = 0
          answer_type = 0

        else:
          # convert character positions to token positions using offset mapping
          start, end = self.char_to_token(encoded["offset_mapping"][0].squeeze(), start_char, end_char)
          if start is None or end is None or start > end or (start == 0 and end == 0): # edge case where token positions are incorrect
            start = 0
            end = 0
            answer_type = 0
          else:
            answer_type = 1

        encoded["answer_type"] = torch.tensor(answer_type)

        # return preprocessed information
        return {
            "input_ids": encoded["input_ids"].squeeze(),
            "attention_mask": encoded["attention_mask"].squeeze(),
            "start_positions": torch.tensor(start),
            "end_positions": torch.tensor(end),
            "answer_type": torch.tensor(answer_type),
        }

    def char_to_token(self, offset_mapping, start_char, end_char):
        """ Helper function to convert character positions to token-level positions. """
        start_token, end_token = 0, 0
        # loop through offset mapping
        for i, (start, end) in enumerate(offset_mapping):
          # finds the tokens that include the start_char and end_char
          if start <= start_char and end >= start_char:
            start_token = i
          if start <= end_char and end >= end_char:
            end_token = i

        return start_token, end_token

In [23]:
def preprocess_and_tokenize(dataset, tokenizer, batch_size):
  """ Preprocesses and tokenizes the dataset, returning a DataLoader for batching. """

  # gets the contexts, questions, and answers from the dataset to use as the inputs for the QA dataset
  contexts = dataset["contexts"]
  questions = dataset["questions"]
  answers = dataset["answers"]

  qa_dataset = QADataset(contexts, questions, answers, tokenizer)

  # creates a dataloader to handle batching and shuffling of the dataset
  data_loader = torch.utils.data.DataLoader(qa_dataset, batch_size, shuffle=True)

  return data_loader


In [24]:
def calculate_loss(logits, targets):
  """ Helper function to calculate loss as described in the paper """
  # get softmax of the logits
  probs = torch.nn.functional.softmax(logits, dim=-1)

  # calculate negative log of the correct class probabilities
  correct_probs = probs[range(logits.size(0)), targets]
  loss = -torch.log(correct_probs)

  return loss.mean()

In [25]:
def train_loop(model, train_data_loader, validation_data_loader, device, optimizer, tokenizer, lr_scheduler, num_epochs):
  """ Trains the model for the specified number of epochs and validates it after each epoch """

  for epoch in range(num_epochs):
    model.train() # sets model to training mode

    print(f"Epoch {epoch + 1} training:")
    progress_bar = tqdm(range(len(train_data_loader)))

    total_loss = 0

    # loop over each batch in the train data loader
    for batch in train_data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      answer_type = batch['answer_type'].to(device)

      optimizer.zero_grad()

      # forward pass, get model outputs (start logits, end logits, and type logits)
      output = model(input_ids=input_ids, attention_mask=attention_mask)
      start_logits, end_logits, type_logits = output[0], output[1], output[2]

      # calculate loss using the helper function and add them up as per the paper
      start_loss = calculate_loss(start_logits, start_positions)
      end_loss = calculate_loss(end_logits, end_positions)
      type_loss = calculate_loss(type_logits, answer_type)

      loss = start_loss + end_loss + type_loss
      total_loss += loss.item()

      # backpropogate loss, update model weights, and update learning rate
      loss.backward()
      optimizer.step()
      lr_scheduler.step()

      progress_bar.update(1)


    avg_loss = total_loss / len(train_data_loader)
    print(f"Epoch {epoch + 1} average training loss: {avg_loss:.4f}")

    # evaluate the preformance on the validation dataset
    print("Running validation:")
    val_metrics = eval_loop(model, validation_data_loader, tokenizer, device)

    print(f"Epoch {epoch + 1} validation: F1={val_metrics['f1']:.4f}, Precision={val_metrics['precision']:.4f}, Recall={val_metrics['recall']:.4f}")

  return avg_loss, val_metrics["loss"]

In [26]:
def compute_metrics_helper(start_preds, start_positions, end_preds, end_positions, tokenizer, input_ids, type_preds, answer_type):
  """ Helper function for computing the final metrics by returning true positive, false positive, and false negative values """

  tp, fp, fn = 0, 0, 0

  # loop through each prediction
  for i in range(len(start_preds)):
    pred_start, pred_end = start_preds[i], end_preds[i]
    true_start, true_end = start_positions[i], end_positions[i]

    # ignore case where predicted span is invalid
    if pred_start > pred_end:
      continue

    # if both predicted and true point to no answer, add one to true positive and continue
    if pred_start <= 0 and pred_end <= 0 and true_start <= 0 and true_end <= 0:
      tp += 1
      continue

    # add to the values initialized at the beginning by getting a counter of the actual words in the span
    pred_tokens = tokenizer.convert_ids_to_tokens(input_ids[i][pred_start: pred_end + 1])
    true_tokens = tokenizer.convert_ids_to_tokens(input_ids[i][true_start: true_end + 1])

    pred_freqs = Counter(pred_tokens)
    true_freqs = Counter(true_tokens)

    tp += sum((pred_freqs & true_freqs).values())
    fp += sum((pred_freqs - true_freqs).values())
    fn += sum((true_freqs - pred_freqs).values())

  return tp, fp, fn

In [27]:
def eval_loop(model, validation_data_loader, tokenizer, device):
  """ Evaluates the model based on the validation dataset and computes precision, recall, f1 score, and average loss """
  metrics = [load_metric(x) for x in ["precision", "recall", "f1"]]
  model.eval() # sets model in evaluation mode

  progress_bar = tqdm(range(len(validation_data_loader)))

  total_val_loss = 0
  total_tp, total_fp, total_fn = 0, 0, 0

  for batch in validation_data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    answer_type = batch['answer_type'].to(device)

    # pass inputs through the model
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    start_logits, end_logits, type_logits = output[0], output[1], output[2]

    # get predictions
    start_preds = torch.argmax(start_logits, dim=1)
    end_preds = torch.argmax(end_logits, dim=1)
    type_preds = torch.argmax(type_logits, dim=1)

    # use helper function to ompute losses for each output and add them together
    start_loss = calculate_loss(start_logits, start_positions)
    end_loss = calculate_loss(end_logits, end_positions)
    type_loss = calculate_loss(type_logits, answer_type)

    val_loss = start_loss + end_loss + type_loss
    total_val_loss += val_loss.item()

    # use helper function to get true positives, false positives, and false negatives for the current batch
    tp, fp, fn = compute_metrics_helper(start_preds, start_positions, end_preds, end_positions, tokenizer, input_ids, type_preds, answer_type)
    total_tp += tp
    total_fp += fp
    total_fn += fn

    progress_bar.update(1)

  avg_val_loss = total_val_loss / len(validation_data_loader)
  print(f"Average validation loss: {avg_val_loss:.4f}")

  # use computed values to compute the actual metrics we care about
  precision = total_tp / (total_tp + total_fp)
  recall = total_tp / (total_tp + total_fn)
  f1 = 2 * (precision * recall) / (precision + recall)

  # return a dictionary with all the desired metrics
  computed = {"precision": precision, "recall": recall, "f1": f1, "loss" : avg_val_loss}

  return computed

In [28]:
def main():
  '''Here's the basic structure of the main block -- feel free to add or
  remove parameters/helper functions as you see fit, but all steps here are
  needed and we expect to see precision, recall, and f1 scores printed out'''
  device = "cuda" if torch.cuda.is_available() else "cpu"
  batch_size = 16
  num_epochs = 2

  model, tokenizer = load_model()
  train, validation = load_data()

  model.to(device) # load the model on to the device

  train_data_loader = preprocess_and_tokenize(train, tokenizer, batch_size)
  validation_data_loader = preprocess_and_tokenize(validation, tokenizer, batch_size)

  optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

  lr_scheduler = get_scheduler(
      "linear",
      optimizer = optimizer,
      num_warmup_steps = 50,
      num_training_steps = len(train_data_loader) * num_epochs,
  )

  train_losses, val_losses = train_loop(model, train_data_loader, validation_data_loader, device, optimizer, tokenizer, lr_scheduler, num_epochs)
  scores = eval_loop(model, validation_data_loader, tokenizer, device)

  print("PRECISION: ", scores["precision"])
  print("RECALL: ", scores["recall"])
  print("F1-SCORE: ", scores["f1"])


if __name__ == "__main__":
  main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Epoch 1 training:


  0%|          | 0/1742 [00:00<?, ?it/s]

Epoch 1 average training loss: 2.7625
Running validation:


  0%|          | 0/109 [00:00<?, ?it/s]

Average validation loss: 2.2869
Epoch 1 validation: F1=0.4446, Precision=0.3594, Recall=0.5828
Epoch 2 training:


  0%|          | 0/1742 [00:00<?, ?it/s]

Epoch 2 average training loss: 1.5968
Running validation:


  0%|          | 0/109 [00:00<?, ?it/s]

Average validation loss: 2.2049
Epoch 2 validation: F1=0.5014, Precision=0.4411, Recall=0.5807


  0%|          | 0/109 [00:00<?, ?it/s]

Average validation loss: 2.2059
PRECISION:  0.44108386604331645
RECALL:  0.5807059414646402
F1-SCORE:  0.5013556013447565
