<a href="https://colab.research.google.com/github/schwarzmarcel/MasterThesis_MSchwarz/blob/main/DAPT_BERT_sentence_pairs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import necessary libraries


In [None]:
!pip install transformers

In [None]:
from transformers import BertModel, BertTokenizerFast, BertForMaskedLM, BertForNextSentencePrediction
from transformers.models.bert.modeling_bert import BertOnlyMLMHead
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
import torch 
import copy
import random
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch import nn, optim
from torch.utils import data 
from torch.nn import CrossEntropyLoss


# Seed every random number generator the notebook draws from so runs are reproducible.
RANDOM_SEED = 42
# random_word picks the masked positions with Python's `random`, so it needs seeding too.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Check for GPU

Tesla P100 is recommended since this procedure takes a few hours

In [None]:
!nvidia-smi

In [None]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
device

# Prepare Data

 

*   Open the dataset containing sentence pairs (50% of the time sentence B is the sentence that follows sentence A; the other 50% of the time sentence B is a random sentence from the corpus)











In [None]:
# Load the sentence-pair corpus; create_data_loader reads its sentence_a,
# sentence_b and is_next columns.
df = pd.read_json("sentence_pairs.json")
# Preview the first rows as the cell output.
df.head()

In [None]:
# Split the pairs into a training part and a held-out part with a fixed seed.
# NOTE(review): test_size=0.7 holds out 70% of the pairs and trains on the
# remaining 30% — confirm this ratio is intended.
df_train, df_test = train_test_split(df, test_size=0.7, random_state=RANDOM_SEED)

# Select model

In [None]:
# Checkpoint name shared by the tokenizer and by the BertModel loaded in DAPTPreTrainer.
PRE_TRAINED_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Length every encoded pair is padded to by tokenize_pairs (special tokens included).
MAX_LEN = 128
# Number of sentence pairs per batch produced by the data loaders.
BATCH_SIZE = 32

# Tokenizer and DataLoaders

The tokenizer creates the input for the pre-trained LMs and does the padding/truncation. This is simpler for single sentence input. The necessary outputs are:

*   Standard inputs: input_ids, attention_mask, segment_ids(here called token_type_ids)
*   label for NSP: is_next
*   labels for MLM: mask_labels

The tokenization function truncates sentence pairs that are too long. If one sentence is much longer than the other one (difference >25 tokens) then only the longer sentence is truncated. Else both sentences are truncated equally.

Then, tokens are randomly masked for each sentence with the BERT masking strategy (random_word function). 

The dataloaders create batches from the dataset which can be used by the LM.



In [None]:
def random_word(tokens, tokenizer):
    """Apply BERT-style random masking to a list of token ids.

    Each position is selected with probability 0.15. A selected position keeps
    its original id in the returned labels and is then replaced by the mask
    token (80%), replaced by a random vocabulary id (10%), or left unchanged
    (10%).

    Args:
        tokens: list of token ids without special tokens; modified in place.
        tokenizer: object exposing ``mask_token_id`` and ``vocab_size``.

    Returns:
        ``(tokens, output_label)``: ``tokens`` is the same list object after
        masking; ``output_label`` holds the original id at every selected
        position and -100 everywhere else.
    """
    output_label = [-100] * len(tokens)

    # Every position is a candidate: the caller passes ids without a trailing
    # special token, so the last id must not be skipped.
    for i, original_id in enumerate(tokens):
        prob = random.random()
        if prob >= 0.15:
            continue

        # Rescale to [0, 1) to choose among the three replacement strategies.
        prob /= 0.15
        output_label[i] = original_id

        if prob < 0.8:
            tokens[i] = tokenizer.mask_token_id
        elif prob < 0.9:
            # Draw an id directly instead of materialising the whole vocabulary
            # as a list for every replacement.
            tokens[i] = random.randrange(tokenizer.vocab_size)

    return tokens, output_label

In [None]:
def tokenize_pairs(sentence_a, sentence_b, tokenizer, max_len):
  """Encode, truncate, mask and pad one sentence pair.

  Both sentences are encoded without special tokens (each capped at 96 ids).
  If the pair does not fit into ``max_len - 3`` ids (room for one [CLS] and
  two [SEP]), the surplus is removed from the end: from the longer sentence
  only when the lengths differ by more than 25, otherwise split between both.
  Each sentence is then masked with ``random_word`` and the pair is assembled
  and padded to ``max_len``.

  Returns:
    ``(input_ids, token_type_ids, attention_mask, mask_labels)`` as int64
    tensors of length ``max_len``; padded label positions are -100.
  """
  ids_a = tokenizer.encode(sentence_a, add_special_tokens=False, max_length=96, truncation=True)
  ids_b = tokenizer.encode(sentence_b, add_special_tokens=False, max_length=96, truncation=True)

  # Number of ids that must go so that the pair plus 3 special tokens fits.
  overflow = len(ids_a) + len(ids_b) - (max_len - 3)
  if overflow > 0:
    if abs(len(ids_a) - len(ids_b)) > 25:
      # Very unequal lengths: only the longer sentence pays.
      if len(ids_a) > len(ids_b):
        ids_a = ids_a[:-overflow]
      else:
        ids_b = ids_b[:-overflow]
    else:
      # Comparable lengths: share the cut between both sentences.
      cut_a = round(overflow / 2)
      cut_b = overflow - cut_a
      if cut_a != 0:
        ids_a = ids_a[:-cut_a]
      ids_b = ids_b[:-cut_b]

  # Sentence A is masked before sentence B.
  masked_a, label_a = random_word(ids_a, tokenizer)
  masked_b, label_b = random_word(ids_b, tokenizer)

  sequence = [tokenizer.cls_token_id] + masked_a + [tokenizer.sep_token_id] + masked_b + [tokenizer.sep_token_id]
  label_row = [-100] + label_a + [-100] + label_b + [-100]
  pad = max_len - len(sequence)

  attention_mask = torch.cat((
      torch.ones(len(sequence), dtype=torch.int64),
      torch.zeros(pad, dtype=torch.int64),
  ))
  # Segment 0 covers [CLS] + A + [SEP], segment 1 covers B + [SEP]; padding is 0.
  token_type_ids = torch.cat((
      torch.zeros(len(masked_a) + 2, dtype=torch.int64),
      torch.ones(len(masked_b) + 1, dtype=torch.int64),
      torch.zeros(pad, dtype=torch.int64),
  ))
  input_ids = torch.cat((
      torch.tensor(sequence, dtype=torch.int64),
      torch.zeros(pad, dtype=torch.int64),
  ))
  mask_labels = torch.cat((
      torch.tensor(label_row),
      torch.full((max_len - len(label_row),), -100, dtype=torch.int64),
  ))

  return input_ids, token_type_ids, attention_mask, mask_labels

In [None]:
class DAPTDataset(data.Dataset):
  """Map-style dataset that encodes one sentence pair per index on the fly.

  Holds parallel sequences of first sentences, second sentences and
  next-sentence labels, plus the tokenizer and padded length handed to
  ``tokenize_pairs`` on every access.
  """

  def __init__(self, sentences_a, sentences_b, is_next_labels, tokenizer, max_len):
    self.sentences_a, self.sentences_b = sentences_a, sentences_b
    self.is_next_labels = is_next_labels
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of pairs, taken from the first-sentence sequence."""
    return len(self.sentences_a)

  def __getitem__(self, item):
    """Return the encoded pair at ``item`` together with its NSP label."""
    first = self.sentences_a[item]
    second = self.sentences_b[item]
    nsp_label = self.is_next_labels[item]

    # Masking is random, so the same index is encoded afresh on every access.
    encoded = tokenize_pairs(first, second, tokenizer=self.tokenizer, max_len=self.max_len)
    ids, segments, attention, mlm_labels = encoded

    return {
      "input_ids": ids,
      "token_type_ids": segments,
      "attention_mask": attention,
      "is_next": nsp_label,
      "mask_labels": mlm_labels
    }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  """Wrap a sentence-pair frame in a ``DAPTDataset`` and return a DataLoader.

  ``df`` must expose ``sentence_a``, ``sentence_b`` and ``is_next`` columns;
  each is converted to a NumPy array before being handed to the dataset.
  The loader uses ``batch_size`` and otherwise default DataLoader settings.
  """
  first_sentences = df.sentence_a.to_numpy()
  second_sentences = df.sentence_b.to_numpy()
  nsp_labels = df.is_next.to_numpy()

  dataset = DAPTDataset(first_sentences, second_sentences, nsp_labels, tokenizer, max_len)
  return data.DataLoader(dataset, batch_size=batch_size)

In [None]:
# Build one loader per split; both use the same tokenizer, padded length and batch size.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# Define model
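The model computes the MLM loss and the NSP loss and returns them separately (together with per-prediction correctness lists); the training loop adds the two losses before backpropagation.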

In [None]:
class DAPTPreTrainer(nn.Module):
  """BERT encoder with an MLM head and an NSP head for domain-adaptive pre-training.

  ``forward`` returns the two losses separately, plus per-prediction
  correctness lists that the training and evaluation loops turn into accuracies.
  """

  def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    # Two-way classifier on the pooled output for next-sentence prediction.
    self.linear2 = nn.Linear(self.bert.config.hidden_size, 2)
    # NOTE(review): this head is built from the config only, so it appears to
    # start from fresh weights rather than the pretrained MLM head — confirm intended.
    self.cls = BertOnlyMLMHead(self.bert.config)

  def forward(self, input_ids, token_type_ids, attention_mask, mask_labels, is_next):
    """Compute MLM and NSP losses for one batch.

    Args:
      input_ids, token_type_ids, attention_mask: encoder inputs.
      mask_labels: original ids at masked positions, -100 elsewhere.
      is_next: next-sentence label per example.

    Returns:
      ``(mlm_loss, mlm_results, nsp_loss, nsp_results)``: the losses are
      scalar tensors and the results are lists of booleans, one per masked
      position / per example, that are True where the prediction is correct.
    """
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
    )

    sequence_output = outputs[0]
    pooler_output = outputs[1]

    nsp_scores = self.linear2(pooler_output)

    # Flatten batch and sequence dimensions, then keep only the masked
    # positions so the vocabulary projection runs on those alone.
    hidden_size = sequence_output.size(-1)
    flat_labels = mask_labels.reshape(-1)
    labels_mask = flat_labels != -100
    selected_mask_labels = flat_labels[labels_mask]
    selected_sequence_output = sequence_output.view(-1, hidden_size)[labels_mask]
    mlm_scores = self.cls(selected_sequence_output)

    loss_fn = CrossEntropyLoss()

    if selected_mask_labels.numel() == 0:
      # No position was masked in this batch. A mean-reduced cross-entropy
      # over zero targets is NaN and would poison the weights on the next
      # optimizer step, so contribute a zero that is still part of the graph.
      mlm_loss = mlm_scores.sum()
    else:
      mlm_loss = loss_fn(mlm_scores, selected_mask_labels)
    mlm_preds = torch.argmax(mlm_scores, dim=1)
    mlm_results = mlm_preds == selected_mask_labels

    nsp_loss = loss_fn(nsp_scores, is_next)
    nsp_preds = torch.argmax(nsp_scores, dim=1)
    nsp_results = nsp_preds == is_next

    return mlm_loss, mlm_results.tolist(), nsp_loss, nsp_results.tolist()

In [None]:
# Instantiate the pre-training model and move its parameters to the selected device.
model = DAPTPreTrainer().to(device)

# Training and validation

Here the actual DAPT happens. First, the hyperparameters and some metrics are defined. Then the training and evaluation functions are defined and called. During the training process, the loss and accuracy for MLM and NSP are returned. On a Tesla P100 this should take ~5 hours.

In [None]:
EPOCHS = 5

optimizer = AdamW(model.parameters(), lr=1e-4, correct_bias=False)
# One scheduler step is taken per batch, so this is the total number of updates.
total_steps = len(train_data_loader) * EPOCHS

# Linear warmup over the first 10% of updates, then linear decay to zero at the
# very last update. num_training_steps must be the full step count: a smaller
# value drives the learning rate to 0 early and wastes the remaining updates.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=int(0.1 * total_steps),
  num_training_steps=total_steps
)
# The model computes its own losses in forward(); this module-level criterion
# is kept so the name stays defined for any later cell that expects it.
loss_fn = CrossEntropyLoss()

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
  """Train ``model`` for one pass over ``data_loader``.

  For every batch the MLM and NSP losses returned by the model are summed,
  back-propagated with gradient-norm clipping at 1.0, and followed by one
  optimizer step and one scheduler step.

  Args:
    model: module whose forward returns
      ``(mlm_loss, mlm_results, nsp_loss, nsp_results)``.
    data_loader: iterable of batches with the keys ``input_ids``,
      ``mask_labels``, ``token_type_ids``, ``attention_mask`` and ``is_next``.
    optimizer: optimizer over the model parameters.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.

  Returns:
    ``(mean_mlm_loss, mlm_accuracy, mean_nsp_loss, nsp_accuracy)``. Any value
    with nothing to average over (e.g. an empty loader) is NaN.
  """
  model = model.train()

  mlm_losses = []
  nsp_losses = []
  # Running counts instead of ever-growing prediction lists: re-concatenating
  # the full list on every batch is quadratic in the number of predictions.
  mlm_correct = mlm_total = 0
  nsp_correct = nsp_total = 0

  for batch in data_loader:
    input_ids = batch["input_ids"].to(device)
    mask_labels = batch["mask_labels"].to(device)
    token_type_ids = batch["token_type_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    is_next = batch["is_next"].to(device)

    mlm_loss, mlm_results, nsp_loss, nsp_results = model(input_ids, token_type_ids, attention_mask, mask_labels, is_next)
    mlm_losses.append(mlm_loss.item())
    nsp_losses.append(nsp_loss.item())
    mlm_correct += sum(mlm_results)
    mlm_total += len(mlm_results)
    nsp_correct += sum(nsp_results)
    nsp_total += len(nsp_results)
    loss = mlm_loss + nsp_loss

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  nan = float("nan")
  return (
      np.mean(mlm_losses) if mlm_losses else nan,
      mlm_correct / mlm_total if mlm_total else nan,
      np.mean(nsp_losses) if nsp_losses else nan,
      nsp_correct / nsp_total if nsp_total else nan,
  )

In [None]:
def eval_model(model, data_loader, device):
  """Evaluate ``model`` on ``data_loader`` without gradient tracking.

  Args:
    model: module whose forward returns
      ``(mlm_loss, mlm_results, nsp_loss, nsp_results)``.
    data_loader: iterable of batches with the keys ``input_ids``,
      ``mask_labels``, ``token_type_ids``, ``attention_mask`` and ``is_next``.
    device: device the batch tensors are moved to.

  Returns:
    ``(mean_mlm_loss, mlm_accuracy, mean_nsp_loss, nsp_accuracy)``. Any value
    with nothing to average over (e.g. an empty loader) is NaN.
  """
  model = model.eval()

  mlm_losses = []
  nsp_losses = []
  # Running counts instead of ever-growing prediction lists: re-concatenating
  # the full list on every batch is quadratic in the number of predictions.
  mlm_correct = mlm_total = 0
  nsp_correct = nsp_total = 0

  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      mask_labels = batch["mask_labels"].to(device)
      token_type_ids = batch["token_type_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      is_next = batch["is_next"].to(device)

      mlm_loss, mlm_results, nsp_loss, nsp_results = model(input_ids, token_type_ids, attention_mask, mask_labels, is_next)
      mlm_losses.append(mlm_loss.item())
      nsp_losses.append(nsp_loss.item())
      mlm_correct += sum(mlm_results)
      mlm_total += len(mlm_results)
      nsp_correct += sum(nsp_results)
      nsp_total += len(nsp_results)

  nan = float("nan")
  return (
      np.mean(mlm_losses) if mlm_losses else nan,
      mlm_correct / mlm_total if mlm_total else nan,
      np.mean(nsp_losses) if nsp_losses else nan,
      nsp_correct / nsp_total if nsp_total else nan,
  )

In [None]:
%%time

# Main DAPT loop: one training pass followed by one evaluation pass per epoch,
# printing the mean MLM/NSP loss and accuracy of each pass.
for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # Updates the model in place; the optimizer and scheduler advance once per batch.
  train_mlm_loss, train_mlm_acc, train_nsp_loss, train_nsp_acc = train_epoch(
      model,
      train_data_loader,
      optimizer,
      device,
      scheduler
  )

  print(f"Train mlm loss {train_mlm_loss} mlm accuracy {train_mlm_acc}")
  print(f"Train nsp loss {train_nsp_loss} nsp accuracy {train_nsp_acc}")
  print()

  # Held-out pass; the masking is drawn afresh, so the numbers vary slightly between runs.
  val_mlm_loss, val_mlm_acc, val_nsp_loss, val_nsp_acc = eval_model(
      model,
      test_data_loader,
      device
  )

  print(f"Val   mlm loss {val_mlm_loss} mlm accuracy {val_mlm_acc}")
  print(f"Val   nsp loss {val_nsp_loss} nsp accuracy {val_nsp_acc}")
  print()

In [None]:
# Persist only the adapted BERT encoder (model.bert); the NSP and MLM heads are not written.
model.bert.save_pretrained("bert_mlm_nsp_cased")