In [12]:
# Bootstrap: when fastai2 cannot be imported, clone the companion repository
# and install the packages this notebook imports.
try: import fastai2
except ImportError:
  !git clone -q https://github.com/richardyy1188/Pretrain-MLM-and-finetune-on-GLUE-with-fastai.git
  %pip install -q transformers fastai2 nlp

In [13]:
%cd Pretrain-MLM-and-finetune-on-GLUE-with-fastai

from IPython.core.debugger import set_trace as bk
from pathlib import Path
from functools import partial
import textwrap
import pandas as pd
import torch
from torch import nn
import nlp
from transformers import ElectraForMaskedLM, ElectraForPreTraining, ElectraTokenizer, ElectraTokenizerFast
from fastai2.text.all import *
from _utils.demo_data import load_demo_dataframe
from _utils.hf_integration import HF_Tokenizer, HF_TextBlock, HF_ModelWrapper
from _utils.would_like_to_pr import TextDataloader, LabelSmoothingCrossEntropyFlat

/home/yisiang/Pretrain-MLM-and-finetune-on-GLUE-with-fastai


In [14]:
""" tokenizer and fast tokenizer
We use normal tokenizer to get vocab, use fast tokenizer to convert tokens to ids.
Because we can't get vocab from fast tokenizer and fast tokenizer is faster,
and they have the same token-id mapping.
"""
# Both tokenizers are loaded from the same checkpoint name.
hf_tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-generator")
hf_fast_tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-generator")
# Pretrained small generator (masked-LM model) and small discriminator (pre-training model).
electra_generator = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
electra_discriminator = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=463.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=54236116.0, style=ProgressStyle(descrip…




In [15]:
# Sanity check: feed random token ids through both pretrained models and
# print the shape of the first element of each output.
vocab_size = hf_tokenizer.vocab_size
print(f'vocab_size: {vocab_size}')
size = (2,4)
print(f'Assume (batch size, sequence length) is {size}')
# Reuse `size` for the inputs so the printed assumption and the real input shape cannot drift apart.
g_output = electra_generator(torch.randint(0, vocab_size, size))
print(g_output[0].shape, ', [b][i][t] is logits(kind of score) for ith token in batch i to be token type t')
d_output = electra_discriminator(torch.randint(0, vocab_size, size))
print(d_output[0].shape, ', [b][i] is logits(kind of score) for whether ith token in batch i is replaced.') 

vocab_size: 30522
Assume (batch size, sequence length) is (2, 4)
torch.Size([2, 4, 30522]) , [b][i][t] is logits(kind of score) for ith token in batch i to be token type t
torch.Size([2, 4]) , [b][i] is logits(kind of score) for whether ith token in batch i is replaced.


# 1. Load Data

In [16]:
# Directory used as the dataset cache.
# expanduser() is required: pathlib does not expand '~' on its own, so without it
# mkdir() would create a directory literally named '~' under the current working directory.
cache_dir = Path('~/datasets').expanduser()
cache_dir.mkdir(parents=True, exist_ok=True) # create recursively if not exist
# Keep only the 'train' split of the 20200501 English Wikipedia dump.
wiki = nlp.load_dataset('wikipedia', '20200501.en', cache_dir=cache_dir)['train']

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=13008.0, style=ProgressStyle(descriptio…




In [None]:
def tokenize_tfm(example):
    """Add an 'input_ids' entry to `example`, computed from its 'text' entry.

    The text is split into tokens with the fast tokenizer and the tokens are
    converted to ids. `example` is modified in place and also returned.
    """
    tokens = hf_fast_tokenizer.tokenize(example['text'])
    example['input_ids'] = hf_fast_tokenizer.convert_tokens_to_ids(tokens)
    return example

In [22]:
# Path next to the dataset's first cache file, named 'tokenized_<split>.arrow'
# (presumably where the tokenized dataset is meant to be stored).
# NOTE(review): `split` is not assigned in any visible cell — confirm where it is defined
# (presumably 'train', the split selected when `wiki` was loaded).
cache_file = Path(wiki.cache_files[0]['filename']).parent / f'tokenized_{split}.arrow'

device(type='cuda', index=0)

**Prepare source dataframe**

In [4]:
# Load the demo dataframe through the helper imported from _utils.demo_data.
df = load_demo_dataframe()
# fastai can use the is_valid column to split this dataframe into train and validation sets
df.head()

NameError: name 'load_demo_dataframe' is not defined

## 1.2 Dataloaders using datablock api

In [5]:
# DataBlock definition: rows are split with ColSplitter(), x is read from the 'text' column,
# and HF_TextBlock receives the fast tokenizer, the slow tokenizer's vocab and a rule that
# rewrites the literal '<unk>' to the tokenizer's own unknown token.
db = DataBlock(splitter=ColSplitter(),
              blocks=HF_TextBlock.from_df('text', hf_fast_tokenizer, 
                                          vocab=list(hf_tokenizer.get_vocab()),
                                          rules=[lambda x: x.replace('<unk>', hf_tokenizer.unk_token)]),
              get_x=ColReader('text'),)
""" Note:
If you don't cancel pad first, you will get input with variable number of pad before sentence.
like 'pad ... pad pad I am input x pad .. pad pad'
Which you may cause add wrong position's position embeddings when using transformer architecture
"""
# Dataloaders with batch size 128 built on the project's TextDataloader; the tokenizer's
# CLS / SEP ids are passed as the bos / eos ids to add.
# NOTE(review): exact semantics of max_seq_len / agg_mode / remove_heads / remove_tails are
# defined in _utils.would_like_to_pr.TextDataloader — confirm there.
dls = db.dataloaders(df, shuffle_train=True, bs=128, 
                     dl_type=partial(TextDataloader, 
                                     max_seq_len=150,
                                     agg_mode='lines',
                                     sort_by_len=False,
                                     remove_heads=True,
                                     remove_tails=True,
                                     bos_idx_add=hf_tokenizer.cls_token_id,
                                     eos_idx_add=hf_tokenizer.sep_token_id))
# NOTE(review): the note below talks about specifying a device for the dataloaders, but no
# device argument is passed in the call above — confirm whether one is intended.
""" Note:
fastai wiil assure data and model are on the same device, so we just need to specify device for dataloaders,
and fastai will infer and move model to that device.
"""
dls.show_batch(max_n=2, trunc_at=None) # trunc_at=None: don't cut the text when longer than 150

NameError: name 'DataBlock' is not defined

# 2. Masked language model objective

## 2.1 MLM objective callback

In [6]:
# https://github.com/huggingface/transformers/blob/1789c7daf1b8013006b0aef6cb1b8f80573031c5/examples/run_language_modeling.py#L179
def mask_tokens(inputs, mask_token_index, vocab_size, special_token_indices, mlm_probability=0.15, ignore_index=-100):
  """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

  Args:
    inputs: integer tensor of token ids. It is NOT modified; a masked copy is returned.
    mask_token_index: id written into positions chosen to become the mask token.
    vocab_size: exclusive upper bound used when drawing random replacement ids.
    special_token_indices: iterable of token ids that must never be selected for masking.
    mlm_probability: probability of selecting each non-special position.
    ignore_index: label value written to every position that was not selected.
      ignore_index in nn.CrossEntropy is default to -100, so you don't need to specify ignore_index in loss.

  Returns:
    (masked_inputs, labels), both with the shape of `inputs`. `labels` holds the original
    token id at selected positions and `ignore_index` everywhere else.
  """
  device = inputs.device
  # Work on a copy so the caller's batch is left untouched.
  inputs = inputs.clone()
  labels = inputs.clone()

  # Positions holding a special token; built with tensor comparisons instead of
  # iterating over every element in Python.
  special_tokens_mask = torch.zeros_like(labels, dtype=torch.bool)
  for special_id in special_token_indices:
    special_tokens_mask |= labels == special_id

  # We sample a few tokens in each sequence for masked-LM training (with probability mlm_probability defaults to 0.15 in Bert/RoBERTa)
  # All helper tensors are created on the same device as `inputs`.
  probability_matrix = torch.full(labels.shape, mlm_probability, dtype=torch.float, device=device)
  probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
  mlm_mask = torch.bernoulli(probability_matrix).bool()
  labels[~mlm_mask] = ignore_index  # We only compute loss on masked tokens

  # 80% of the time, we replace masked input tokens with mask_token
  mask_token_mask = torch.bernoulli(torch.full(labels.shape, 0.8, dtype=torch.float, device=device)).bool() & mlm_mask
  inputs[mask_token_mask] = mask_token_index

  # 10% of the time, we replace masked input tokens with random word
  # (half of the selected positions that did not receive the mask token)
  replace_token_mask = torch.bernoulli(torch.full(labels.shape, 0.5, dtype=torch.float, device=device)).bool() & mlm_mask & ~mask_token_mask
  random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long, device=device)
  inputs[replace_token_mask] = random_words[replace_token_mask]

  # The rest of the time (10% of the time) we keep the masked input tokens unchanged
  return inputs, labels

class MaskedLMCallback(Callback):
  """ Callback that turns every batch into a masked-language-modeling batch.

  At the start of each batch the first input tensor is passed through `mask_tokens`;
  the masked ids become the model input and the returned labels become the target.
  Masking is re-sampled on every call.
  """
  @delegates(mask_tokens)
  def __init__(self, mask_tok_id, special_tok_ids, vocab_size, ignore_index=-100, **kwargs):
    """
    Args:
      mask_tok_id: id of the mask token.
      special_tok_ids: ids that must never be masked.
      vocab_size: vocabulary size used when drawing random replacement tokens.
      ignore_index: label value for positions that do not contribute to the loss.
      **kwargs: forwarded to `mask_tokens` (e.g. mlm_probability).
    """
    self.ignore_index = ignore_index
    self.mask_tokens = partial(mask_tokens,
                               mask_token_index=mask_tok_id,
                               special_token_indices=special_tok_ids,
                               vocab_size=vocab_size,
                               # forward the caller's value so labels and self.ignore_index agree
                               ignore_index=ignore_index,
                               **kwargs)

  def begin_batch(self):
    "Replace the learner's xb / yb with the masked inputs and their labels."
    text_indices = self.xb[0]
    masked_inputs, labels = self.mask_tokens(text_indices)
    self.learn.xb, self.learn.yb = (masked_inputs,), (labels,)

  @delegates(TfmdDL.show_batch)
  def show_batch(self, dl, verbose=True, show_ignore_idx=None, **kwargs):
    """ Mask one batch drawn from `dl` and display it with `dl.show_batch`.

    Args:
      dl: dataloader to draw the batch from.
      verbose: print explanatory notes before showing the batch.
      show_ignore_idx: when not None, ignored label positions are replaced by this
        token id before display.
      **kwargs: forwarded to `dl.show_batch`.
    """
    b = dl.one_batch()
    masked_inputs, labels = self.mask_tokens(b[0])
    # `is not None` so that token id 0 is accepted as a display token as well
    if show_ignore_idx is not None:
      labels[labels==self.ignore_index] = show_ignore_idx
    if verbose: 
      print("We won't count loss from position where y is ignore index")
      print("Notice 1. Positions have label token in y will be either [Mask]/other token/orginal token in x")
      print("Notice 2. Special tokens (CLS, SEP) won't be masked.")
      print("Notice 3. Dynamic masking: every time you run gives you different results.")
    dl.show_batch(b=(masked_inputs, labels), **kwargs)

NameError: name 'Callback' is not defined

In [7]:
# Build the MLM callback from the slow tokenizer's mask id, special ids and vocab size.
mlm_cb = MaskedLMCallback(mask_tok_id=hf_tokenizer.mask_token_id, 
                          special_tok_ids=hf_tokenizer.all_special_ids, 
                          vocab_size=hf_tokenizer.vocab_size)
# Preview a masked training batch; ignored label positions are displayed as the '@' token.
mlm_cb.show_batch(dls.train, max_n=2, show_ignore_idx=hf_tokenizer.vocab['@'])

NameError: name 'MaskedLMCallback' is not defined

## 2.2 Train

In [8]:
# Learner for the MLM objective: wrapped generator, label-smoothing cross entropy,
# ranger optimizer, the masking callback, and mixed precision via to_fp16().
learn = Learner(dls, HF_ModelWrapper(electra_generator,hf_tokenizer.pad_token_id),  
                loss_func=LabelSmoothingCrossEntropyFlat(), 
                metrics=accuracy,
                opt_func=ranger,
                cbs=[mlm_cb]).to_fp16()

# help you find the best learning rate for "one cycle learning rate schedule"
# the graph below is the result of learning rate finding
lr, _ = learn.lr_find()

# NOTE(review): `lr` found above is not used here; the fit uses a hard-coded 0.01 — confirm this is intended.
learn.fit_one_cycle(1, 0.01)

NameError: name 'Learner' is not defined

# 3. ELECTRA (replaced token detection objective)

See the arXiv paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://arxiv.org/abs/2003.10555) for details.

In [9]:
class Electra(nn.Module):
  """ Thin container module that holds a generator and a discriminator as sub-modules.

  It defines no forward pass of its own; it only exposes the two models as
  `self.generator` and `self.discriminator`.
  """
  def __init__(self, generator, discriminator):
    super().__init__()
    # registered in this order: generator first, then discriminator
    self.generator, self.discriminator = generator, discriminator

NameError: name 'nn' is not defined

In [10]:
class ElectraCallback(MaskedLMCallback):
  """ Callback adding the replaced-token-detection (discriminator) objective on top of MLM.

  For each batch the learner's model is switched to the generator, which is trained with the
  learner's loss function on the masked batch. After that loss is computed, the generator's
  predictions are written into the masked positions, the discriminator is run on the result,
  and the two losses are combined as a weighted sum using `loss_weights`.
  """
  run_after=MixedPrecision
  @delegates(MaskedLMCallback.__init__)
  def __init__(self, electra_model, mask_tok_id, special_tok_ids, vocab_size, loss_weights=(1.0, 50.0), **kwargs):
    """
    Args:
      electra_model: object exposing `.generator` and `.discriminator`.
      mask_tok_id, special_tok_ids, vocab_size: forwarded to MaskedLMCallback.
      loss_weights: (generator loss weight, discriminator loss weight).
      **kwargs: forwarded to MaskedLMCallback.__init__.
    """
    super().__init__(mask_tok_id, special_tok_ids, vocab_size, **kwargs)
    assert len(loss_weights)==2, 'loss_weights should contain element 0 and 1 for weight of generator loss and dicriminator loss respectively.'
    store_attr(self, 'electra_model, mask_tok_id, loss_weights')
    # This is equivalent of tf.nn.sigmoid_cross_entropy_with_logits, which is a sigmoid layer
    # before cross entropy, when value of targets is either 1.0 or 0.0 .
    self.discri_loss_func = nn.BCEWithLogitsLoss() # see ElectraForPreTraining.forward(https://github.com/huggingface/transformers/blob/b86e42e0ac1b59f21f0eccf351d3346bbe3ed4eb/src/transformers/modeling_electra.py#L518
  
  def begin_fit(self):
    "Remember the MixedPrecision callback, if the learner has one, for loss scaling."
    for cb in self.cbs:
      if isinstance(cb, MixedPrecision): self.fp16_cb = cb
  def after_fit(self):
    "Drop the reference stored in begin_fit."
    if hasattr(self, 'fp16_cb'): delattr(self, 'fp16_cb')

  def after_loss(self):
    "Run the discriminator on the generator's output and fold its loss into the learner's loss."
    # prepare discriminator's inputs and labels
    masked_inputs, labels = self.xb[0], self.yb[0] # both (batch size, sequence length)
    masked_pos_mask = masked_inputs == self.mask_tok_id # 1 for masked position, 0 for the others
    pred_toks = self.pred.argmax(dim=-1)
    # Use predicted token to fill masked position. The prediction must REPLACE the mask token;
    # adding it to the input would yield mask_tok_id + predicted id, which is a wrong
    # (and possibly out-of-vocabulary) token id.
    generated = masked_inputs.clone()
    generated[masked_pos_mask] = pred_toks[masked_pos_mask]
    replaced = (masked_pos_mask & (pred_toks != labels))  # is masked token and not equal to predicted
    
    discri_pred = self.electra_model.discriminator(generated).to(dtype=self.pred.dtype) # (bs, seq_len, 1)
    discri_pred = discri_pred.squeeze(-1) # (bs, seq_len)
    if hasattr(self, 'fp16_cb'): discri_pred = to_float(discri_pred)
    discri_loss = self.discri_loss_func(discri_pred, replaced.to(dtype=discri_pred.dtype))
    # NOTE(review): presumably matches the scaling MixedPrecision applied to self.loss
    # (this callback runs after it) — confirm it also holds outside training.
    if hasattr(self, 'fp16_cb'): discri_loss *= self.fp16_cb.loss_scale
    # weighted sum: loss_weights[0] * generator loss + loss_weights[1] * discriminator loss
    self.learn.loss = torch.stack([self.loss, discri_loss]).matmul(torch.tensor(self.loss_weights, dtype=self.loss.dtype, device=self.loss.device))

    # switch the learner back to the full model once the forward passes are done
    self.learn.model = self.electra_model

  def begin_batch(self):
    "Mask the batch, then make the generator the learner's model for the forward pass."
    super().begin_batch() # -> self.xb, self.yb = (masked_inputs,),(answers,)
    self.learn.model = self.electra_model.generator

NameError: name 'MaskedLMCallback' is not defined

* The loss is very large

  Because Electra scales the discriminator loss by 50.

* Original optimization hyperparameters and schedule

  See the [config](https://github.com/google-research/electra/blob/79111328070e491b287c307906701ebc61091eb2/configure_pretraining.py#L40) of the official repository.



In [11]:
# Wrap both pretrained models (with the pad token id) and bundle them into one Electra module.
electra_model = Electra(HF_ModelWrapper(electra_generator, pad_id=hf_tokenizer.pad_token_id), 
                        HF_ModelWrapper(electra_discriminator, pad_id=hf_tokenizer.pad_token_id))

# Callback that masks each batch and adds the discriminator loss (default loss_weights are used).
electra_cb = ElectraCallback(electra_model, 
                             mask_tok_id=hf_tokenizer.mask_token_id, 
                             special_tok_ids=hf_tokenizer.all_special_ids, 
                             vocab_size=hf_tokenizer.vocab_size)
learn = Learner(dls, electra_model,  
                loss_func=LabelSmoothingCrossEntropyFlat(), # will only be applied to generator, the loss function for discriminator is specified in callback 
                metrics=accuracy,
                opt_func=ranger,
                cbs=[electra_cb]).to_fp16()

# help you find the best learning rate for "one cycle learning rate schedule"
# the graph below is the result of learning rate finding
lr, _ = learn.lr_find()

# Train for one epoch with the first learning rate returned by lr_find.
learn.fit_one_cycle(1, lr)

NameError: name 'Electra' is not defined