## BERT pre-training with masked language modeling (MLM) and a PyTorch DataLoader


### Ref:

(i) https://towardsdatascience.com/how-to-train-a-bert-model-from-scratch-72cfce554fc6

(ii) https://towardsdatascience.com/bert-for-next-sentence-prediction-466b67f8226f

In [None]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 7.7 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 55.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 63.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 71.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |█████████████

In [None]:
import os
from tqdm.auto import tqdm
from pathlib import Path
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertConfig,
    BertForMaskedLM,
    AdamW
)
import torch

In [None]:
# CONFIG
# Tunable settings shared by the cells below.

# Maximum number of text samples written to a single file under DATA_DIR
FILE_DATASET_LIMIT = 10_000
# Directory holding the plain-text training files
DATA_DIR = 'data'

# For MLM Pretraining
# Threshold compared against uniform random draws when choosing tokens to mask
MLM_MASK_RATIO = 0.15
# Intended mini-batch size for the training DataLoader
MLM_BATCH_SIZE = 16
# Number of full passes over the DataLoader in the training loop
MLM_EPOCHS = 2

# Checkpoint name passed to BertTokenizerFast.from_pretrained
MODEL_NAME = "bert-base-uncased"
# MODEL_NAME = "bert-base-multilingual-uncased"

# NOTE(review): not referenced by any cell visible in this notebook — presumably
# meant as the output location for the trained model; confirm before relying on it.
MODEL_SAVE_PATH = MODEL_NAME

### Dataset

In [None]:
# Alternative corpus (English Wikipedia + BookCorpus), kept for reference only:
# wiki = load_dataset("wikipedia", "20200501.en", split="train")
# bookcorpus = load_dataset("bookcorpus", split="train")
# print(wiki.column_names, bookcorpus.column_names)
# # ['title', 'text'] ['text']

# wiki.remove_columns_("title")
# bert_dataset = concatenate_datasets([wiki, bookcorpus])
# NOTE(review): concatenate_datasets is not imported by this notebook's imports
# cell; it would have to be imported before the lines above can be enabled.


# Load the train split of CC-News; the cells below read only each record's 'text' field.
dataset = load_dataset("cc_news", split="train")
bert_dataset = dataset

Downloading:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/932 [00:00<?, ?B/s]

Downloading and preparing dataset cc_news/plain_text (download: 805.98 MiB, generated: 1.88 GiB, post-processed: Unknown size, total: 2.67 GiB) to /root/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/ae469e556251e6e7e20a789f93803c7de19d0c4311b6854ab072fecb4e401bd6...


Downloading:   0%|          | 0.00/845M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

Dataset cc_news downloaded and prepared to /root/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/ae469e556251e6e7e20a789f93803c7de19d0c4311b6854ab072fecb4e401bd6. Subsequent calls will reuse this data.


In [None]:
# Show the dataset summary followed by its first record, one per line
print(bert_dataset, bert_dataset[0], sep='\n')

Dataset({
    features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url'],
    num_rows: 708241
})
{'title': 'Daughter Duo is Dancing in The Same Company', 'text': 'There\'s a surprising twist to Regina Willoughby\'s last season with Columbia City Ballet: It\'s also her 18-year-old daughter Melina\'s first season with the company. Regina, 40, will retire from the stage in March, just as her daughter starts her own career as a trainee. But for this one season, they\'re sharing the stage together.\nPerforming Side-By-Side In The Nutcracker\nRegina and Melina are not only dancing in the same Nutcracker this month, they\'re onstage at the same time: Regina is doing Snow Queen, while Melina is in the snow corps, and they\'re both in the Arabian divertissement. "It\'s very surreal to be dancing it together," says Regina. "I don\'t know that I ever thought Melina would take ballet this far."\nLeft: Regina and Melina with another company member post-snow scene in 2003. Rig

In [None]:
# Create the output directory, including missing parent directories.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(DATA_DIR, exist_ok=True)

Write the corpus to text files of at most 10,000 lines each (one sample per line).

In [None]:
def _write_text_chunk(chunk_lines, index):
    """Write `chunk_lines` (one sample per line) to DATA_DIR/text_<index>.txt."""
    with open(f'{DATA_DIR}/text_{index}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(chunk_lines))


text_data = []
file_count = 0

for sample in tqdm(bert_dataset):
    # Keep every sample on a single line. Line breaks (including '\r' and other
    # separators, which would otherwise split the sample when the file is read
    # back) are replaced by a space so neighbouring words are not glued together.
    text = ' '.join(sample['text'].splitlines()).strip()
    if not text:
        # an empty sample would only produce a blank training line
        continue
    text_data.append(text)
    if len(text_data) == FILE_DATASET_LIMIT:
        # once we hit the FILE_DATASET_LIMIT mark, save to file
        _write_text_chunk(text_data, file_count)
        text_data = []
        file_count += 1

# Saving the remaining data samples (skipped when there are none, so no empty
# trailing file is created)
if text_data:
    _write_text_chunk(text_data, file_count)

  0%|          | 0/708241 [00:00<?, ?it/s]

### Tokenizer

In [None]:
# Load the pretrained vocabulary/tokenizer that matches MODEL_NAME.
# BertTokenizerFast comes from the imports cell at the top of the notebook,
# so it is not re-imported here.
bert_tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

In [None]:
# Sanity check: tokenize one short sentence and print the resulting encoding
# (input_ids, token_type_ids and attention_mask)
tokens = bert_tokenizer('this is a test')
print(tokens)

{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [None]:
# Check whether PyTorch can see a CUDA-capable GPU
torch.cuda.is_available()

True

In [None]:
# Decode the ids 100..106 to see which tokens sit around the special-token range
bert_tokenizer.decode(list(range(100, 107)))

'[UNK] [CLS] [SEP] [MASK] [unused99] [unused100] [unused101]'

In [None]:
# Read the first text file back, one sample per line
first_file = os.path.join(DATA_DIR, 'text_0.txt')
with open(first_file, mode='r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

# Tokenize every line, truncating/padding each one to exactly 512 tokens
batch = bert_tokenizer(
    lines,
    max_length=512,
    padding='max_length',
    truncation=True,
)
# number of fields in the encoding
len(batch)

3

### Input Pipeline For MLM

In [None]:
def mlm(tensor, mask_ratio=None, mask_token_id=103, max_special_token_id=103):
    """
    Randomly replace token ids with the mask token id for MLM training.

    Each position is selected independently with probability `mask_ratio`.
    Positions whose id is `<= max_special_token_id` are never selected, which
    protects padding (0) and the special tokens [UNK]/[CLS]/[SEP]/[MASK]
    (100-103 in the vocabulary used by this notebook).

    Args:
        tensor: integer tensor of token ids. It is modified IN PLACE, so pass
            a clone if the original ids are still needed.
        mask_ratio: selection probability; defaults to the notebook-level
            MLM_MASK_RATIO when omitted.
        mask_token_id: id written into the selected positions (103 is [MASK]
            in the vocabulary used by this notebook).
        max_special_token_id: largest id that must never be masked.

    Returns:
        The same tensor object that was passed in, with selected ids replaced.
    """
    if mask_ratio is None:
        mask_ratio = MLM_MASK_RATIO
    # draw on the input's device so tensors on any device can be masked
    rand = torch.rand(tensor.shape, device=tensor.device)   # every val between 0-1
    # don't mask special tokens
    mask_arr = (rand < mask_ratio) & (tensor > max_special_token_id)
    # a single boolean-mask assignment replaces the per-row Python loop
    tensor[mask_arr] = mask_token_id
    return tensor


In [None]:
# all training files, sorted because glob() yields them in arbitrary order and
# a slice such as paths[:5] should select the same files on every run
paths = sorted(str(x) for x in Path(DATA_DIR).glob('*.txt'))
print(len(paths))

71


In [None]:
# Need to create three tensors for MLM Pretraining
# Each list collects one tensor per training file; the lists are concatenated
# into single tensors in a later cell.
input_ids = [] # token ids with masking applied
mask = []  # attention masks from the tokenizer
labels = []  # training targets

In [None]:
for path in tqdm(paths[:5]):
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    # one row per line, every row truncated/padded to exactly 512 tokens
    sample = bert_tokenizer(lines, max_length=512, padding='max_length',
                            truncation=True, return_tensors='pt')
    original_ids = sample['input_ids']
    # mlm() modifies its argument in place, so mask a copy
    masked_ids = mlm(original_ids.detach().clone())

    # Only the masked positions should contribute to the MLM loss: every
    # position that was left unchanged (including padding and special tokens)
    # gets the ignore index -100, which the model's loss skips.
    sample_labels = original_ids.detach().clone()
    sample_labels[masked_ids == original_ids] = -100

    labels.append(sample_labels)
    mask.append(sample['attention_mask'])
    input_ids.append(masked_ids)

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Merge the per-file tensors of each list into one tensor along the first dimension
input_ids, mask, labels = (
    torch.cat(chunks, dim=0) for chunks in (input_ids, mask, labels)
)

In [None]:
# Bundle the three tensors under the keyword names the model's forward pass is called with
encodings = dict(
    input_ids=input_ids,
    attention_mask=mask,
    labels=labels,
)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of equally long, row-indexable tensors."""

    def __init__(self, encodings):
        # mapping of field name -> tensor whose first dimension indexes samples
        self.encodings = encodings

    def __len__(self):
        # the sample count is taken from the first dimension of 'input_ids'
        n_rows = self.encodings['input_ids'].shape[0]
        return n_rows

    def __getitem__(self, i):
        # build one sample by taking row i of every field
        item = {}
        for name, values in self.encodings.items():
            item[name] = values[i]
        return item

In [None]:
# Wrap the tensors in the map-style Dataset and serve shuffled mini-batches
dataset = Dataset(encodings)

dataloader = torch.utils.data.DataLoader(
    dataset,
    # use the batch size from the CONFIG cell instead of a hard-coded 16
    batch_size=MLM_BATCH_SIZE,
    shuffle=True,
)

### Model

In [None]:
# A reduced BERT: 4 hidden layers and 4 attention heads at hidden size 768;
# every other setting keeps the BertConfig default.
config = BertConfig(
    # tie the embedding table to the tokenizer so the model stays valid if
    # MODEL_NAME is switched to a checkpoint with a different vocabulary
    vocab_size=bert_tokenizer.vocab_size,
    hidden_size=768,
    num_attention_heads=4,
    num_hidden_layers=4
)

# Initialize BERT model with a language modeling (LM) head.
# The weights are freshly initialised from the config, not loaded from a checkpoint.
model = BertForMaskedLM(config)

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# transfer the model to that device (as the last expression, this also displays the model)
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
# activate training mode
model.train()
# initialize optimizer: AdamW (as imported from transformers) over all model
# parameters with a learning rate of 1e-4
# NOTE(review): newer transformers releases reportedly deprecate this AdamW in
# favour of torch.optim.AdamW — confirm against the installed version.
optim = AdamW(model.parameters(), lr=1e-4)

In [None]:
for epoch in range(MLM_EPOCHS):
    # setup loop with TQDM and dataloader; the description is constant for the
    # whole epoch, so it is set once here instead of on every step
    loop = tqdm(dataloader, leave=True, desc=f'Epoch {epoch}')
    for train_batch in loop:
        # reset gradients accumulated by the previous step
        optim.zero_grad()
        # pull all tensor batches required for training.
        # The batch_* names are deliberate: reusing `input_ids` / `labels` here
        # would overwrite the notebook-level tensors built by the input pipeline.
        batch_input_ids = train_batch['input_ids'].to(device)
        batch_attention_mask = train_batch['attention_mask'].to(device)
        batch_labels = train_batch['labels'].to(device)
        # forward pass; passing labels makes the model return the loss as well
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask,
                        labels=batch_labels)
        # extract loss
        loss = outputs.loss
        # backpropagate: compute gradients for every parameter that requires grad
        loss.backward()
        # update parameters
        optim.step()
        # show the current loss in the progress bar
        loop.set_postfix(loss=loss.item())

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]