# BERT from scratch

This content is loosely based on James Briggs' [tutorial](https://www.kdnuggets.com/2021/08/train-bert-model-scratch.html) "How to Train a BERT Model From Scratch".

Differently from the original tutorial, the latin language dataset is used - not the best choice for accuracy, but it is a small dataset and evaluations come easy.

## Getting the data

In [12]:
!pip install datasets



In [13]:
import datasets
all_ds = datasets.list_datasets()
all_ds[:5] 

['amirveyseh/acronym_identification',
 'ade-benchmark-corpus/ade_corpus_v2',
 'UCLNLP/adversarial_qa',
 'Yale-LILY/aeslc',
 'nwu-ctext/afrikaans_ner_corpus']

In [14]:
'oscar' in all_ds

False

In [15]:
dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_la')

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 18808
    })
})

In [17]:
dataset['train'][0] 

{'id': 0,
 'text': 'Hæ sunt generationes Noë: Noë vir justus atque perfectus fuit in generationibus suis; cum Deo ambulavit.\nEcce ego adducam aquas diluvii super terram, ut interficiam omnem carnem, in qua spiritus vitæ est subter cælum: universa quæ in terra sunt, consumentur.\nTolles igitur tecum ex omnibus escis, quæ mandi possunt, et comportabis apud te: et erunt tam tibi, quam illis in cibum.'}

In [18]:
from tqdm.auto import tqdm

text_data = []
file_count = 0

for sample in tqdm(dataset['train']):
    sample = sample['text'].replace('\n', '')
    text_data.append(sample)
    if len(text_data) == 6_000:
        # once we git the 6K mark, save to file
        with open(f'oscar_la/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
# after saving in 6K chunks, we will have ~808 leftover samples, we save those now too
with open(f'oscar_la/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text_data))

  0%|          | 0/18808 [00:00<?, ?it/s]

## Building a tokenizer

In [19]:
from pathlib import Path
paths = [str(x) for x in Path('oscar_la').glob('**/*.txt')] 

In [20]:
paths

['oscar_la/text_2.txt',
 'oscar_la/text_3.txt',
 'oscar_la/text_1.txt',
 'oscar_la/text_0.txt']

In [21]:
!pip install transformers 



In [22]:
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer() 

In [23]:
tokenizer.train(files=paths, 
                vocab_size=30_522,
                min_frequency=2,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>']) 






In [24]:
# use the tokeinzer to tokenize text string 'i will go to Paris', i want to check the tokens split by the tokenizer.
tokenizer.encode('i will go to Paris for 15 days, putain.').tokens


['i',
 'Ġwill',
 'Ġgo',
 'Ġto',
 'ĠParis',
 'Ġfor',
 'Ġ15',
 'Ġda',
 'ys',
 ',',
 'Ġput',
 'ain',
 '.']

In [25]:
import os

os.mkdir('./liberto')

tokenizer.save_model('liberto') 

FileExistsError: [Errno 17] File exists: './liberto'

## Initializing the tokenizer

In [26]:
from transformers import RobertaTokenizer

# initialize the tokenizer using the tokenizer we initialized and saved to file
tokenizer = RobertaTokenizer.from_pretrained('liberto', max_len=512) 

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [27]:
tokenizer


RobertaTokenizer(name_or_path='liberto', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [31]:
# test our tokenizer on a simple sentence
tokens = tokenizer('quo vadis?') 

In [32]:
tokens

{'input_ids': [0, 3106, 14116, 35, 2], 'attention_mask': [1, 1, 1, 1, 1]}

In [33]:
tokens.input_ids

[0, 3106, 14116, 35, 2]

## Creating the Input Pipeline

## Preparing the data

In [34]:
with open('oscar_la/text_0.txt', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n') 

In [35]:
lines[0] 

'Hæ sunt generationes Noë: Noë vir justus atque perfectus fuit in generationibus suis; cum Deo ambulavit.Ecce ego adducam aquas diluvii super terram, ut interficiam omnem carnem, in qua spiritus vitæ est subter cælum: universa quæ in terra sunt, consumentur.Tolles igitur tecum ex omnibus escis, quæ mandi possunt, et comportabis apud te: et erunt tam tibi, quam illis in cibum.'

In [36]:
batch = tokenizer(lines, max_length=512, padding='max_length', truncation=True)
len(batch) 

2

In [37]:
for x in batch['input_ids']:
    print(x)
    break

[0, 44, 836, 337, 7597, 21560, 30, 21560, 609, 14600, 545, 9976, 517, 285, 16827, 1490, 31, 342, 1149, 15969, 18, 5436, 636, 10902, 4973, 12302, 761, 1516, 16, 329, 10904, 1458, 5203, 16, 285, 503, 3658, 9917, 297, 9259, 19092, 30, 3481, 1673, 285, 1127, 337, 16, 16795, 18, 56, 20711, 796, 2007, 349, 837, 14882, 16, 1673, 16329, 884, 16, 290, 26857, 494, 486, 30, 290, 1933, 508, 591, 16, 350, 1144, 285, 4729, 18, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [38]:
import torch

labels = torch.tensor([x for x in batch['input_ids']])
mask = torch.tensor([x for x in batch['attention_mask']]) 

In [56]:
labels

tensor([[    0,  2226,   726,  ...,     1,     1,     1],
        [    0,  9176,   706,  ...,     1,     1,     1],
        [    0,  7044,   884,  ...,     1,     1,     1],
        ...,
        [    0,    44,  1109,  ...,     1,     1,     1],
        [    0, 23782,   358,  ...,     1,     1,     1],
        [    0, 13160,   880,  ...,     1,     1,     1]])

In [57]:
mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [62]:
mask[11]



tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [40]:
# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()
# create random array of floats with equal dims to input_ids
rand = torch.rand(input_ids.shape)
# mask random 15% where token is not 0 [PAD], 1 [CLS], or 2 [SEP]
# mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)
mask_arr = (rand < .15) * (input_ids > 2) 
# loop through each row in input_ids tensor (cannot do in parallel)
for i in range(input_ids.shape[0]):
    # get indices of mask positions from mask array
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    # mask input_ids
    input_ids[i, selection] = 3  # our custom [MASK] token == 3 

In [41]:
input_ids.shape

torch.Size([6000, 512])

In [42]:
input_ids[0][:200] 

tensor([    0,    44,   836,   337,  7597, 21560,    30, 21560,   609, 14600,
          545,  9976,   517,     3, 16827,  1490,     3,     3,  1149, 15969,
            3,     3,   636,     3,  4973, 12302,   761,  1516,    16,     3,
        10904,  1458,  5203,     3,     3,   503,  3658,  9917,   297,  9259,
        19092,    30,  3481,  1673,   285,     3,     3,    16, 16795,    18,
            3, 20711,   796,  2007,   349,     3, 14882,    16,  1673, 16329,
          884,    16,   290, 26857,   494,   486,    30,     3,  1933,   508,
          591,     3,   350,  1144,   285,  4729,    18,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

In [43]:
encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels} 

In [45]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [46]:
dataset = Dataset(encodings) 

In [47]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True) 

## Training the model

## Initializing the model 

In [48]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
    ) 

In [49]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config) 

## Training Preparation 

In [50]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device) 

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [51]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)



## Training 

In [52]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item()) 

  0%|          | 0/375 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [66]:
model.save_pretrained('./liberto')  # and don't forget to save liBERTo!

## The Real Test 

In [67]:
from transformers import pipeline

In [68]:
fill = pipeline('fill-mask', model='liberto', tokenizer='liberto')

In [74]:
fill(f'abundans {fill.tokenizer.mask_token} non nocet ') # abundans cautela non nocet

[{'score': 0.0004939708742313087,
  'sequence': 'abundans qui non nocet ',
  'token': 365,
  'token_str': ' qui'},
 {'score': 0.00044498726492747664,
  'sequence': 'abundans, non nocet ',
  'token': 16,
  'token_str': ','},
 {'score': 0.0003394597733858973,
  'sequence': 'abundans vel non nocet ',
  'token': 449,
  'token_str': ' vel'},
 {'score': 0.00030346045969054103,
  'sequence': 'abundans. non nocet ',
  'token': 18,
  'token_str': '.'},
 {'score': 0.00024564063642174006,
  'sequence': 'abundans expressit non nocet ',
  'token': 29638,
  'token_str': ' expressit'}]

In [75]:
fill(f'quod {fill.tokenizer.mask_token} demonstrandum') # quod erat demonstrandum 

[{'score': 0.00040464798803441226,
  'sequence': 'quod qui demonstrandum',
  'token': 365,
  'token_str': ' qui'},
 {'score': 0.00034588476410135627,
  'sequence': 'quod, demonstrandum',
  'token': 16,
  'token_str': ','},
 {'score': 0.00031329740886576474,
  'sequence': 'quod vel demonstrandum',
  'token': 449,
  'token_str': ' vel'},
 {'score': 0.0002996937255375087,
  'sequence': 'quod deserv demonstrandum',
  'token': 18483,
  'token_str': ' deserv'},
 {'score': 0.0002783830277621746,
  'sequence': 'quod. demonstrandum',
  'token': 18,
  'token_str': '.'}]