In [3]:
import requests
from transformers import BertTokenizer, BertForPreTraining
import torch


Initialize Tokenizer and Model


In [4]:
# Both the tokenizer and the pre-training model come from the same checkpoint,
# so the checkpoint name is written once.
MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# BertForPreTraining carries both the MLM head and the NSP head.
model = BertForPreTraining.from_pretrained(MODEL_NAME, return_dict=True)


Get Data


In [5]:
# Download the raw training corpus from GitHub.
# - timeout: requests waits indefinitely by default, which would hang the cell on a stalled connection.
# - raise_for_status(): stop here on an HTTP error instead of treating an error page as training text.
DATA_URL = 'https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt'
data = requests.get(DATA_URL, timeout=30)
data.raise_for_status()

Transform Data into Text 

In [6]:
text = data.text.split('\n')        # Split the corpus on newlines: one list element per line (a line may hold several sentences)

Show Data

In [7]:
text[:5]                            # Show the first 5 lines of the corpus

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
 'From my great-grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally.',
 "From my governor, to be neither of the green nor of the blue party at the games in the Circus, nor a partizan either of the Parmularius or the Scutarius at the gladiators' fights; from him too I learned endurance of labour, and to want little, and to work with my own hands, and not to meddle with other people's affairs, and not to be ready to listen to slander."]

NSP Task

In [8]:
text[40].split('.')     # Split one line into sentences on '.'; a line ending in '.' yields a trailing empty string

['Labour not unwillingly, nor without regard to the common interest, nor without due consideration, nor with distraction; nor let studied ornament set off thy thoughts, and be not either a man of many words, or busy about too many things',
 " And further, let the deity which is in thee be the guardian of a living being, manly and of ripe age, and engaged in matter political, and a Roman, and a ruler, who has taken his post like a man waiting for the signal which summons him from life, and ready to go, having need neither of oath nor of any man's testimony",
 ' Be cheerful also, and seek not external help nor the tranquility which others give',
 ' A man then must stand erect, not be kept erect by others',
 '']

Create a "bag" containing every sentence in the corpus, from which random second sentences can be drawn

In [9]:
# Flatten the corpus into one list of non-empty sentences (lines split on '.').
bag = []
for para in text:
    bag.extend(piece for piece in para.split('.') if piece)

# Cached size, used as the bound when drawing a random sentence.
bag_size = len(bag)

Create NSP training Data

In [10]:
import random

sentence_a = []     # first sentence of each pair
sentence_b = []     # candidate second sentence of each pair
label = []          # 0 = sentence_b really follows sentence_a, 1 = sentence_b is a random sentence

for para in text:
    # Non-empty sentences of this line.
    parts = [piece for piece in para.split('.') if piece != '']
    if len(parts) < 2:
        # A pair needs at least two sentences; skip shorter lines.
        continue

    # Pick any sentence that still has a successor.
    first = random.randrange(0, len(parts) - 1)
    sentence_a.append(parts[first])

    if random.random() > 0.5:
        # Roughly half of the pairs: keep the true next sentence.
        sentence_b.append(parts[first + 1])
        label.append(0)
    else:
        # Otherwise: draw a random sentence from the whole corpus.
        sentence_b.append(bag[random.randrange(0, bag_size)])
        label.append(1)

    

Using Normal HuggingFace Transformers

In [11]:
# Encode every (sentence_a, sentence_b) pair as one sequence, truncated or
# padded to exactly 512 tokens, and return PyTorch tensors.
inputs = tokenizer(
    sentence_a,
    sentence_b,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors="pt",
)

Show Keys

In [12]:
inputs.keys()   # List the tensors returned by the tokenizer

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

We also need two more tensors: `next_sentence_label` (NSP targets) and `labels` (MLM targets)

In [13]:
# NSP targets as a column vector of shape (num_pairs, 1), dtype int64.
nsp_targets = torch.tensor(label, dtype=torch.long).unsqueeze(1)
inputs['next_sentence_label'] = nsp_targets
inputs['next_sentence_label'][:25]

tensor([[0],
        [1],
        [0],
        [1],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1]])

In [14]:
# MLM targets: a copy of the original token ids, taken before any [MASK] replacement.
# Padding positions (attention_mask == 0) are set to -100, the index the loss ignores,
# so the MLM loss is not dominated by trivially predicting [PAD] on the padded tail.
inputs['labels'] = inputs.input_ids.detach().clone().masked_fill(inputs.attention_mask == 0, -100)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

For MLM Task

In [15]:
rand = torch.rand(inputs.input_ids.shape)                    # one uniform [0, 1) draw per token position (same shape as input_ids)

In [16]:
# Positions that must never be masked: ids 101, 102 and 0
# ([CLS], [SEP] and [PAD] in the bert-base-uncased vocabulary).
is_special = (inputs.input_ids == 101) | (inputs.input_ids == 102) | (inputs.input_ids == 0)

# Select about 15% of the remaining positions for masking.
mask_arr = (rand < 0.15) & ~is_special

In [17]:
# Walk the mask row by row and overwrite the selected positions with id 103 ([MASK]).
for i, row_mask in enumerate(mask_arr):
    indices = row_mask.nonzero().flatten().tolist()     # selected positions in this sequence
    inputs.input_ids[i, indices] = 103

In [18]:
indices    # masked positions of the LAST sequence only (the loop overwrites this variable on every iteration)

[1,
 4,
 5,
 10,
 14,
 23,
 41,
 44,
 49,
 52,
 57,
 61,
 64,
 67,
 71,
 73,
 77,
 81,
 82,
 89,
 96,
 123,
 125,
 132,
 133,
 138,
 145,
 149,
 163]

PyTorch Dataloader

In [19]:
class MeditationsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of batch-first encodings.

    Args:
        encodings: Mapping from field name to an indexable batch (tensor or
            nested list) whose first dimension is the sample index. It must
            contain an ``'input_ids'`` entry, which defines the dataset length.
            Both a tokenizer ``BatchEncoding`` and a plain ``dict`` work.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with one independent tensor per field."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning on every
                # sample; detach().clone() is the recommended equivalent copy.
                item[key] = value.detach().clone()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        """Return the number of samples (rows of ``input_ids``)."""
        # Key access instead of attribute access so plain dicts are supported too.
        return len(self.encodings['input_ids'])

Pass Data to Dataloader

In [20]:
dataset = MeditationsDataset(inputs)                                                        # wrap the tokenized, masked encodings so they can be indexed per sample

Create Dataloader

In [21]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)                  # batches of 16 samples, reshuffled every epoch

Select the GPU if one is available, otherwise fall back to the CPU

In [22]:
# Prefer CUDA when PyTorch can see a GPU; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Training Loop

In [23]:
model.to(device)                                                                            # move the model's parameters and buffers to the selected device

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [24]:
model.train()                                                                               # switch to training mode (activates the Dropout layers)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

Initialize Optimizer

In [25]:
# Use PyTorch's own AdamW. The `transformers.AdamW` re-export is deprecated (and absent
# from recent transformers releases, where importing it fails), and it is not needed here.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


Create Training Loop

In [None]:
from tqdm import tqdm

# Fine-tune for two epochs, optimizing the combined MLM + NSP loss returned by the model.
for epoch in range(2):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # Move every tensor of the batch to the training device in one pass. The
        # batch keys (input_ids, token_type_ids, attention_mask, next_sentence_label,
        # labels) match the keyword arguments of the model's forward().
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()                   # clear gradients from the previous step
        outputs = model(**on_device)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Progress bar: current epoch and latest batch loss.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

                                                            