In [1]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 47.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 45.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 518 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |█████████

In [2]:
import os
import random
from tqdm import tqdm
import nltk
from nltk.tokenize import sent_tokenize
import torch
from transformers import (
    BertTokenizer,
    BertConfig,
    BertForPreTraining,
    AdamW
)
from datasets import load_dataset

In [3]:
# For sentence tokenization
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# MOUNTING DRIVE TO ACCESS DATASET
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# MOUNT PATH
DRIVE_PATH = os.path.join('drive','MyDrive','collab','research', 'bert_scratch')

In [6]:
MODEL_NAME = "bert-base-uncased"
VOCAB = 'eng'

# MODEL_NAME = "bert-base-multilingual-uncased"
# VOCAB = 'multilingual'

MODEL_SAVE_PATH = os.path.join(DRIVE_PATH, f"{MODEL_NAME.replace('-','_')}_{VOCAB}")

In [7]:
# CONFIGS

RANDOM_SEED=37

DATASET_LIMIT = 300_000

MODEL_MAX_LEN = 512
MLM_MASKING_PROB = .15

TRAIN_EPOCHS = 6
LEARNING_RATE = 5e-5
BS = 16

## Dataset

In [8]:
wiki = load_dataset("wikipedia", "20200501.en", split="train")
# bookcorpus = load_dataset("bookcorpus", split="train")
# print(wiki.column_names, bookcorpus.column_names)
# # ['title', 'text'] ['text']

# wiki.remove_columns_("title")
# bert_dataset = concatenate_datasets([wiki, bookcorpus])


# dataset = load_dataset("cc_news", split="train")

bert_dataset = wiki

Downloading:   0%|          | 0.00/4.24k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.87k [00:00<?, ?B/s]

Downloading and preparing dataset wikipedia/20200501.en (download: 16.99 GiB, generated: 17.07 GiB, post-processed: Unknown size, total: 34.06 GiB) to /root/.cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/009f923d9b6dd00c00c8cdc7f408f2b47f45dd4f5fb7982a21f9448f4afbe475...


Downloading:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.3G [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/009f923d9b6dd00c00c8cdc7f408f2b47f45dd4f5fb7982a21f9448f4afbe475. Subsequent calls will reuse this data.


In [9]:
bert_dataset

Dataset({
    features: ['title', 'text'],
    num_rows: 6078422
})

#### All sentences in the dataset

In [10]:
def get_sentences_from_paragraph(paragraph):
    """ Returns list of sentences from paragraph"""
    sentences = []
    for sentence in sent_tokenize(paragraph.replace('\n', ' ')):
        sentence_tokens = sentence.strip().split(' ')
        # filter successive space chars
        sentence_tokens = list(filter(lambda token: token!='', sentence_tokens)) 
        if sentence!='':
            if len(sentence_tokens)>4:
                # filtering incomplete sentence due to incorrect sentence segmentation
                sentences.append(' '.join(sentence_tokens))
    return sentences

In [11]:
# all_sentences = [sentence for paragraph in bert_dataset[:DATASET_LIMIT]['text'] for sentence in sent_tokenize(paragraph.replace('\n', ' ')) if sentence!='']

all_sentences = []

for paragraph in bert_dataset[:DATASET_LIMIT]['text']:
    sentences = get_sentences_from_paragraph(paragraph)
    all_sentences += sentences

all_sentences_cnt = len(all_sentences)

In [12]:
# with open('text.tst', 'w') as f:
#     for sent in all_sentences:
#         print(sent, file=f)

### NSP

Data Preparation

In [13]:
sentence_a = []
sentence_b = []
label = []

for paragraph in bert_dataset[:DATASET_LIMIT]['text']:

    sentences = get_sentences_from_paragraph(paragraph)
    num_sentences_in_para = len(sentences)

    if num_sentences_in_para>1:
        start_sent_indx = random.randint(0, num_sentences_in_para-2)
        sentence_a.append(sentences[start_sent_indx])

        if random.random()> 0.5:
            # isNextSentence
            sentence_b.append(sentences[start_sent_indx+1])
            label.append(0)
        else:
            # isNotNextSentnece          
            sentence_b.append(all_sentences[random.randint(0, all_sentences_cnt-1)])
            label.append(1)

In [14]:
for a, b,c in zip(sentence_a[:3], sentence_b[:3], label[:3]):
  print(a,'\n',b,'\n',c)
  print('----')

Many of the rooms feature period furniture, paintings and calligraphy, and the extensive Shifu Garden. 
 A delegation of centurions was sent to Rome from Africa, to assassinate Publius Aelius Vitalianus, the Praetorian prefect and to spread a rumor that Maximinus had been killed while campaigning against the Sarmatians. 
 1
----
Orana was one of the first disability service organisations to achieve Quality Accreditation. 
 Shortly after the 1923 election, the Labour party formed its first government, taking office on 22 January 1924. 
 1
----
Towards the end of the 16th century, John II, Duke of Schleswig-Holstein-Sonderburg commissioned the enlargement of the building in order to make it suitable for the function of the parish church of his city. 
 The lawsuit stated in part that he "had to endure, and still endures today, harassment and derision from his high-school mates and the public at large" and "will be under psychiatric care for an indefinite amount of time". 
 1
----


## Tokenizer

The only difference between bert-base-uncased and bert-base-multilingual-uncased is the vocab size in tokenizer:



bert-base-uncased: Embedding(30522, 768, padding_idx=0)

bert-base-multilingual-uncased: Embedding(105879, 768, padding_idx=0)


In [15]:
bert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
print(f'Tokenizer vocab size: {bert_tokenizer.vocab_size}')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizer vocab size: 30522


In [16]:
inputs = bert_tokenizer(sentence_a, sentence_b, return_tensors='pt',
                        max_length=MODEL_MAX_LEN, truncation=True, padding='max_length')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [17]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [18]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T
inputs['next_sentence_label'][:5]

tensor([[1],
        [1],
        [1],
        [1],
        [0]])

### MLM

In [19]:
inputs['labels'] = inputs.input_ids.detach().clone()

Masking the data

In [20]:
rand_arr = torch.rand(inputs.input_ids.shape)

# not masking special tokens
mask_arr = (rand_arr<MLM_MASKING_PROB) * (inputs.input_ids != 101) * (inputs.input_ids != 102)* (inputs.input_ids != 0)       # CLS, SEP, PAD


In [21]:
# Assign Mask Token Id where element is True
for i in range(inputs.input_ids.shape[0]):
    # for each row
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    inputs.input_ids[i, selection] = 103        # [MASK] token id

In [22]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    
    def __getitem__(self, idx):
        return {
           key:torch.tensor(val[i]) for key, val in self.encodings.items() 
        }

In [23]:
dataset = Dataset(inputs)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BS, shuffle=True)

## Model

### If we want to pretrain using Pretrained model

In [24]:
bert_model = BertForPreTraining.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Pretraining from scratch

In [25]:
config = BertConfig(
    num_hidden_layers=4, 
    num_attention_heads=4, 
)

bert_model = BertForPreTraining(config=config)

### GPU

In [26]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
bert_model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

### Train

In [27]:
# enable training mode
bert_model.train()

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [28]:
# ADAM with weighted decay
optim = AdamW(bert_model.parameters(), lr=LEARNING_RATE)

In [None]:
for epoch in range(TRAIN_EPOCHS):
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)

        # process
        outputs = bert_model(input_ids, token_type_ids=token_type_ids, 
                             attention_mask=attention_mask,
                             next_sentence_label=next_sentence_label,
                             labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  # Remove the CWD from sys.path while we load stuff.
Epoch 0:  14%|█▍        | 2554/17737 [18:54<1:52:01,  2.26it/s, loss=0.00111]

### Save Model

In [None]:
bert_model.save_pretrained(MODEL_SAVE_PATH)

### Test Model

In [None]:

from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=MODEL_SAVE_PATH,
    tokenizer=bert_tokenizer
)


fill_mask("I love to [MASK] rice.")

Some weights of the model checkpoint at drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.9997678399085999,
  'sequence': 'i love to rice.',
  'token': 0,
  'token_str': '[ P A D ]'},
 {'score': 5.396691631176509e-05,
  'sequence': 'i love to the rice.',
  'token': 1996,
  'token_str': 't h e'},
 {'score': 1.6131540178321302e-05,
  'sequence': 'i love to best rice.',
  'token': 2190,
  'token_str': 'b e s t'},
 {'score': 5.864294053026242e-06,
  'sequence': 'i love to burgundy rice.',
  'token': 18383,
  'token_str': 'b u r g u n d y'},
 {'score': 6.878402700749575e-07,
  'sequence': 'i love to rice.',
  'token': 101,
  'token_str': '[ C L S ]'}]

In [None]:
fill_mask = pipeline(
    "fill-mask",
    model='bert-base-uncased',
    tokenizer='bert-base-uncased'
)


fill_mask("I love to [MASK] rice.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.7250857353210449,
  'sequence': 'i love to eat rice.',
  'token': 4521,
  'token_str': 'eat'},
 {'score': 0.05431700497865677,
  'sequence': 'i love to make rice.',
  'token': 2191,
  'token_str': 'make'},
 {'score': 0.03795541077852249,
  'sequence': 'i love to have rice.',
  'token': 2031,
  'token_str': 'have'},
 {'score': 0.030118411406874657,
  'sequence': 'i love to cook rice.',
  'token': 5660,
  'token_str': 'cook'},
 {'score': 0.015348264016211033,
  'sequence': 'i love to do rice.',
  'token': 2079,
  'token_str': 'do'}]