Reference:
1. [How to Code BERT Using PyTorch](https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial)

In [17]:
# --- Batch construction settings (read by make_batch) ---
maxlen = 30      # every input sequence is zero-padded up to this many token ids
batch_size = 6   # samples per batch, split between IsNext and NotNext pairs
max_pred = 5     # upper bound on the number of masked positions per sample

# --- Model dimensions ---
# NOTE(review): these are not used by the data-preparation cells visible here;
# presumably they are consumed by the model definition further down - confirm.
n_layers = 6
n_heads = 12
d_model = 768
d_ff = 4 * d_model               # 3072
d_k = d_v = d_model // n_heads   # 64
n_segments = 2   # make_batch emits segment ids 0 and 1


In [15]:
import re
import numpy as np

# Toy corpus: a short dialogue, one utterance per line.
text = (
    'Hello, how are you? I am Romeo.\n'
    'Hello, Romeo My name is Juliet. Nice to meet you.\n'
    'Nice to meet you too. How are you today?\n'
    'Great. My baseball team won the competition.\n'
    'Oh Congratulations, Juliet\n'
    'Thank you Romeo'
)

# Lower-case the corpus, strip the characters . , ! ? and -, then split it
# into one sentence per line.
sentences = re.sub("[.,!?-]", '', text.lower()).split('\n')

# Unique words of the corpus. Iteration order of a set of strings can differ
# between interpreter sessions, so the ids assigned below (and therefore the
# recorded outputs) are only stable within one kernel session.
word_list = list(set(" ".join(sentences).split()))

# token -> id; ids 0-3 are reserved for the special tokens.
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
for i, w in enumerate(word_list):
    word_dict[w] = i + 4

# id -> token. Built once after the vocabulary is complete (dicts keep
# insertion order, which here equals id order), so it is also defined when the
# corpus contains no words at all.
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)
number_dict, vocab_size

({0: '[PAD]',
  1: '[CLS]',
  2: '[SEP]',
  3: '[MASK]',
  4: 'juliet',
  5: 'is',
  6: 'team',
  7: 'competition',
  8: 'name',
  9: 'today',
  10: 'are',
  11: 'thank',
  12: 'i',
  13: 'the',
  14: 'meet',
  15: 'you',
  16: 'too',
  17: 'oh',
  18: 'to',
  19: 'am',
  20: 'congratulations',
  21: 'won',
  22: 'how',
  23: 'my',
  24: 'romeo',
  25: 'great',
  26: 'baseball',
  27: 'nice',
  28: 'hello'},
 29)

In [None]:
# Encode every sentence as the list of vocabulary ids of its
# whitespace-separated words (one inner list per sentence, in corpus order).
token_list = [
    [word_dict[word] for word in line.split()]
    for line in sentences
]

token_list

hello how are you i am romeo
[28, 22, 10, 15, 12, 19, 24]
hello romeo my name is juliet nice to meet you
[28, 24, 23, 8, 5, 4, 27, 18, 14, 15]
nice to meet you too how are you today
[27, 18, 14, 15, 16, 22, 10, 15, 9]
great my baseball team won the competition
[25, 23, 26, 6, 21, 13, 7]
oh congratulations juliet
[17, 20, 4]
thank you romeo
[11, 15, 24]


[[28, 22, 10, 15, 12, 19, 24],
 [28, 24, 23, 8, 5, 4, 27, 18, 14, 15],
 [27, 18, 14, 15, 16, 22, 10, 15, 9],
 [25, 23, 26, 6, 21, 13, 7],
 [17, 20, 4],
 [11, 15, 24]]

In [4]:
def make_batch():
    """Build one batch of masked sentence pairs (masked-LM + next-sentence data).

    Reads the module-level ``token_list``, ``word_dict``, ``vocab_size``,
    ``maxlen``, ``max_pred`` and ``batch_size``.

    Sentence pairs are drawn at random until the batch holds
    ``batch_size // 2`` IsNext pairs (the second sentence directly follows the
    first in ``token_list``) and ``batch_size - batch_size // 2`` NotNext
    pairs. For an even ``batch_size`` this is an even split.

    Returns:
        list: ``batch_size`` samples, each a list
        ``[input_ids, segment_ids, masked_tokens, masked_pos, is_next]``:

        * ``input_ids``     - ``[CLS] a [SEP] b [SEP]`` after masking,
          zero-padded to ``maxlen``.
        * ``segment_ids``   - 0 for ``[CLS] a [SEP]``, 1 for ``b [SEP]``,
          zero-padded to ``maxlen``.
        * ``masked_tokens`` - original ids of the selected positions,
          zero-padded to ``max_pred``.
        * ``masked_pos``    - the selected positions, zero-padded to
          ``max_pred``.
        * ``is_next``       - ``True`` for IsNext pairs, ``False`` otherwise.

    Raises:
        ValueError: if IsNext pairs are required but fewer than two sentences
            are available (the sampling loop could never finish).

    Note:
        Pairs longer than ``maxlen`` are not truncated; they are returned
        unpadded and therefore longer than ``maxlen``.
    """
    # Imported locally so the function does not depend on the execution order
    # of other notebook cells.
    from random import randint, random, randrange, shuffle

    n_sentences = len(token_list)
    # Integer targets: comparing against the float ``batch_size / 2`` never
    # terminates for an odd batch size.
    target_positive = batch_size // 2
    target_negative = batch_size - target_positive
    if target_positive > 0 and n_sentences < 2:
        raise ValueError(
            "make_batch needs at least two sentences to build IsNext pairs"
        )

    cls_id = word_dict['[CLS]']
    sep_id = word_dict['[SEP]']
    mask_id = word_dict['[MASK]']

    batch = []
    positive = negative = 0
    while positive < target_positive or negative < target_negative:
        tokens_a_index = randrange(n_sentences)
        tokens_b_index = randrange(n_sentences)

        # Reject the pair up front if its class is already full, instead of
        # doing the masking work and throwing it away.
        is_next = tokens_a_index + 1 == tokens_b_index
        if is_next and positive >= target_positive:
            continue
        if not is_next and negative >= target_negative:
            continue

        tokens_a = token_list[tokens_a_index]
        tokens_b = token_list[tokens_b_index]

        input_ids = [cls_id] + tokens_a + [sep_id] + tokens_b + [sep_id]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # Masked LM: select about 15% of the tokens (at least 1, at most
        # max_pred), never [CLS] or [SEP].
        n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15))))
        candidate_pos = [pos for pos, token in enumerate(input_ids)
                         if token != cls_id and token != sep_id]
        shuffle(candidate_pos)
        masked_tokens, masked_pos = [], []
        for pos in candidate_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            roll = random()
            if roll < 0.8:
                # 80%: replace with [MASK]
                input_ids[pos] = mask_id
            elif roll < 0.9:
                # 10%: replace with a random vocabulary id
                input_ids[pos] = randint(0, vocab_size - 1)
            # remaining 10%: keep the original token

        # Zero-pad the sequence to maxlen (no-op if it is already that long).
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero-pad the prediction targets to max_pred, based on how many
        # positions were actually masked (can be fewer than n_pred when there
        # are no maskable tokens).
        n_fill = max_pred - len(masked_tokens)
        masked_tokens.extend([0] * n_fill)
        masked_pos.extend([0] * n_fill)

        batch.append([input_ids, segment_ids, masked_tokens, masked_pos, is_next])
        if is_next:
            positive += 1
        else:
            negative += 1
    return batch