# Module 1 Project 3: BERT

Implement BERT and play around with it

## STEP 0: IMPORT THE NECESSARY LIBRARIES
- We need Torch, transformers, and numpy for this implementation

In [None]:
import os
from pathlib import Path
import torch
import re
import random
import transformers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import itertools
import math
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam

## STEP 1: DATA LOADING AND PREPROCESSING
- Here, the provided data is two text files, one with lines from a conversation in a movie, and one with each line containing the line numbers of a given conversation
- We are splitting on the special delimiter " +++$+++ " as the examples in the data file are delimited with this
- We grab the last piece of splitting on the above delimiter to get the text or line numbers
- Lastly we build question/answer pairs using the line numbers from the conversation data file
- This gives us a solid dataset of question and answer pairs to train our model on

In [None]:
import ast

# Max input tokens
MAX_LEN = 64

# Delimiter that separates the fields on every line of both data files.
FIELD_SEP = " +++$+++ "

corpus_movie_conv = './movie_data/movie_conversations.txt'
corpus_movie_lines = './movie_data/movie_lines.txt'
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as conv_file:
    conv = conv_file.readlines()
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as lines_file:
    lines = lines_file.readlines()

# Map each line id (first field) to its text (last field).
lines_dic = {}
for line in lines:
    objects = line.split(FIELD_SEP)
    lines_dic[objects[0]] = objects[-1]

# Build [first, second] pairs from every two consecutive lines of a conversation.
pairs = []
for con in conv:
    if not con.strip():
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue

    # The last field is the textual form of a Python list of line ids.
    # ast.literal_eval only accepts literals, so unlike eval() it cannot
    # execute arbitrary code embedded in the data file.
    ids = ast.literal_eval(con.split(FIELD_SEP)[-1].strip())

    for first_id, second_id in zip(ids, ids[1:]):
        first = lines_dic[first_id].strip()
        second = lines_dic[second_id].strip()

        # Keep at most MAX_LEN whitespace-separated words per sentence.
        pairs.append([
            ' '.join(first.split()[:MAX_LEN]),
            ' '.join(second.split()[:MAX_LEN]),
        ])

print(pairs[12])  # Printing a random pair as an example

## STEP 2: DATA PREPARATION
- Once we have the question/answer pairs as displayed above, we can generate the data files to feed into our model
- BertWordPieceTokenizer from `tokenizers` needs to process a list of file paths that represent our corpus
- So we split the first sentence of each question/answer pair into text files with 10000 lines (one sentence per line) in each one
- This should result in around ~21 text files in `./data` if using the provided datasets

In [None]:
# exist_ok=True so that re-running this cell does not fail when the
# directory was already created by a previous run.
os.makedirs('./data', exist_ok=True)
text_data = []
file_count = 0

for sample in [x[0] for x in pairs]:
    text_data.append(sample)

    # Each text file holds 10000 samples, one per line
    if len(text_data) == 10000:
        with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# Write whatever is left over after the last full chunk; without this the
# final (< 10000) samples would never reach the tokenizer's training corpus.
if text_data:
    with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))
    text_data = []
    file_count += 1

# NOTE: the glob picks up every .txt file under ./data, including files left
# there by an earlier run. Sorted so the order is reproducible.
paths = sorted(str(x) for x in Path('./data').glob('**/*.txt'))

# Printing the first path as an example
print(paths[0])

## STEP 3: TOKENIZER
- We will be using BertWordPieceTokenizer from `tokenizers`
- The use of a pre-built tokenizer here is so we don't have to re-implement the BERT tokenizer
- Wordpieces prefix is an important concept here not covered in the tokenizer piece of this module
- Special tokens are used to denote the start and end of sentences, as well as a `[MASK]` token for use in masking
- We will train the tokenizer with a vocab size of 30000
- We will save the vocab to `./bert-it-1/bert-it-vocab.txt`

In [None]:
# WordPiece tokenizer that is trained from scratch on the corpus files.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )

# exist_ok=True so that re-running this cell does not fail when the
# directory was already created by a previous run.
os.makedirs('./bert-it-1', exist_ok=True)
tokenizer.save_model('./bert-it-1', 'bert-it')

# Reload the saved vocabulary through the `transformers` tokenizer class,
# which is the object the rest of the notebook uses.
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)

# Assert we have our vocab file created correctly
print(os.path.exists("./bert-it-1/bert-it-vocab.txt"))

## STEP 4: DATASET CLASS WITH TOKENIZATION
- We use a wrapper class here to handle the dataset loading and tokenization pieces with one call
- This class will be wrapped in a torch DataLoader during training and enumerates at top level with `__getitem__`
- When a new item is produced, two sentences are chosen, a 'first' and a 'next', with the goal of predicting the likelihood that the 'next' sentence actually follows the 'first' one
- There is a 50% chance the returned 'next' sentence will actually follow the given 'first' sentence
- Each word has a 15% chance of being selected for prediction; a selected word is replaced by the token '[MASK]' 80% of the time, by a random token 10% of the time, and left unchanged the remaining 10% of the time
- This comes into play later with Masked Language Modeling

In [None]:
class BERTDataset(Dataset):
    """Sentence-pair dataset that yields masked-LM / next-sentence examples.

    Each element of ``data_pair`` is indexed as ``pair[0]`` (first sentence)
    and ``pair[1]`` (the sentence that follows it). ``tokenizer`` must be
    callable on a single word, returning a mapping with an ``'input_ids'``
    list whose first and last entries are dropped, and must expose a
    ``vocab`` mapping containing ``[PAD]``, ``[CLS]``, ``[SEP]`` and
    ``[MASK]``.
    """

    def __init__(self, data_pair, tokenizer, seq_len=64):
        self.lines = data_pair
        self.corpus_lines = len(data_pair)
        self.seq_len = seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):
        """Return one example as a dict of tensors.

        Keys: ``bert_input`` (token ids), ``bert_label`` (ids to predict,
        0 where nothing is predicted), ``segment_label`` (1 for the first
        sentence, 2 for the second) and ``is_next`` (1 or 0). The three
        sequences are cut to ``seq_len`` and right-padded with the
        ``[PAD]`` id.
        """
        first, second, is_next_label = self.get_sent(item)

        first_ids, first_targets = self.random_word(first)
        second_ids, second_targets = self.random_word(second)

        vocab = self.tokenizer.vocab
        cls_id, sep_id, pad_id = vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']

        # [CLS] first [SEP] second [SEP]; special positions carry a PAD label.
        first_ids = [cls_id, *first_ids, sep_id]
        second_ids = [*second_ids, sep_id]
        first_targets = [pad_id, *first_targets, pad_id]
        second_targets = [*second_targets, pad_id]

        limit = self.seq_len
        segment_label = ([1] * len(first_ids) + [2] * len(second_ids))[:limit]
        bert_input = (first_ids + second_ids)[:limit]
        bert_label = (first_targets + second_targets)[:limit]

        # All three sequences have the same length here, so one padding
        # list fits them all.
        padding = [pad_id] * (limit - len(bert_input))
        bert_input += padding
        bert_label += padding
        segment_label += padding

        return {
            "bert_input": torch.tensor(bert_input),
            "bert_label": torch.tensor(bert_label),
            "segment_label": torch.tensor(segment_label),
            "is_next": torch.tensor(is_next_label),
        }

    def random_word(self, sentence):
        """Tokenize ``sentence`` word by word and randomly corrupt it.

        A word is selected with probability 0.15. A selected word's ids
        become the label, and its input ids are replaced by ``[MASK]``
        (80%), replaced by random vocabulary ids (10%) or kept (10%).
        Unselected words keep their ids and get label 0.

        Returns ``(input_ids, label_ids)``, two flat lists of equal length.
        """
        corrupted = []
        targets = []

        for word in sentence.split():
            draw = random.random()
            piece_ids = self.tokenizer(word)['input_ids'][1:-1]

            if draw >= 0.15:
                # Not selected: keep the ids, nothing to predict.
                corrupted.extend(piece_ids)
                targets.extend(0 for _ in piece_ids)
                continue

            # Rescale the draw to [0, 1) to pick the corruption type.
            draw /= 0.15
            if draw < 0.8:
                corrupted.extend(self.tokenizer.vocab['[MASK]'] for _ in piece_ids)
            elif draw < 0.9:
                corrupted.extend(
                    random.randrange(len(self.tokenizer.vocab)) for _ in piece_ids
                )
            else:
                corrupted.extend(piece_ids)
            targets.extend(piece_ids)

        assert len(corrupted) == len(targets)
        return corrupted, targets

    def get_sent(self, index):
        """Return ``(first, second, is_next)`` for the pair at ``index``.

        With a draw above 0.5 the true following sentence is returned with
        label 1; otherwise a randomly drawn second sentence with label 0.
        """
        first, true_next = self.get_corpus_line(index)

        if random.random() <= 0.5:
            return first, self.get_random_line(), 0
        return first, true_next, 1

    def get_corpus_line(self, item):
        """Return the two sentences stored at position ``item``."""
        pair = self.lines[item]
        return pair[0], pair[1]

    def get_random_line(self):
        """Return the second sentence of a uniformly chosen pair."""
        pick = random.randrange(len(self.lines))
        return self.lines[pick][1]

## STEP 5: TEST THE DATASET GENERATION
- We want to confirm that our Tokenizer works with our Dataset class.
- So we build a `train_data` variable to store our data given the Q/A pairs we generated earlier.
- We then display the 'next' data piece in the data loader (random) and we choose a random piece from our Dataset class to see its value

In [None]:
# Wrap the sentence pairs in the dataset and a shuffling batch loader.
train_data = BERTDataset(pairs, tokenizer=tokenizer, seq_len=MAX_LEN)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, pin_memory=True)

# Pull a single batch from the loader and show it.
batch_iter = iter(train_loader)
sample_data = next(batch_iter)
print(sample_data)

# Show one individual (un-batched) example picked at random.
random_index = random.randrange(len(train_data))
print(train_data[random_index])

## STEP 6: POSITIONAL EMBEDDING
- BERT works with 3 types of embedding: Token, Positional, and Segment embedding
- Token embedding is a simple lookup of size `vocab_size` with each embedding having size `embed_size`
- Positional embedding works differently, where `sin` and `cos` are used to capture the positional information in a vector with a given sequence length
- Segment embedding is yet another simple lookup of whether or not a token belongs to sentence A or sentence B, with each embedding having size `embed_size`

In [None]:
class PositionalEmbedding(torch.nn.Module):
    """Fixed (non-learned) sinusoidal position table.

    The table has shape ``(1, max_len, hidden_size)``. Entry ``[pos, d]`` is
    ``sin(pos / 10000 ** (2 * d / hidden_size))`` for even ``d`` and
    ``cos(pos / 10000 ** (2 * d / hidden_size))`` for odd ``d``.

    NOTE(review): these are the same values the original loop produced. The
    exponent uses ``2 * d`` with ``d`` already being the dimension index, so
    the frequencies differ from the formulation in "Attention Is All You
    Need" (where a sin/cos pair shares the exponent ``2k / hidden_size`` for
    pair index ``k``). Left unchanged so existing runs are not affected;
    confirm whether the deviation is intended.

    Args:
        hidden_size: Size of each position vector.
        max_len: Number of positions in the table.
    """

    def __init__(self, hidden_size, max_len=128):
        super().__init__()

        # Computed in float64 and cast once, mirroring the precision of the
        # scalar math.sin/math.cos computation this replaces.
        positions = torch.arange(max_len, dtype=torch.float64).unsqueeze(1)
        dims = torch.arange(hidden_size, dtype=torch.float64)
        angles = positions / torch.pow(10000.0, 2.0 * dims / hidden_size)

        pe = torch.zeros(max_len, hidden_size, dtype=torch.float64)
        pe[:, 0::2] = torch.sin(angles[:, 0::2])
        pe[:, 1::2] = torch.cos(angles[:, 1::2])

        # A buffer follows the module across .to(device)/.cuda() calls and is
        # never a trainable parameter. persistent=False keeps it out of the
        # state_dict, so the set of checkpoint keys stays what it was when
        # `pe` was a plain attribute.
        self.register_buffer("pe", pe.float().unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return the position table cut to the sequence length of ``x``.

        Args:
            x: Input whose second dimension is the sequence length. Only its
                shape is used. Inputs with fewer than two dimensions get the
                full table.

        Returns:
            Tensor of shape ``(1, min(x.size(1), max_len), hidden_size)``.
        """
        if x.dim() < 2:
            return self.pe
        return self.pe[:, :x.size(1)]

## STEP 7: BERT EMBEDDING
- Combine the PositionalEmbedding class with the Token and Segment embeddings using `torch.nn.Embedding`
- Add in a dropout layer using a dropout of 0.1
- When this BERTEmbedding class is utilized, it will combine the Token, Positional, and Segment embeddings and returns the result after dropout

In [None]:
class BERTEmbedding(torch.nn.Module):
    """Sum of token, positional and segment embeddings, followed by dropout.

    Args:
        vocab_size: Number of rows in the token lookup table.
        embed_size: Size of every embedding vector.
        seq_len: Number of positions handed to ``PositionalEmbedding``.
        dropout: Dropout probability applied to the summed embedding.
    """

    def __init__(self, vocab_size, embed_size, seq_len=64, dropout=0.1):
        super().__init__()
        self.embed_size = embed_size

        # Index 0 is the padding index of both lookup tables.
        self.token = torch.nn.Embedding(vocab_size, embed_size, padding_idx=0)
        # Three rows: index 0 plus the two segment ids.
        self.segment = torch.nn.Embedding(3, embed_size, padding_idx=0)
        self.position = PositionalEmbedding(hidden_size=embed_size, max_len=seq_len)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, sequence, segment_label):
        """Embed ``sequence`` (token ids) together with ``segment_label``."""
        token_part = self.token(sequence)
        position_part = self.position(sequence)
        segment_part = self.segment(segment_label)

        combined = token_part + position_part + segment_part
        return self.dropout(combined)

## STEP 7: MULTI HEADED ATTENTION
- Multi Headed Attention uses a scaled dot product to perform the 'attention' step multiple times in parallel
- The 'attention' step here leverages our positional/token/segment encoding to calculate 'soft' weights for each word in a given sentence
- This loosely corresponds to a lookup: each word issues a query, the query is compared against the keys of all words in the sentence to decide how relevant each one is, and the values of those words are then averaged using these relevance weights.

In [None]:
class MultiHeadedAttention(torch.nn.Module):
    """Scaled dot-product attention computed over several heads in parallel.

    Args:
        heads: Number of attention heads; must divide ``hidden_size``.
        hidden_size: Size of the input and output feature dimension.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, heads, hidden_size, dropout=0.1):
        super().__init__()

        assert hidden_size % heads == 0
        self.d_k = hidden_size // heads
        self.heads = heads
        self.dropout = torch.nn.Dropout(dropout)

        self.query = torch.nn.Linear(hidden_size, hidden_size)
        self.key = torch.nn.Linear(hidden_size, hidden_size)
        self.value = torch.nn.Linear(hidden_size, hidden_size)
        self.output_linear = torch.nn.Linear(hidden_size, hidden_size)

    def _split_heads(self, projected):
        """Reshape (batch, seq, hidden) into (batch, heads, seq, d_k)."""
        batch = projected.shape[0]
        return projected.view(batch, -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask):
        """Attend from ``query`` over ``key``/``value``.

        Score entries where ``mask == 0`` are set to -1e9 before the
        softmax. Returns a tensor of shape (batch, seq, hidden).
        """
        q = self._split_heads(self.query(query))
        k = self._split_heads(self.key(key))
        v = self._split_heads(self.value(value))

        # (batch, heads, seq_q, seq_k), scaled by sqrt(d_k).
        scale = math.sqrt(q.size(-1))
        scores = torch.matmul(q, k.transpose(-2, -1)) / scale
        scores = scores.masked_fill(mask == 0, -1e9)

        weights = self.dropout(F.softmax(scores, dim=-1))

        # Merge the heads back into a single feature dimension.
        context = torch.matmul(weights, v)
        batch = context.shape[0]
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.heads * self.d_k)

        return self.output_linear(merged)

## STEP 8: FEED FORWARD LAYER
- We need a generalized Feed Forward layer in our model to manage the weights generated from the attention layer created above
- This layer is two fully connected layers, with input size `hidden_size`
- It also uses a dropout layer (of 0.1) and GELU activation after the first fully connected layer, and then returns the output

In [None]:
class FeedForward(torch.nn.Module):
    """Two-layer position-wise network: Linear -> GELU -> Dropout -> Linear.

    Args:
        hidden_size: Size of the input and output features.
        middle_dim: Size of the intermediate layer.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, hidden_size, middle_dim=2048, dropout=0.1):
        super().__init__()

        self.fc1 = torch.nn.Linear(hidden_size, middle_dim)
        self.fc2 = torch.nn.Linear(middle_dim, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.activation = torch.nn.GELU()

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        expanded = self.fc1(x)
        activated = self.dropout(self.activation(expanded))
        return self.fc2(activated)

## STEP 8: ENCODER LAYER
- We use both the FeedForward and MultiHeadedAttention classes created above to build our Encoder layer
- This serves as the main 'building block' for BERT architecture, there are 12 of them in this example
- The encoder layer gives a predefined `hidden_size` (the size of the attention and FF layer inputs/outputs) and a set number of attention heads (to be run in parallel)
- Dropout is applied after the attention layer, and then again after the feed forward layer
- The attention output is added to the layer's input embeddings (a residual connection) and layer-normalized before being fed into the feed forward layer
- The feed forward output is then added to its own input and layer-normalized again before being returned from the Encoder layer

In [None]:
class EncoderLayer(torch.nn.Module):
    """One Transformer encoder block: self-attention, then a feed-forward network.

    Each sub-layer's output passes through dropout, is added to the
    sub-layer's input and is then layer-normalized.

    NOTE(review): a single LayerNorm instance (one set of scale/shift
    parameters) is applied after both sub-layers. Transformer encoders
    usually use a separate norm per sub-layer; splitting it would change the
    state_dict keys, so it is left as is. Confirm whether sharing is intended.

    Args:
        hidden_size: Feature size of the layer input and output.
        heads: Number of attention heads.
        feed_forward_hidden: Size of the feed-forward intermediate layer.
        dropout: Dropout probability. It is used for the residual branches
            and is also forwarded to the attention and feed-forward
            sub-modules (previously those always used their own default).
    """

    def __init__(
        self,
        hidden_size=768,
        heads=12,
        feed_forward_hidden=768 * 4,
        dropout=0.1
        ):
        super().__init__()
        self.layernorm = torch.nn.LayerNorm(hidden_size)
        self.self_multihead = MultiHeadedAttention(heads, hidden_size, dropout=dropout)
        self.feed_forward = FeedForward(
            hidden_size, middle_dim=feed_forward_hidden, dropout=dropout
        )
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        """Encode ``embeddings``; ``mask`` is handed to the attention module as is.

        Returns a tensor with the same shape as ``embeddings``.
        """
        # Self-attention: query, key and value are all the layer input.
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        interacted = self.layernorm(interacted + embeddings)

        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded

## STEP 9: BERT BASE MODULE
- We use the BERTEmbedding and EncoderLayer classes created above to build our BERT base
- Given that we have set 12 layers below, 12 Encoder layers are created after our embedding is complete
- This is not good enough for a standalone language model just yet
- We need to incorporate our masking and a prediction if a given sentence A is followed by a given sentence B

In [None]:
class BERT(torch.nn.Module):
    """BERT encoder: an embedding layer followed by a stack of encoder layers.

    Args:
        vocab_size: Size of the token vocabulary.
        hidden_size: Feature size used throughout the model.
        n_layers: Number of encoder layers.
        heads: Number of attention heads per layer.
        dropout: Dropout probability, forwarded to the embedding layer and to
            every encoder layer.
        seq_len: Sequence length forwarded to the embedding layer. The
            default of 64 matches ``BERTEmbedding``'s own default, so
            existing calls behave as before.
    """

    def __init__(self, vocab_size, hidden_size=768, n_layers=12, heads=12, dropout=0.1, seq_len=64):

        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.heads = heads

        # The feed-forward intermediate size is four times the hidden size.
        self.feed_forward_hidden = hidden_size * 4

        self.embedding = BERTEmbedding(
            vocab_size=vocab_size, embed_size=hidden_size, seq_len=seq_len, dropout=dropout
        )

        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(hidden_size, heads, self.feed_forward_hidden, dropout)
             for _ in range(n_layers)])

    def forward(self, x, segment_info):
        """Encode a batch of token ids.

        Args:
            x: Token ids of shape (batch, seq). Positions with id 0 are
                masked out as attention keys.
            segment_info: Segment ids with the same shape as ``x``.

        Returns:
            The output of the last encoder layer.
        """
        # (batch, seq) -> (batch, 1, seq, seq): True where the key position
        # holds a token id greater than 0.
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        x = self.embedding(x, segment_info)

        for encoder in self.encoder_blocks:
            # Call the module rather than .forward() so that registered
            # hooks are run.
            x = encoder(x, mask)
        return x

## STEP 9: NEXT SENTENCE PREDICTION
- The first piece we need to complete our LM is Next Sentence Prediction
- This simply consists of a binary classifier and (log-)softmax output ("YES" or "NO") - which denotes whether sentence B follows sentence A when provided two sentences

In [None]:
class NextSentencePrediction(torch.nn.Module):
    """Two-class head that reads the first position of the encoder output.

    Args:
        hidden: Feature size of the encoder output.
    """

    def __init__(self, hidden):
        super().__init__()
        self.linear = torch.nn.Linear(hidden, 2)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 2) for ``x`` of shape (batch, seq, hidden)."""
        first_position = x[:, 0]
        logits = self.linear(first_position)
        return self.softmax(logits)

## STEP 10: MASKED LANGUAGE MODEL
- Masked Language Modeling (MLM) is used with the outputs of our BERT module to determine the probabilities of a masked token being its predicted value
- This is another simple linear layer and a softmax on the output, with the size being `vocab_size`
- This outputs the probabilities of all tokens in our vocab being the masked token in the sentence

In [None]:
class MaskedLanguageModel(torch.nn.Module):
    """Vocabulary-sized head applied to every position of the encoder output.

    Args:
        hidden: Feature size of the encoder output.
        vocab_size: Number of classes, one per vocabulary entry.
    """

    def __init__(self, hidden, vocab_size):
        super().__init__()
        self.linear = torch.nn.Linear(hidden, vocab_size)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return log-probabilities over the vocabulary along the last dimension."""
        logits = self.linear(x)
        return self.softmax(logits)

## STEP 11: BUILD OUR LANGUAGE MODEL
- Now we can finally build our LM
- We combine the BERT module with Next Sentence Prediction and Masked Language Modeling to output the probabilities of the masked token and of the 'next' sentence being the following sentence in the corpus
- We return these probabilities as outputs to calculate our loss and help train our BERTLM model

In [None]:
class BERTLM(torch.nn.Module):
    """BERT encoder with a next-sentence head and a masked-LM head on top.

    Args:
        bert: The encoder; its ``hidden_size`` attribute sizes both heads.
        vocab_size: Number of output classes of the masked-LM head.
    """

    def __init__(self, bert: BERT, vocab_size):
        super().__init__()
        self.bert = bert
        hidden = self.bert.hidden_size
        self.next_sentence = NextSentencePrediction(hidden)
        self.mask_lm = MaskedLanguageModel(hidden, vocab_size)

    def forward(self, x, segment_label):
        """Return ``(next_sentence_output, mask_lm_output)`` for the batch."""
        encoded = self.bert(x, segment_label)
        next_sentence_out = self.next_sentence(encoded)
        mask_lm_out = self.mask_lm(encoded)
        return next_sentence_out, mask_lm_out

## STEP 12: OPTIMIZATION
- We will use a learning rate scheduler and optimizer to train our model
- We will use Adam for our optimizer with a learning rate of 1e-4 to start
- We will also use a weight decay of 0.01
- Our optimizer class has a top level method `step_and_update_lr` that is used to step through our training sequence with the appropriate learning rate reductions and optimizations

In [None]:
class ScheduledOptim():
    """Optimizer wrapper that sets the learning rate before every step.

    The rate is ``hidden_size ** -0.5`` times the smaller of
    ``step ** -0.5`` and ``step * n_warmup_steps ** -1.5``.

    Args:
        optimizer: Wrapped optimizer; needs ``step()``, ``zero_grad()`` and
            ``param_groups``.
        hidden_size: Model size used for the base learning rate.
        n_warmup_steps: Number of warm-up steps in the schedule.
    """

    def __init__(self, optimizer, hidden_size, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(hidden_size, -0.5)

    def step_and_update_lr(self):
        """Advance the schedule, then take one optimizer step."""
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        """Return the schedule factor for the current step count."""
        step = self.n_current_steps
        decay_term = np.power(step, -0.5)
        warmup_term = np.power(self.n_warmup_steps, -1.5) * step
        return np.min([decay_term, warmup_term])

    def _update_learning_rate(self):
        """Increment the step counter and write the new rate to every param group."""
        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr

## STEP 12: MODEL TRAINER
- Now we can combine everything into a final class, our model trainer
- This class isn't explicitly necessary, but for re-usability we can utilize this class to abstract away a lot of the common code used in training models
- This class contains our BERT model, our DataLoader, our optimizer, and an iteration method to step through a batch in our training sequence
- We also use a loss function called negative log likelihood loss to calculate the error in the outputs from the BERT model

In [None]:
class BERTTrainer:
    """Training / evaluation loop for a BERT language model.

    The model is called as ``model(bert_input, segment_label)`` and must
    return ``(next_sentence_log_probs, mask_lm_log_probs)``. Each batch is a
    dict of tensors with the keys ``bert_input``, ``bert_label``,
    ``segment_label`` and ``is_next``.

    Args:
        model: Model to train; ``model.bert.hidden_size`` sizes the schedule.
        train_dataloader: Iterable of training batches.
        test_dataloader: Optional iterable of evaluation batches.
        lr: Initial Adam learning rate. The schedule wrapper assigns a new
            rate to every param group on each step, so this value only
            applies until the first step.
        weight_decay: Adam weight decay.
        betas: Adam betas.
        warmup_steps: Warm-up steps of the learning-rate schedule.
        log_freq: Print running metrics every ``log_freq`` batches.
        device: Device the model and every batch are moved to.
    """

    def __init__(
        self,
        model,
        train_dataloader,
        test_dataloader=None,
        lr= 1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=10000,
        log_freq=10,
        device='cpu'
        ):

        self.device = device
        # Batches are moved to `device` in iteration(); the parameters have
        # to live on the same device.
        self.model = model.to(device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, self.model.bert.hidden_size, n_warmup_steps=warmup_steps
            )

        # Masked-LM labels are 0 wherever no token has to be predicted, so
        # index 0 is excluded from that loss.
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        # Next-sentence labels are 0 ("not next") and 1 ("is next"); both
        # classes must contribute. Using ignore_index=0 here dropped every
        # negative example from the loss.
        self.nsp_criterion = torch.nn.NLLLoss()
        self.log_freq = log_freq
        print("Total Parameters:", sum(p.nelement() for p in self.model.parameters()))

    def train(self, epoch):
        """Run one training epoch and return its metrics (see ``iteration``)."""
        return self.iteration(epoch, self.train_data)

    def test(self, epoch):
        """Run one evaluation epoch and return its metrics (see ``iteration``)."""
        return self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """Run the model over every batch of ``data_loader``.

        Args:
            epoch: Epoch number, used only in the printed output.
            data_loader: Iterable of batch dicts.
            train: When True, back-propagate and step the optimizer; when
                False, run in eval mode without gradient tracking.

        Returns:
            Dict with ``epoch``, ``mode``, ``avg_loss`` (mean batch loss) and
            ``total_acc`` (next-sentence accuracy in percent).
        """
        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        n_batches = 0

        mode = "train" if train else "test"

        # Dropout must be active only while training.
        self.model.train(train)

        with torch.set_grad_enabled(train):
            for i, data in enumerate(data_loader):
                data = {key: value.to(self.device) for key, value in data.items()}
                next_sent_output, mask_lm_output = self.model(
                    data["bert_input"], data["segment_label"]
                )

                next_loss = self.nsp_criterion(next_sent_output, data["is_next"])
                # NLLLoss expects the class dimension second:
                # (batch, seq, vocab) -> (batch, vocab, seq).
                mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
                loss = next_loss + mask_loss

                if train:
                    self.optim_schedule.zero_grad()
                    loss.backward()
                    self.optim_schedule.step_and_update_lr()

                correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
                avg_loss += loss.item()
                total_correct += correct
                total_element += data["is_next"].nelement()
                n_batches = i + 1

                if i % self.log_freq == 0:
                    post_fix = {
                        "epoch": epoch,
                        "iter": i,
                        "avg_loss": avg_loss / (i + 1),
                        "avg_acc": total_correct / max(total_element, 1) * 100,
                        "loss": loss.item()
                    }
                    print(str(post_fix))

        # Batches are counted while iterating: enumerate() objects have no
        # len(), and this also works for loaders without __len__. max(.., 1)
        # keeps an empty loader from dividing by zero.
        epoch_loss = avg_loss / max(n_batches, 1)
        epoch_acc = total_correct * 100.0 / max(total_element, 1)
        print(
            f"EP{epoch}, {mode}: \
            avg_loss={epoch_loss}, \
            total_acc={epoch_acc}")

        return {
            "epoch": epoch,
            "mode": mode,
            "avg_loss": epoch_loss,
            "total_acc": epoch_acc,
        }

## STEP 12: TRAIN THE MODEL
- In the final step here, we train our model on our provided data
- After setting up the BERTDataset and data loader (again), we can create our model base
- Given this model base, we can build our BERTLM language model and trainer class
- Defining 20 epochs, we iterate through them and iterate through our batches of training data in each pass
- Now we will have implemented and trained BERT

In [None]:
# Dataset and shuffling batch loader over the sentence pairs.
train_data = BERTDataset(pairs, tokenizer=tokenizer, seq_len=MAX_LEN)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, pin_memory=True)

# The encoder and the masked-LM head both need the vocabulary size.
vocab_size = len(tokenizer.vocab)

bert_model = BERT(
    vocab_size=vocab_size,
    hidden_size=768,
    n_layers=2,
    heads=12,
    dropout=0.1,
)
bert_lm = BERTLM(bert_model, vocab_size)
bert_trainer = BERTTrainer(bert_lm, train_loader, device='cpu')

# One pass over the training data per epoch.
epochs = 20
for epoch in range(epochs):
    bert_trainer.train(epoch)