In [6]:
import os
from pathlib import Path
import re
import random
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import tqdm
import torch
from argparse import Namespace
from torch.utils.data import Dataset, DataLoader
import itertools
import math

import torch.nn.functional as F
import numpy as np
from torch.optim import Adam

In [7]:
args = Namespace(
    
    VOCAB_SIZE = 30000,
    N_SEGMENTS = 3,
    MAX_LEN = 64,
    EMBED_DIM = 768,
    N_LAYERS = 12,
    ATTN_HEADS = 12,
    DROPOUT = 0.1,
    # Data and path information
    frequency_cutoff=25,
    data_path = "datasets/pairs.txt",
    model_state_file='model.pth', 
    review_csv='data/yelp/reviews_with_splits_lite.csv', 
    save_dir='model_storage/ch3/yelp/', 
    vectorizer_file='vectorizer.json',
    # No model hyperparameters
    # Training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=2,
    seed=1337,
    # Runtime options omitted for space
)

# Pairs and Conv data

In [8]:
MAX_LEN = 64

### loading all data into memory
corpus_movie_conv = './datasets/movie_conversations.txt'
corpus_movie_lines = './datasets/movie_lines.txt'

In [9]:
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as c:
    conv = c.readlines()
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as l:
    lines = l.readlines()

In [10]:
### splitting text using special lines
lines_dic = {}
for line in lines:
    objects = line.split(" +++$+++ ")
    lines_dic[objects[0]] = objects[-1]

In [11]:
lines_dic

{'L1045': 'They do not!\n',
 'L1044': 'They do to!\n',
 'L985': 'I hope so.\n',
 'L984': 'She okay?\n',
 'L925': "Let's go.\n",
 'L924': 'Wow\n',
 'L872': "Okay -- you're gonna need to learn how to lie.\n",
 'L871': 'No\n',
 'L870': 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n',
 'L869': 'Like my fear of wearing pastels?\n',
 'L868': 'The "real you".\n',
 'L867': 'What good stuff?\n',
 'L866': "I figured you'd get to the good stuff eventually.\n",
 'L865': 'Thank God!  If I had to hear one more story about your coiffure...\n',
 'L864': "Me.  This endless ...blonde babble. I'm like, boring myself.\n",
 'L863': 'What crap?\n',
 'L862': 'do you listen to this crap?\n',
 'L861': 'No...\n',
 'L860': 'Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."\n',
 'L699': 'You always been this selfish?\n',
 'L698': 'But\n',
 'L697': "Then that's all you had to say.\n",
 'L696': 'Well, no...\n',
 'L695

In [12]:
### generate question answer pairs
pairs = []
for con in conv:
    ids = eval(con.split(" +++$+++ ")[-1])
    for i in range(len(ids)):
        qa_pairs = []
        
        if i == len(ids) - 1:
            break
        
        # remove leading and trailing white spaces 
        first = lines_dic[ids[i]].strip()
        second = lines_dic[ids[i+1]].strip() 

        # remove non-alphabetical characters
        qa_pairs.append(' '.join(first.split()[:MAX_LEN]))
        qa_pairs.append(' '.join(second.split()[:MAX_LEN]))
        pairs.append(qa_pairs)

In [13]:
pairs

[['Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part. Please.'],
 ['Not the hacking and gagging and spitting part. Please.',
  "Okay... then how 'bout we try out some French cuisine. Saturday? Night?"],
 ["You're asking me out. That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.'],
 ['Cameron.',
  "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does."],
 ["The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does.",
  'Seems like she could get a date easy enough...'],
 ['Why?',
  '

### Vectorizer


In [11]:
tokenizer = BertTokenizer.from_pretrained('./tokenizer/tokenizer-1-vocab.txt', local_files_only=True)



### Embeddings

In [12]:
class BERTEmbedding(nn.Module):
    def __init__(self,
                 vocab_size,
                 n_segments,
                 max_len,
                 embed_dim,
                 dropout):
        super().__init__()
        self.tok_embed = nn.Embedding(vocab_size, embed_dim)
        self.seg_embed = nn.Embedding(n_segments, embed_dim)
        self.pos_embed = nn.Embedding(max_len, embed_dim)

        self.drop = nn.Dropout(dropout)
        self.pos_inp = torch.tensor([i for i in range(max_len)],)

    def forward(self, seq, seg):
        embed_val = self.tok_embed(seq) + self.seg_embed(seg) + self.pos_embed(self.pos_inp)
        return self.drop(embed_val)
    
class BERT(nn.Module):
    def __init__(self,
                 vocab_size,
                 n_segments,
                 max_len,
                 embed_dim,
                 n_layers,
                 attn_heads,
                 dropout):
        super().__init__()
        self.embedding = BERTEmbedding(vocab_size, n_segments, max_len, embed_dim, dropout)
        self.enc_layer = nn.TransformerEncoderLayer(embed_dim, attn_heads, embed_dim*4)
        self.enc_block = nn.TransformerEncoder(self.enc_layer, n_layers)
    
    def forward(self, seq, seg):
        embed_val = self.embedding(seq, seg)
        return self.enc_block(embed_val)
        

NameError: name 'nn' is not defined

In [13]:
sample_seq = torch.randint(high = args.VOCAB_SIZE, size = [args.MAX_LEN,])
sample_seg = torch.randint(high = args.N_SEGMENTS, size = [args.MAX_LEN,])

embedding = BERTEmbedding(args.VOCAB_SIZE, args.N_SEGMENTS, args.MAX_LEN, args.EMBED_DIM, args.DROPOUT)
embedding_tensor = embedding(sample_seq, sample_seg)
print(embedding_tensor.shape)  # [512, 768] -> [max_len, embed_dim]

bert = BERT(args.VOCAB_SIZE, args.N_SEGMENTS, args.MAX_LEN, args.EMBED_DIM, args.N_LAYERS, args.ATTN_HEADS, args.DROPOUT)
out = bert(sample_seq, sample_seg)
print(out.shape)  # [512, 768] -> [max_len, embed_dim]

NameError: name 'BERTEmbedding' is not defined

# BERT MODEL

In [14]:
import torch
import torch.nn as nn
from embed import BERTEmbedding, PositionalEmbeddings
from encoder import MultiHeadedAttention, FeedForward, EncoderLayer
from dataloader import BERTDataset

class BERT(nn.Module):
    def __init__(self, 
                vocab_size, 
                d_in=768, 
                n_layers=12, 
                n_heads=12, 
                dropout=0.1):
        super().__init__()

        self.d_in = d_in
        self.n_layers = n_layers
        self.heads = n_heads

        #paper has 4*hidden_size for ff_hidden_size
        self.feed_forward_hidden = 4*d_in

        self.embedding = BERTEmbedding(vocab_size, d_in)

        #multi attention
        self.encoder_block = nn.ModuleList(
            [EncoderLayer(d_in, n_heads, d_in*4, dropout) for _ in range(n_layers)]
        )
    
    def forward(self, x, segment_info):
        mask = (x>0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        x = self.embedding(x, segment_info)

        for layer in self.encoder_block:
            x = layer(x, mask)
        return x
    

class NextSentencePrediction(torch.nn.Module):

    def __init__(self, hidden):
        super().__init__()
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.softmax(self.linear(x[:, 0]))

class MaskedLanguageModel(torch.nn.Module):
    """
    predicting origin token from masked input sequence
    n-class classification problem, n-class = vocab_size
    """

    def __init__(self, hidden, vocab_size):
        super().__init__()
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.softmax(self.linear(x))
    
class BERTLM(nn.Module):

    def __init__(self, bert, vocab_size):

        super().__init__()
        self.bert = bert
        self.next_sentence = NextSentencePrediction(bert.d_in)
        self.mask_lm = MaskedLanguageModel(bert.d_in, vocab_size)

    def forward(self, x, segment_label):
        x = self.bert(x, segment_label)
        return self.next_sentence(x), self.mask_lm(x)

### Optimizer

In [15]:
import numpy as np

class ScheduledOptim():

    def __init__(self, optimizer, d_in, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_in, -0.5)

    def step_and_update_lr(self):
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            self.n_current_steps * np.power(self.n_warmup_steps, -1.5)
        ])
    
    def _update_learning_rate(self):
        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr



### Trainer

In [16]:
class BERTTrainer:
    def __init__(
        self, 
        model, 
        train_dataloader, 
        test_dataloader=None, 
        lr= 1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=10000,
        log_freq=10,
        device='cuda'
        ):

        self.device = device
        self.model = model
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, self.model.bert.d_in, n_warmup_steps=warmup_steps
            )

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
    
    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        
        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        
        mode = "train" if train else "test"

        # progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, data in data_iter:
            print(i)

            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token word
            # transpose to (m, vocab_size, seq_len) vs (m, seq_len)
            # criterion(mask_lm_output.view(-1, mask_lm_output.size(-1)), data["bert_label"].view(-1))
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = next_loss + mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))
        print(
            f"EP{epoch}, {mode}: \
            avg_loss={avg_loss / len(data_iter)}, \
            total_acc={total_correct * 100.0 / total_element}"
        ) 

In [14]:
##### DRY RUN
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from argparse import Namespace

from dataloader import BERTDataset
from model import BERT, BERTLM 
from trainer import BERTTrainer

train_data = BERTDataset(
   pairs, seq_len=MAX_LEN, tokenizer=tokenizer)

train_loader = DataLoader(
   train_data, batch_size=32, shuffle=True, pin_memory=True)

bert_model = BERT(
  vocab_size=len(tokenizer.vocab),
  d_in=768,
  n_layers=2,
  n_heads=12,
  dropout=0.1
)

bert_lm = BERTLM(bert_model, len(tokenizer.vocab))
bert_trainer = BERTTrainer(bert_lm, train_loader, device='cpu')
epochs = 20

for epoch in range(epochs):
  bert_trainer.train(epoch)


Total Parameters: 46697897


EP_train:0:   0%|| 0/6926 [00:00<?, ?it/s]

0


EP_train:0:   0%|| 1/6926 [00:00<1:04:21,  1.79it/s]

{'epoch': 0, 'iter': 0, 'avg_loss': 10.606180191040039, 'avg_acc': 53.125, 'loss': 10.606180191040039}
1


EP_train:0:   0%|| 2/6926 [00:01<1:02:17,  1.85it/s]

2


EP_train:0:   0%|| 3/6926 [00:01<1:01:06,  1.89it/s]

3


EP_train:0:   0%|| 4/6926 [00:02<1:00:39,  1.90it/s]

4


EP_train:0:   0%|| 5/6926 [00:02<1:00:02,  1.92it/s]

5


EP_train:0:   0%|| 6/6926 [00:03<1:00:59,  1.89it/s]

6


EP_train:0:   0%|| 7/6926 [00:03<1:00:22,  1.91it/s]

7


EP_train:0:   0%|| 8/6926 [00:04<59:31,  1.94it/s]  

8


EP_train:0:   0%|| 9/6926 [00:04<58:54,  1.96it/s]

9


EP_train:0:   0%|| 10/6926 [00:05<58:24,  1.97it/s]

10


EP_train:0:   0%|| 11/6926 [00:05<58:27,  1.97it/s]

{'epoch': 0, 'iter': 10, 'avg_loss': 10.524168361317027, 'avg_acc': 48.29545454545455, 'loss': 10.582555770874023}
11


EP_train:0:   0%|| 12/6926 [00:06<58:26,  1.97it/s]

12


EP_train:0:   0%|| 13/6926 [00:06<58:12,  1.98it/s]

13


EP_train:0:   0%|| 14/6926 [00:07<58:09,  1.98it/s]

14


EP_train:0:   0%|| 15/6926 [00:07<58:36,  1.97it/s]

15


EP_train:0:   0%|| 16/6926 [00:08<58:15,  1.98it/s]

16


EP_train:0:   0%|| 17/6926 [00:08<59:08,  1.95it/s]

17


EP_train:0:   0%|| 18/6926 [00:09<59:15,  1.94it/s]

18


EP_train:0:   0%|| 19/6926 [00:09<58:54,  1.95it/s]

19


EP_train:0:   0%|| 20/6926 [00:10<58:43,  1.96it/s]

20


EP_train:0:   0%|| 21/6926 [00:10<59:24,  1.94it/s]

{'epoch': 0, 'iter': 20, 'avg_loss': 10.503575097946893, 'avg_acc': 50.297619047619044, 'loss': 10.309139251708984}
21


EP_train:0:   0%|| 22/6926 [00:11<59:05,  1.95it/s]

22


EP_train:0:   0%|| 23/6926 [00:11<59:17,  1.94it/s]

23


EP_train:0:   0%|| 24/6926 [00:12<58:59,  1.95it/s]

24


EP_train:0:   0%|| 25/6926 [00:12<58:54,  1.95it/s]

25


EP_train:0:   0%|| 26/6926 [00:13<58:45,  1.96it/s]

26


EP_train:0:   0%|| 27/6926 [00:13<58:37,  1.96it/s]

27


EP_train:0:   0%|| 28/6926 [00:14<1:00:19,  1.91it/s]

28


EP_train:0:   0%|| 29/6926 [00:14<1:00:26,  1.90it/s]

29


EP_train:0:   0%|| 29/6926 [00:15<1:01:13,  1.88it/s]


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from argparse import Namespace

from dataloader import BERTDataset
from model import BERT, BERTLM 
from trainer import BERTTrainer


# train_data = BERTDataset.load_data_and_get_tokenize(data_pair_path=args.data_path, seq_len=args.MAX_LEN)

train_data = BERTDataset(
   pairs, seq_len=MAX_LEN, tokenizer=tokenizer)

tokenizer = train_data.tokenizer

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, pin_memory=True)

# print(len(tokenizer.vocab))

bert_model = BERT(
  vocab_size=len(tokenizer.vocab),
  d_in=768,
  n_layers=2,
  n_heads=12,
  dropout=0.1
)

bert_lm = BERTLM(bert_model, len(tokenizer.vocab))
bert_trainer = BERTTrainer(bert_lm, train_loader, device='cpu')

for epoch in range(args.num_epochs):
  bert_trainer.train(epoch)

In [16]:
pairs

[['Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part. Please.'],
 ['Not the hacking and gagging and spitting part. Please.',
  "Okay... then how 'bout we try out some French cuisine. Saturday? Night?"],
 ["You're asking me out. That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.'],
 ['Cameron.',
  "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does."],
 ["The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does.",
  'Seems like she could get a date easy enough...'],
 ['Why?',
  '

'a'

In [23]:
len(pairs)

221616

In [24]:
# save pairs as json and then load it in a new variable
import json
with open('pairs.json', 'w') as f:
    json.dump(pairs, f)

with open('pairs.json', 'r') as f:
    pairs_2 = json.load(f)

In [26]:
len(pairs_2)

221616