# Masked Language Model

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataset import BERTLanguageModelingDataset
from vocab import Vocab
    

In [2]:
# --- Data and optimisation settings ---
data_dir = "ptb"       # directory handed to Vocab and the dataset classes
epochs = 5             # number of passes over the training loader
batch_length = 32      # used as seq_len when the datasets are built
batch_size = 16
lr = 0.001             # learning rate given to the optimizer

# --- Model hyper-parameters ---
n_layers = 1           # stacked GRU layers
d_emb = 200            # embedding dimension
d_hid = 250            # GRU hidden size
p_drop = 0.2           # dropout probability

# --- Logging and hardware ---
interval_print = 100   # training iterations between loss reports
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Load Dataset


In [3]:
# Build the vocabulary from the corpus directory, registering '<mask>' as the mask token.
vocab = Vocab(data_dir, mask_token='<mask>')

# One dataset per split, both cut into sequences of `batch_length` tokens.
trainset, validset = (
    BERTLanguageModelingDataset(data_dir, vocab, seq_len=batch_length, split=split_name)
    for split_name in ('train', 'valid')
)

# Wrap each dataset in a DataLoader that yields batches of `batch_size` examples.
trainloader, validloader = (
    torch.utils.data.DataLoader(split_data, batch_size=batch_size)
    for split_data in (trainset, validset)
)


building vocab...


100%|██████████| 42068/42068 [00:00<00:00, 121191.25it/s]

[('the', 50770), ('<unk>', 45020), ('N', 32481), ('of', 24400), ('to', 23638), ('a', 21196), ('in', 18000), ('and', 17474), ("'s", 9784), ('that', 8931)]
end building vocab ...
['<mask>', '<pad>', '<eos>', 'the', '<unk>', 'N', 'of', 'to', 'a', 'in']





# Model

In [4]:
class WordEmbedding(nn.Module):
    """Embedding lookup followed by dropout.

    Args:
        num_embeddomgs: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        p_drop: dropout probability applied to the looked-up embeddings.
    """

    def __init__(self, num_embeddomgs, embedding_dim, p_drop=0.):
        super().__init__()
        self.emb = nn.Embedding(num_embeddomgs, embedding_dim)
        self.dropout = nn.Dropout(p_drop)

    def forward(self, input):
        """Return the (dropped-out) embeddings of the integer ids in ``input``."""
        return self.dropout(self.emb(input))

class MLM(nn.Module):
    """Masked language model: word embedding -> bidirectional GRU -> vocabulary projection.

    Args:
        vocab_size: number of vocabulary entries; also the width of the output layer.
        embedding_dim: size of the word embeddings.
        hidden_dim: GRU hidden size per direction.
        n_layers: number of stacked GRU layers.
        p_drop: dropout probability applied to the embeddings, between stacked
            GRU layers (only when ``n_layers > 1``) and to the GRU output.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, p_drop):
        super(MLM, self).__init__()
        self.n_classes = vocab_size
        self.d_emb = embedding_dim

        self.word_embedding = WordEmbedding(self.n_classes, self.d_emb, p_drop=p_drop)
        # nn.GRU only applies dropout *between* stacked layers; passing a non-zero
        # value with a single layer has no effect and triggers a UserWarning.
        self.layers = nn.GRU(
            self.d_emb,
            hidden_dim,
            n_layers,
            dropout=p_drop if n_layers > 1 else 0.,
            batch_first=True,
            bidirectional=True,
        )
        # The bidirectional GRU concatenates both directions, hence hidden_dim * 2.
        self.proj_layer = nn.Linear(hidden_dim * 2, self.n_classes)

        self.drop = nn.Dropout(p_drop)

    def forward(self, input):
        """Compute per-position vocabulary logits.

        Args:
            input: integer token ids, batch first: ``(bsz, seq_len)``.

        Returns:
            Tensor of shape ``(bsz, seq_len, vocab_size)`` with unnormalised scores.
        """
        emb = self.word_embedding(input)

        # (bsz, seq_len, 2 * hidden_dim); the final hidden state is not needed.
        output, _ = self.layers(emb)

        output = self.drop(output)

        return self.proj_layer(output)

class BidirectionalLM(nn.Module):
    """Bidirectional GRU language model that predicts a token from its two-sided context.

    For every interior position ``t`` the forward GRU state at ``t - 1`` and the
    backward GRU state at ``t + 1`` are concatenated and projected onto the
    vocabulary, so the token at ``t`` itself is never part of its own context.

    NOTE: that no-leakage property only holds for ``n_layers == 1``; in a stacked
    bidirectional GRU the upper layers receive both directions of the layer below.

    Args:
        vocab_size: number of vocabulary entries; also the width of the output layer.
        embedding_dim: size of the word embeddings.
        hidden_dim: GRU hidden size per direction.
        n_layers: number of stacked GRU layers.
        p_drop: dropout probability applied to the embeddings, between stacked
            GRU layers (only when ``n_layers > 1``) and to the context vectors.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, p_drop):
        # Must name this class: super(MLM, self) raises TypeError because a
        # BidirectionalLM instance is not an MLM.
        super(BidirectionalLM, self).__init__()
        self.n_classes = vocab_size
        self.d_emb = embedding_dim
        self.hidden_dim = hidden_dim

        self.word_embedding = WordEmbedding(self.n_classes, self.d_emb, p_drop=p_drop)
        # nn.GRU only applies dropout between stacked layers; avoid the
        # UserWarning it emits for dropout > 0 with a single layer.
        self.layers = nn.GRU(
            self.d_emb,
            hidden_dim,
            n_layers,
            dropout=p_drop if n_layers > 1 else 0.,
            batch_first=True,
            bidirectional=True,
        )
        self.proj_layer = nn.Linear(hidden_dim * 2, self.n_classes)

        self.drop = nn.Dropout(p_drop)

    def forward(self, input):
        """Compute vocabulary logits for the interior positions of each sequence.

        Args:
            input: integer token ids, batch first: ``(bsz, seq_len)``.

        Returns:
            Tensor of shape ``(bsz, seq_len - 2, vocab_size)``; entry ``i`` along
            axis 1 scores the token at input position ``i + 1``. Sequences shorter
            than three tokens yield an empty time axis.
        """
        emb = self.word_embedding(input)
        # (bsz, seq_len, 2 * hidden_dim); last axis is [forward | backward].
        output, _ = self.layers(emb)

        # The GRU is batch_first, so time is axis 1 (axis 0 is the batch).
        forward_output = output[:, :-2, :self.hidden_dim]   # states at t - 1
        backward_output = output[:, 2:, self.hidden_dim:]   # states at t + 1

        # (bsz, seq_len - 2, 2 * hidden_dim), matching proj_layer's input width.
        context = torch.cat([forward_output, backward_output], dim=-1)
        context = self.drop(context)

        return self.proj_layer(context)

# Instantiate the masked LM from the notebook-level hyper-parameters and place it on `device`.
model = MLM(
    vocab_size=vocab.size,
    embedding_dim=d_emb,
    hidden_dim=d_hid,
    n_layers=n_layers,
    p_drop=p_drop,
)
model.to(device)

# AdamW over all model parameters, using the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)




In [38]:

# Running state: number of optimisation steps taken, loss accumulated since the
# last report, and the lowest validation perplexity seen so far.
n_iter = 0
train_loss = 0.
best_ppl = float('inf')

for ep in range(epochs):
    print(f"[{ep}/{epochs}] epochs training...")

    # ---- training pass ----
    model.train()
    for mlm_train, mlm_target in trainloader:
        n_iter += 1
        mlm_train, mlm_target = mlm_train.to(device), mlm_target.to(device)

        logits = model(mlm_train)
        # Flatten to (tokens, vocab) vs (tokens,); padding targets are ignored.
        loss = F.cross_entropy(
            logits.reshape(-1, vocab.size),
            mlm_target.reshape(-1),
            ignore_index=vocab.padding_idx,
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        # Only report every `interval_print` iterations.
        if n_iter % interval_print != 0:
            continue
        train_loss /= interval_print
        train_ppl = math.exp(train_loss)
        print(f"n_iter:{n_iter} loss: {train_loss:0.3f} ppl: {train_ppl:0.3f}")
        train_loss = 0

    # ---- validation pass (no gradients needed) ----
    model.eval()
    valid_loss = 0.
    with torch.no_grad():
        for mlm_train, mlm_target in validloader:
            mlm_train, mlm_target = mlm_train.to(device), mlm_target.to(device)

            logits = model(mlm_train)
            loss = F.cross_entropy(
                logits.reshape(-1, vocab.size),
                mlm_target.reshape(-1),
                ignore_index=vocab.padding_idx,
            )
            valid_loss += loss.item()

    # Mean batch loss over the validation loader and its perplexity.
    valid_loss /= len(validloader)
    valid_ppl = math.exp(valid_loss)

    # Checkpoint whenever validation perplexity improves.
    if valid_ppl < best_ppl:
        best_ppl = valid_ppl
        torch.save(model, "mlm-best.pth")
        print("### find best mode ###", best_ppl)

    print(f"validation vloss: {valid_loss:0.3f} vppl: {valid_ppl:0.3f}, best ppl: {best_ppl:0.3f}")


   



[0/5] epochs training...
torch.Size([16, 32, 500]) torch.Size([2, 16, 250])
torch.Size([2, 16, 250])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x250 and 500x10002)

In [24]:
# Fill the '<mask>' slot of a sentence and list the k most probable candidates.
input_text = "i do n't think their customers would <mask> it very much "
k = 10  # number of candidate words to display
model.eval()
input_text = input_text.lower()
# NOTE(review): the mask position is found by splitting on single spaces; this
# assumes vocab.encode_line tokenises the text the same way -- confirm.
mask_pos = input_text.split(" ").index("<mask>")

mask_input = torch.tensor([vocab.encode_line(input_text, add_eos=True)]).to(device)
mask_ind = torch.tensor([mask_pos]).to(device)
print(mask_input, mask_ind)

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    # Probability distribution over the vocabulary at the masked position.
    probs = F.softmax(model(mask_input)[:, mask_ind, :], dim=-1)
top_k = torch.topk(probs, k)

# Plain Python ints/floats, so vocab.id2tok can be indexed whether it is a list or a dict.
top_k_words = top_k.indices.flatten().tolist()
top_k_probs = top_k.values.flatten().tolist()
for i, (w, p) in enumerate(zip(top_k_words, top_k_probs)):
    print(f"{i}th 'predicted word (prob.)': {vocab.id2tok[w]} ({p:0.3f})")
    print(f"{i}th 'complete sentence': {input_text.replace('<mask>', vocab.id2tok[w])}"  )

tensor([[ 70,  89,  34, 318,  53, 527,  44,   0,  16, 254, 123,   2]]) tensor([7])
torch.Size([2, 1, 250])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x250 and 500x10002)