# Language Model

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataset import LMDataset
from vocab import Vocab
    

In [2]:
# --- Data and optimisation settings ---
data_dir = "ptb"      # directory handed to Vocab and LMDataset
epochs = 20           # number of passes made by the training loop
batch_length = 16     # used as the DataLoader batch_size
batch_size = 8        # third positional argument given to LMDataset
lr = 0.001            # learning rate given to the optimizer

# --- Model hyper-parameters ---
n_layers = 2          # number of stacked GRU layers
d_emb = 200           # word-embedding width
d_hid = 250           # GRU hidden width
p_drop = 0.2          # dropout probability used throughout the model

# --- Runtime settings ---
interval_print = 200  # report the running training loss every this many iterations
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Load Dataset


In [3]:
# Build the vocabulary first; both dataset splits are constructed with it.
vocab = Vocab(data_dir)

# Train split is created before the valid split, one LMDataset per split.
trainset, validset = (
    LMDataset(data_dir, vocab, batch_size, split) for split in ('train', 'valid')
)

# Wrap each split in a DataLoader that groups `batch_length` dataset items.
trainloader, validloader = (
    torch.utils.data.DataLoader(split_set, batch_size=batch_length)
    for split_set in (trainset, validset)
)


building vocab...


100%|██████████| 42068/42068 [00:00<00:00, 185002.14it/s]


[('the', 50770), ('<unk>', 45020), ('N', 32481), ('of', 24400), ('to', 23638), ('a', 21196), ('in', 18000), ('and', 17474), ("'s", 9784), ('that', 8931)]
end building vocab ...
['<pad>', '<eos>', 'the', '<unk>', 'N', 'of', 'to', 'a', 'in', 'and']
binarizing data ...


100%|██████████| 42068/42068 [00:00<00:00, 69148.38it/s]


binarizing data ...


100%|██████████| 3370/3370 [00:00<00:00, 76796.63it/s]


# Model

In [4]:
class WordEmbedding(nn.Module):
    """Embedding lookup followed by dropout.

    Args:
        num_embeddomgs: number of rows in the embedding table (forwarded to
            ``nn.Embedding`` as its first argument).
        embedding_dim: size of each embedding vector.
        p_drop: dropout probability applied to the looked-up vectors.
    """

    def __init__(self, num_embeddomgs, embedding_dim, p_drop=0.):
        super().__init__()
        self.emb = nn.Embedding(num_embeddomgs, embedding_dim)
        self.dropout = nn.Dropout(p_drop)

    def forward(self, input):
        """Look up ``input`` in the embedding table and apply dropout to the result."""
        return self.dropout(self.emb(input))

class RNNLM(nn.Module):
    """GRU language model: embedding -> stacked GRU -> dropout -> vocabulary projection.

    Args:
        vocab_size: number of word types; used for both the embedding table
            size and the width of the output projection.
        embedding_dim: word-embedding size (the GRU input size).
        hidden_dim: GRU hidden size.
        n_layers: number of stacked GRU layers.
        p_drop: dropout probability used in the embedding, between GRU layers,
            and on the GRU output.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, p_drop):
        super().__init__()
        self.n_classes = vocab_size
        self.d_emb = embedding_dim

        # Sub-modules are created in embedding -> GRU -> projection order.
        self.word_embedding = WordEmbedding(vocab_size, embedding_dim, p_drop=p_drop)
        self.layers = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=p_drop,
            batch_first=True,  # tensors are laid out as (batch, seq, feature)
        )
        self.proj_layer = nn.Linear(hidden_dim, vocab_size)
        self.drop = nn.Dropout(p_drop)

    def forward(self, input):
        """Return per-position scores over the vocabulary for ``input``.

        ``input`` is fed to the embedding layer, so it is expected to hold
        token indices; with ``batch_first=True`` the result has the vocabulary
        dimension last. The final GRU hidden state is discarded.
        """
        embedded = self.word_embedding(input)
        hidden_states, _ = self.layers(embedded)
        return self.proj_layer(self.drop(hidden_states))

# Build the language model from the notebook's hyper-parameters and place it
# on the selected device (nn.Module.to returns the module itself).
model = RNNLM(
    vocab_size=vocab.size,
    embedding_dim=d_emb,
    hidden_dim=d_hid,
    n_layers=n_layers,
    p_drop=p_drop,
).to(device)

# AdamW over every model parameter, using the notebook-level learning rate
# and an explicit epsilon of 1e-8.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)



In [5]:

# Training / validation loop.
#
# Each epoch runs one pass over `trainloader` with gradient updates, then one
# pass over `validloader` without gradients. The model is saved to
# "rnnlm-best.pth" whenever the validation perplexity improves.
#
# `train_loss` is a running sum that is averaged and reset every
# `interval_print` iterations (it intentionally carries across epoch
# boundaries). `valid_loss` is reset at the start of every validation pass so
# that each epoch's average is computed from that epoch's batches only.
n_iter, train_loss, valid_loss, best_ppl = 0, 0., 0., float('inf')
for ep in range(epochs):
    print(f"[{ep}/{epochs}] epochs training...")

    # ---- train ----
    model.train()
    for batch in trainloader:
        n_iter += 1
        # Swap the first two axes of the loader output before use.
        # NOTE(review): presumably this yields (batch, time) to match the
        # batch_first GRU -- confirm against LMDataset.
        batch = batch.transpose(1, 0).contiguous().to(device)

        # Next-token prediction: feed all but the last position and predict
        # the sequence shifted left by one.
        target = batch[:, 1:].clone()
        logits = model(batch[:, :-1])
        # NOTE(review): every target position contributes to the loss; if the
        # data can contain '<pad>' tokens, consider ignore_index -- confirm.
        loss = F.cross_entropy(logits.reshape(-1, vocab.size), target.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        if n_iter % interval_print == 0:
            # Report the mean loss over the last `interval_print` iterations.
            train_loss /= interval_print
            train_ppl = math.exp(train_loss)
            print(f"n_iter:{n_iter} loss: {train_loss:0.3f} ppl: {train_ppl:0.3f}")
            train_loss = 0

    # ---- validate ----
    model.eval()
    # Start from zero each epoch; otherwise the previous epoch's average
    # would be added into this epoch's sum and skew the perplexity.
    valid_loss = 0.
    with torch.no_grad():
        for step, batch in enumerate(validloader, 1):
            batch = batch.transpose(1, 0).to(device)
            target = batch[:, 1:].clone()
            logits = model(batch[:, :-1])
            loss = F.cross_entropy(logits.reshape(-1, vocab.size), target.reshape(-1))
            valid_loss += loss.item()
    # `step` is the number of validation batches seen (enumerate starts at 1).
    valid_loss = valid_loss/step
    valid_ppl = math.exp(valid_loss)

    if valid_ppl < best_ppl:
        best_ppl = valid_ppl
        # Saves the whole module object (not just a state_dict); the
        # inference cell loads it back with torch.load.
        torch.save(model, "rnnlm-best.pth")
        print("### find best mode ###", best_ppl)

    print(f"validation vloss: {valid_loss:0.3f} vppl: {valid_ppl:0.3f}, best ppl: {best_ppl:0.3f}")






In [6]:
# Reload the best checkpoint written by the training loop and score a prompt.
# NOTE: torch.load unpickles arbitrary objects -- only load checkpoint files
# you trust. Recent PyTorch releases default to weights_only=True, which
# rejects whole-module checkpoints like this one; pass weights_only=False
# there if loading fails.
best_model = torch.load('./rnnlm-best.pth', map_location=device)
best_model.eval()  # make sure dropout is disabled for inference

warmup = "I have watched this".lower()
inputs = torch.tensor([vocab.encode_line(warmup, add_eos=False)]).to(device)
with torch.no_grad():
    # Use the loaded checkpoint (not the in-memory `model`, which holds the
    # last-epoch weights). Despite its name, `logits` holds softmax
    # probabilities over the last dimension; the name is kept so that any
    # later cells referring to it keep working.
    logits = F.softmax(best_model(inputs), dim=-1)
