# PyTorch🔥Words Prediction Demo from Wikitext2


Subject: Building a demo word prediction model.

Data: WikiText-2 via torchtext 

Procedure:
- Creating a vocabulary as simple set of distinct words
- Tokenizing simply via '...'.split()
- Creating a tensor from the indices of the flat word list
  Train is tensor of indices of 35 subsequent words, Target is the same but with one word further. (e.g. 0, 1, ..., 34 -> 1, 2, ..., 35)
- RNN Model with Long short-term memory (LSTM) and embeddings layer. Using torch.nn.Dropout, torch.nn.Embedding, torch.nn.LSTM, torch.nn.Linear, and torch.nn.functional.log_softmax.
- Training with torch.nn.NLLLoss, no optimizer (?)
- Evaluation by generating some words
- Disappointing results (to be expected with that small dataset)

Others:
- CUDA support
- working on Colab with Google Drive for saving/loading interim stages

Sources used:
- https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py

## Bootstrap and Imports

In [20]:
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {DEVICE}')

if IN_COLAB := 'google.colab' in str(get_ipython()):
  NUM_EPOCHS = 40
  !pip install torchviz
  !pip install portalocker

else:
  NUM_EPOCHS = 2

Running on cpu


In [21]:
from __future__ import print_function
import random
from tqdm.auto import tqdm
import numpy as np
from collections.abc import Callable
import os
import math

from IPython.display import HTML, Image

import time
import matplotlib.animation as animation
import matplotlib.pyplot as plt
#%matplotlib inline

import torch
from torch import nn
import torchtext
from torchvision import transforms
from torchvision import datasets
from torchvision.utils import make_grid
from torchvision import utils
from torch.utils.data import DataLoader
from torch.nn.modules.loss import _Loss
from torchviz import make_dot
import torch.nn.functional as F

my_seed = 123
random.seed(my_seed)
torch.manual_seed(my_seed)

<torch._C.Generator at 0x1f0d7db0a70>

## Constants

In [22]:
BATCH_SIZE = 20
BATCH_SIZE_EVALUATION = 10
LEARNING_RATE = 20
SEQUENCE_LENGTH = 35
GRADIENT_CLIPPING = 0.25
LOG_INTERVAL = 10

DRY_RUN = False  # verify the code and the model

## Corpus

In [23]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)
        
class Corpus(object):
    def __init__(self, device):
        self.dictionary = Dictionary()
        self.train = self.tokenize(torchtext.datasets.WikiText2(split='train')).to(device)
        self.valid = self.tokenize(torchtext.datasets.WikiText2(split='valid')).to(device)
        self.test = self.tokenize(torchtext.datasets.WikiText2(split='test')).to(device)
    
    def tokenize(self, iter):
        # Add words to the dictionary
        for line in iter:
            words = line.split() + ['<eos>']
            for word in words:
                self.dictionary.add_word(word)

        idss = []
        for line in iter:
            words = line.split() + ['<eos>']
            ids = []
            for word in words:
                ids.append(self.dictionary.word2idx[word])
            idss.append(torch.tensor(ids).type(torch.int64))
        ids = torch.cat(idss)  # [2088628] / [217646] / [245569]

        return ids

In [24]:
corpus = Corpus(device=DEVICE)

In [25]:
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data: torch.Tensor, batch_size: int) -> torch.Tensor:
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the bsz batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data.to(DEVICE)


In [26]:
train_data = batchify(corpus.train, BATCH_SIZE)  # [104431, 20]  torch.int64
val_data = batchify(corpus.valid, BATCH_SIZE_EVALUATION)  # [21764, 10]  torch.int64
test_data = batchify(corpus.test, BATCH_SIZE_EVALUATION)  # [24556, 10]  torch.int64

## RNN Model

In [27]:
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, 
                 ntoken, 
                 rnn_type='LSTM', 
                 ninp=200,  # size of word embeddings 
                 nhid=200,  # number of hidden units per layer 
                 nlayers=2, 
                 dropout=0.2, 
                 tie_weights=True,  # tie the word embedding and softmax weights
                ):
        super(RNNModel, self).__init__()
        self.ntoken = ntoken
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(num_embeddings=ntoken,   # size of the dictionary of embeddings
                                    embedding_dim=ninp)  # size of each embedding vector
        
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        decoded = decoded.view(-1, self.ntoken)
        return F.log_softmax(decoded, dim=1), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))

In [28]:
ntokens = len(corpus.dictionary)
model = RNNModel(ntoken=ntokens).to(DEVICE).to(DEVICE)
criterion = nn.NLLLoss()

## Training Helpers

In [29]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""

    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [30]:
# get_batch subdivides the source data into chunks of length SEQUENCE_LENGTH.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    seq_len = min(SEQUENCE_LENGTH, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [31]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(BATCH_SIZE_EVALUATION)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, SEQUENCE_LENGTH):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            total_loss += len(data) * criterion(output, targets).item()
    return total_loss / (len(data_source) - 1)

## Training

In [32]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(BATCH_SIZE)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, SEQUENCE_LENGTH)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIPPING)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-LEARNING_RATE)

        total_loss += loss.item()

        if batch % LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss / LOG_INTERVAL
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // SEQUENCE_LENGTH, LEARNING_RATE,
                elapsed * 1000 / LOG_INTERVAL, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if DRY_RUN:
            break

In [33]:
for epoch in range(NUM_EPOCHS):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {(time.time() - epoch_start_time):5.2f}s | valid loss {val_loss:5.2f} | '
          f'  valid ppl {math.exp(val_loss):8.2f}')
    print('-' * 89)

| epoch   0 |    10/ 2983 batches | lr 20.00 | ms/batch 409.94 | loss 10.56 | ppl 38413.97
| epoch   0 |    20/ 2983 batches | lr 20.00 | ms/batch 360.66 | loss  8.30 | ppl  4026.61
| epoch   0 |    30/ 2983 batches | lr 20.00 | ms/batch 349.42 | loss  8.00 | ppl  2966.53
| epoch   0 |    40/ 2983 batches | lr 20.00 | ms/batch 359.22 | loss  7.82 | ppl  2499.71
| epoch   0 |    50/ 2983 batches | lr 20.00 | ms/batch 346.90 | loss  7.73 | ppl  2264.96
| epoch   0 |    60/ 2983 batches | lr 20.00 | ms/batch 337.15 | loss  7.69 | ppl  2177.92
| epoch   0 |    70/ 2983 batches | lr 20.00 | ms/batch 341.78 | loss  7.58 | ppl  1952.22
| epoch   0 |    80/ 2983 batches | lr 20.00 | ms/batch 342.28 | loss  7.53 | ppl  1865.86
| epoch   0 |    90/ 2983 batches | lr 20.00 | ms/batch 349.00 | loss  7.49 | ppl  1798.14
| epoch   0 |   100/ 2983 batches | lr 20.00 | ms/batch 343.89 | loss  7.50 | ppl  1800.64
| epoch   0 |   110/ 2983 batches | lr 20.00 | ms/batch 346.67 | loss  7.37 | ppl  1582.46

KeyboardInterrupt: 

## Evaluation

In [None]:
# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print(f'| End of training | test loss {test_loss: 5.2f} | test ppl {math.exp(test_loss): 8.2f}')
print('=' * 89)

In [None]:
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))

In [None]:
print(f'| End of training | test loss {test_loss: 5.2f} | test ppl {math.exp(test_loss): 8.2f}')

# Generation

In [None]:
WORDS_COUNT = 1000  # number of words to generate
LOG_INTERVAL_GENERATION = 100
TEMPERATURE = 1.0  # higher will increase diversity

In [None]:
model.eval()

hidden = model.init_hidden(1)
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(DEVICE)

words_output = ''

with torch.no_grad():  # no tracking history
    for i in range(WORDS_COUNT):

        output, hidden = model(input, hidden)
        word_weights = output.squeeze().div(TEMPERATURE).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.fill_(word_idx)

        word = corpus.dictionary.idx2word[word_idx]

        words_output += (word + ('\n' if i % 20 == 19 else ' '))

        if i % LOG_INTERVAL_GENERATION == 0:
            print('| Generated {}/{} words'.format(i, WORDS_COUNT))

print(words_output)