<a href="https://colab.research.google.com/github/rajlm10/D2L-Torch/blob/main/D2L_Seq_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install d2l
#Restart runtime

Collecting d2l
  Downloading d2l-0.17.4-py3-none-any.whl (82 kB)
[?25l[K     |████                            | 10 kB 26.3 MB/s eta 0:00:01[K     |████████                        | 20 kB 8.4 MB/s eta 0:00:01[K     |████████████                    | 30 kB 7.5 MB/s eta 0:00:01[K     |████████████████                | 40 kB 3.6 MB/s eta 0:00:01[K     |███████████████████▉            | 51 kB 3.6 MB/s eta 0:00:01[K     |███████████████████████▉        | 61 kB 4.2 MB/s eta 0:00:01[K     |███████████████████████████▉    | 71 kB 4.4 MB/s eta 0:00:01[K     |███████████████████████████████▉| 81 kB 4.8 MB/s eta 0:00:01[K     |████████████████████████████████| 82 kB 584 kB/s 
Collecting pandas==1.2.4
  Downloading pandas-1.2.4-cp37-cp37m-manylinux1_x86_64.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 22.3 MB/s 
[?25hCollecting d2l
  Downloading d2l-0.17.3-py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 618 kB/s 
[?25hCollecting numpy=

# Utils 

In [2]:
import collections
import re
from d2l import torch as d2l
import torch
import math
import random
from torch import nn
from torch.nn import functional as F 


In [3]:
# Register the dataset under the key 'time_machine' as a (URL, hash string) pair so that
# `d2l.download('time_machine')` can find it (the hash is presumably a SHA-1 checksum — TODO confirm).
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt', '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():
  """Read the time machine text and return it as a list of cleaned lines.

  Every run of non-letter characters (digits, punctuation, whitespace, the
  trailing newline) is collapsed into a single space; each line is then
  stripped of surrounding whitespace and lower-cased. Blank lines are kept
  as empty strings, so the result has one entry per line of the file.
  """
  path = d2l.download('time_machine')
  with open(path, 'r') as f:
    raw_lines = f.readlines()
  cleaned = []
  for raw in raw_lines:
    letters_only = re.sub('[^A-Za-z]+', ' ', raw)
    cleaned.append(letters_only.strip().lower())
  return cleaned

# Sanity check: load the cleaned lines, then show how many there are and two samples.
lines = read_time_machine() 
print(f'# text lines: {len(lines)}') 
print(lines[0])
print(lines[10])

Downloading ../data/timemachine.txt from http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt...
# text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


In [4]:
#Tokenization
def tokenize(lines,token='word'):
  """Split each text line into word or character tokens.

  Args:
    lines: Iterable of strings, one per text line.
    token: 'word' to split every line on whitespace, or 'char' to split
      every line into its individual characters.

  Returns:
    A list containing one list of tokens per input line.

  Raises:
    ValueError: If `token` is neither 'word' nor 'char'.
  """
  if token=='word':
    return [line.split() for line in lines]
  if token=='char':
    return [list(line) for line in lines]
  # Fail loudly. Printing a message and returning None (the old behaviour)
  # only surfaced later as an unrelated TypeError when the caller iterated
  # over the missing result.
  raise ValueError('ERROR: unknown token type: ' + token)

In [5]:
class Vocab:
  """Vocabulary mapping tokens to integer indices and back.

  Index 0 is always the unknown token '<unk>'. It is followed by the
  `reserved_tokens` in the order given, and then by the corpus tokens in
  order of decreasing frequency.
  """
  def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
    """Build the vocabulary.

    Args:
      tokens: Flat list of tokens or list of token lists; None means empty.
      min_freq: Corpus tokens occurring fewer than `min_freq` times are left
        out and therefore map to the unknown index.
      reserved_tokens: Extra tokens placed right after '<unk>'; any
        sequence (list or tuple) is accepted.
    """
    if tokens is None:
      tokens = []
    if reserved_tokens is None: 
      reserved_tokens = []
    # Sort according to frequencies (most frequent first)
    counter = count_corpus(tokens)
    self._token_freqs = sorted(counter.items(), key=lambda x: x[1],reverse=True)

    #Build vocab on init
    # `list(...)` lets callers pass the reserved tokens as a tuple as well
    self.idx_to_token=['<unk>']+list(reserved_tokens) #List
    self.token_to_idx={token:idx for idx,token in enumerate(self.idx_to_token)} #Dict

    for token,freq in self._token_freqs:
      #Don't include tokens with freq<min_freq in the vocab.
      #The list is sorted, so every remaining token is too rare as well.
      if freq<min_freq:
        break
      # Membership is tested on the dict (constant time). Scanning the
      # `idx_to_token` list here made construction quadratic in vocab size.
      if token not in self.token_to_idx:
        self.idx_to_token.append(token)
        self.token_to_idx[token]=len(self.idx_to_token)-1

  def __len__(self):
    """Number of entries, including '<unk>' and the reserved tokens."""
    return len(self.idx_to_token)

  @property
  def unk(self):
    # Index for the unknown token
    return 0
  
  @property
  def token_freqs(self):
    # List of (token, frequency) pairs sorted by decreasing frequency
    return self._token_freqs
  
  def __getitem__(self,tokens):
    """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

    Tokens that are not in the vocabulary map to the unknown index.
    """
    if not isinstance(tokens, (list, tuple)):
      return self.token_to_idx.get(tokens, self.unk) 
    #If tokens is a list 
    return [self.__getitem__(token) for token in tokens]

  def to_tokens(self, indices):
    """Map an index, or a list/tuple of indices, back to token(s)."""
    if not isinstance(indices, (list, tuple)):
      return self.idx_to_token[indices]
    #If indices is a list
    return [self.idx_to_token[index] for index in indices]


def count_corpus(tokens):
  """Tally how often each token occurs.

  `tokens` is either a flat list of tokens or a list of token lists (one
  per line). Nested input, detected by looking at the first element, is
  flattened before counting. Returns a `collections.Counter`.
  """
  is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
  if not is_nested:
    # Already a flat list of tokens: count it directly
    return collections.Counter(tokens)
  flat = []
  for line in tokens:
    flat.extend(line)
  return collections.Counter(flat)


In [6]:
def load_corpus_time_machine(max_tokens=-1,level='char'):
  """Build the flat index sequence and the vocabulary of the time machine text.

  Args:
    max_tokens: When positive, keep only the first `max_tokens` indices.
    level: Token granularity handed to `tokenize` ('char' or 'word').

  Returns:
    A `(corpus, vocab)` pair, where `corpus` is one flat list of token
    indices covering all lines in order.
  """
  tokenized_lines = tokenize(read_time_machine(), level)
  vocab = Vocab(tokenized_lines)
  # The text is treated as one long sequence, so line boundaries are dropped
  corpus = []
  for line in tokenized_lines:
    for tok in line:
      corpus.append(vocab[tok])
  if max_tokens > 0:
    return corpus[:max_tokens], vocab
  return corpus, vocab


In [7]:
import random
def seq_data_iter_random(corpus, batch_size, num_steps):
  """Yield (X, Y) minibatches of subsequences drawn in random order.

  Each X holds `batch_size` subsequences of `num_steps` tokens, and Y holds
  the same subsequences shifted one token to the right (the labels).
  Subsequences in consecutive minibatches are generally not adjacent in
  the original sequence.
  """
  # Drop a random-length head of 0 to `num_steps - 1` tokens (both ends
  # inclusive) so that the partition differs from call to call
  head = random.randint(0, num_steps - 1)
  corpus = corpus[head:]
  # One token is held back because every input position needs a label
  num_subseqs = (len(corpus) - 1) // num_steps
  # Start positions of the non-overlapping subsequences, in random order
  starts = [k * num_steps for k in range(num_subseqs)]
  random.shuffle(starts)

  num_batches = num_subseqs // batch_size
  for b in range(num_batches):
    batch_starts = starts[b * batch_size: (b + 1) * batch_size]
    X = [corpus[s: s + num_steps] for s in batch_starts]
    Y = [corpus[s + 1: s + 1 + num_steps] for s in batch_starts]
    yield torch.tensor(X), torch.tensor(Y)

In [8]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
  """Yield (X, Y) minibatches whose rows continue from one batch to the next.

  The corpus is cut into `batch_size` equally long contiguous rows. Each
  minibatch is the next `num_steps`-wide window over all rows, so row i of
  one minibatch is followed directly by row i of the next one. Y is X
  shifted one token to the right.
  """
  # Skip a random number of leading tokens (0 to `num_steps`, inclusive)
  offset = random.randint(0, num_steps)
  # Largest multiple of `batch_size` that still leaves one token for the label
  usable = ((len(corpus) - offset - 1) // batch_size) * batch_size
  inputs = torch.tensor(corpus[offset: offset + usable]).reshape(batch_size, -1)
  labels = torch.tensor(corpus[offset + 1: offset + 1 + usable]).reshape(batch_size, -1)
  num_batches = inputs.shape[1] // num_steps
  for b in range(num_batches):
    lo = b * num_steps
    hi = lo + num_steps
    yield inputs[:, lo:hi], labels[:, lo:hi]

In [9]:
class SeqDataLoader:
  """Re-iterable source of (X, Y) minibatches from the time machine corpus.

  Every `iter(...)` call starts a fresh generator, so the loader can be
  walked once per epoch.
  """
  def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
    """Load the corpus (limited by `max_tokens`) and pick the sampling scheme.

    `use_random_iter` selects random sampling; otherwise sequential
    partitioning is used.
    """
    self.data_iter_fn = seq_data_iter_random if use_random_iter else seq_data_iter_sequential
    corpus, vocab = load_corpus_time_machine(max_tokens)
    self.corpus = corpus
    self.vocab = vocab
    self.batch_size = batch_size
    self.num_steps = num_steps

  def __iter__(self):
    # Delegate to the generator function chosen in __init__
    return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

In [10]:
def load_data_time_machine(batch_size, num_steps, use_random_iter=False, max_tokens=10000):
  """Create the minibatch loader for the time machine text.

  Returns a `(loader, vocab)` pair, where `vocab` is the vocabulary the
  loader built while reading the corpus.
  """
  loader = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
  vocab = loader.vocab
  return loader, vocab

# RNNs 

In [11]:
def predict(prefix, num_preds, net, vocab, device): 
  """Generate `num_preds` new tokens following the `prefix`.

  Args:
    prefix: Non-empty sequence of tokens (for a character-level vocabulary,
      a string) used to warm up the hidden state.
    num_preds: Number of tokens to generate after the prefix.
    net: Model providing `begin_state(batch_size=..., device=...)` and
      returning `(output, state)` when called with `(inputs, state)`.
    vocab: Vocabulary supporting `vocab[token]` and `vocab.idx_to_token`.
    device: Device on which the input tensors are created.

  Returns:
    The prefix tokens followed by the generated tokens, joined into a string.
  """
  state = net.begin_state(batch_size=1, device=device) 
  outputs = [vocab[prefix[0]]] # gives [token]
  # The most recent token, shaped (batch_size=1, num_steps=1)
  get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1)) 
  # Generation is pure inference. Without `no_grad` every step would extend
  # an autograd graph through the carried `state`.
  with torch.no_grad():
    for y in prefix[1:]: # Warm-up period
      _, state = net(get_input(), state)
      outputs.append(vocab[y]) #Append input text i.e is prefix

    for _ in range(num_preds): # Predict `num_preds` steps
      y, state = net(get_input(), state) #Keep updating state and input
      # Greedy decoding: take the highest-scoring token
      outputs.append(int(y.argmax(dim=1).reshape(1))) 
  return ''.join([vocab.idx_to_token[i] for i in outputs])

In [12]:
def grad_clipping(net, theta):
  """Rescale gradients in place so that their global L2 norm is at most `theta`.

  Args:
    net: Either an `nn.Module` (its trainable parameters are used) or any
      object exposing its parameters as `net.params`.
    theta: Maximum allowed L2 norm over all gradients together.

  Parameters without a gradient are skipped. If no parameter has a gradient,
  the function does nothing.
  """
  if isinstance(net, nn.Module):
    params = [p for p in net.parameters() if p.requires_grad] 
  else:
    params = net.params
  # A parameter that took no part in the last backward pass has `grad is None`;
  # squaring it would raise a TypeError.
  grads = [p.grad for p in params if p.grad is not None]
  if not grads:
    # `sum()` of nothing is the int 0, which `torch.sqrt` cannot handle
    return
  norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
  if norm > theta:
    scale = theta / norm
    for g in grads:
      # In-place so the optimizer sees the clipped values; unlike `g[:] *= ...`
      # this also works for 0-dimensional parameters
      g.mul_(scale)

In [13]:
def train_epoch(net, train_iter, loss, optimizer, device, use_random_iter): 
  """Train `net` for one pass over `train_iter`.

  Args:
    net: Model providing `begin_state(batch_size=..., device=...)` and
      returning `(y_hat, state)` when called with `(X, state)`.
    train_iter: Iterable of `(X, Y)` minibatches.
    loss: Callable `loss(y_hat, y)`; its result is averaged with `.mean()`.
    optimizer: A `torch.optim.Optimizer`, or any callable updater invoked
      as `optimizer(batch_size=1)` after the gradients have been clipped.
      A callable updater must reset the gradients itself.
    device: Device that inputs, labels and the initial state are moved to.
    use_random_iter: If True the hidden state is re-initialised for every
      minibatch. Otherwise it is carried over and detached from the graph.

  Returns:
    `(perplexity, tokens_per_second)` for this epoch.
  """
  state, timer = None, d2l.Timer()
  metric = d2l.Accumulator(2) # Sum of training loss, no. of tokens
  for X, Y in train_iter:
    if state is None or use_random_iter:
      # Initialize `state` when either it is the first iteration or 
      # using random sampling
      state = net.begin_state(batch_size=X.shape[0], device=device)
    else:
      # Carry the state over, but cut the graph so that backpropagation
      # stops at the minibatch boundary
      if isinstance(net, nn.Module) and not isinstance(state, tuple):
        # `state` is a tensor for `nn.GRU`
        state.detach_() 
      else:
        # `state` is a tuple of tensors for `nn.LSTM` and
        # for our custom scratch implementation
        for s in state:
          s.detach_() 
    # Flatten the labels time-step-major to match the rows of `y_hat`
    y = Y.T.reshape(-1)
    X, y = X.to(device), y.to(device)
    y_hat, state = net(X, state)
    l = loss(y_hat, y.long()).mean()
    
    if isinstance(optimizer, torch.optim.Optimizer):
      optimizer.zero_grad() 
      l.backward() 
      grad_clipping(net, 1) 
      optimizer.step()
    else:
      # Custom updater. Without this branch the loss was computed but never
      # back-propagated or applied, so the model silently did not train.
      l.backward()
      grad_clipping(net, 1)
      # The loss is already a mean, so no further batch-size scaling is needed
      optimizer(batch_size=1)

    metric.add(l * y.numel(), y.numel())
  # exp(mean per-token loss) is the perplexity
  return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()

In [14]:
def train(net, train_iter, vocab,loss, optimizer, num_epochs, device, use_random_iter=False):
  """Run `num_epochs` training epochs and report progress every 50th epoch.

  Each report prints the perplexity and throughput of that epoch. `vocab`
  is accepted but not used by this function.
  """
  report_every = 50
  for epoch_num in range(1, num_epochs + 1):
    ppl, speed = train_epoch(net, train_iter, loss, optimizer, device, use_random_iter)
    if epoch_num % report_every != 0:
      continue
    print(f'perplexity {ppl:.3f}, {speed:.1f} tokens/sec on {str(device)}') 

In [15]:
# Minibatch shape used by every model below: 32 sequences of 35 time steps each.
batch_size, num_steps = 32, 35
# The defaults apply here: sequential partitioning over the first 10000 tokens.
train_iter, vocab = load_data_time_machine(batch_size, num_steps)

In [16]:
class RNNModel(nn.Module):
  """Language model: a recurrent layer followed by a linear layer over the vocabulary.

  Token indices are one-hot encoded, run through `rnn_layer`, and every
  time step's hidden output is projected to `vocab_size` scores.
  """
  def __init__(self, rnn_layer, vocab_size, **kwargs):
    """Wrap `rnn_layer` (an `nn.RNN`, `nn.GRU` or `nn.LSTM`) for a vocabulary of `vocab_size`."""
    super(RNNModel, self).__init__(**kwargs)
    self.rnn = rnn_layer #Can be a RNN,GRU or LSTM
    self.vocab_size = vocab_size
    self.num_hiddens = self.rnn.hidden_size

    # A bidirectional layer concatenates both directions, so the linear
    # layer sees twice as many features; `num_directions` is 2 then, else 1.
    if not self.rnn.bidirectional:
      self.num_directions = 1
      self.linear = nn.Linear(self.num_hiddens, self.vocab_size)
    else:
      self.num_directions = 2
      self.linear = nn.Linear(self.num_hiddens * 2, self.vocab_size)

  def forward(self, inputs, state):
    """Score the next token at every position.

    Args:
      inputs: Token indices of shape (`batch_size`, `num_steps`).
      state: Hidden state as produced by `begin_state` or a previous call.

    Returns:
      `(output, state)`, where `output` has shape
      (`num_steps` * `batch_size`, `vocab_size`) and `state` is the updated
      hidden state.
    """
    X = F.one_hot(inputs.T.long(), self.vocab_size) #inputs.T=num_steps X batch_size
    X = X.to(torch.float32)
    Y, state = self.rnn(X, state)
    # The fully connected layer first flattens `Y` to
    # (`num_steps` * `batch_size`, feature size).
    # Its output shape is (`num_steps` * `batch_size`, `vocab_size`).
    output = self.linear(Y.reshape((-1, Y.shape[-1])))
    return output, state

  def begin_state(self, device, batch_size=1): 
    """Return an all-zero initial hidden state on `device`.

    The result is a single tensor for `nn.RNN` / `nn.GRU` and an `(H, C)`
    tuple of two tensors for `nn.LSTM`. Each tensor has shape
    (`num_directions` * `num_layers`, `batch_size`, `num_hiddens`).
    """
    shape = (self.num_directions * self.rnn.num_layers, batch_size, self.num_hiddens)
    if not isinstance(self.rnn, nn.LSTM):
      # `nn.GRU` takes a tensor as hidden state
      return torch.zeros(shape, device=device)
    # `nn.LSTM` takes a tuple of hidden states, one for H and one for C.
    # The `return` was missing before, so this branch yielded None.
    return (torch.zeros(shape, device=device),
            torch.zeros(shape, device=device))


In [17]:
# Pick the compute device (presumably a GPU when one is available, else the CPU — see d2l.try_gpu).
device = d2l.try_gpu()
num_hiddens=256
# Vanilla RNN. Its input size is len(vocab) because RNNModel feeds it one-hot vectors.
net = RNNModel(nn.RNN(len(vocab), num_hiddens), vocab_size=len(vocab))
net = net.to(device)
# Sample from the untrained model as a baseline for the trained output further down.
predict('time traveller', 10, net, vocab, device)

'time travellerbbbbpppppp'

In [18]:
# `num_epochs` and `lr` are module-level settings that the later training cells reuse.
num_epochs, lr = 500, 1

loss=nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr)
# `train` prints perplexity and throughput once every 50 epochs.
train(net, train_iter, vocab,loss,optimizer, num_epochs, device)

perplexity 7.522, 112556.6 tokens/sec on cuda:0
perplexity 3.596, 105103.9 tokens/sec on cuda:0
perplexity 2.056, 107354.2 tokens/sec on cuda:0
perplexity 1.592, 109778.3 tokens/sec on cuda:0
perplexity 1.518, 107961.8 tokens/sec on cuda:0
perplexity 1.445, 107909.1 tokens/sec on cuda:0
perplexity 1.324, 106752.0 tokens/sec on cuda:0
perplexity 1.418, 107126.3 tokens/sec on cuda:0
perplexity 1.346, 107646.7 tokens/sec on cuda:0
perplexity 1.299, 107497.7 tokens/sec on cuda:0


In [19]:
# Generate 50 characters with the trained RNN.
predict('time traveller', 50, net, vocab, device)

'time travellerit would be remarkably convenient for the time som'

# GRU

In [20]:
# Same wrapper as before, now around a single-layer GRU.
gru_layer = nn.GRU(len(vocab), num_hiddens)
model = RNNModel(gru_layer, len(vocab))
model = model.to(device)
# Baseline sample from the untrained GRU.
predict('time traveller', 50, model, vocab, device)


'time travellerxxowwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww'

In [21]:
# A fresh optimizer is needed because `model` has new parameters.
loss=nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr)
train(model, train_iter, vocab,loss,optimizer, num_epochs, device)

perplexity 9.854, 124438.6 tokens/sec on cuda:0
perplexity 7.817, 121533.1 tokens/sec on cuda:0
perplexity 6.028, 120107.3 tokens/sec on cuda:0
perplexity 3.765, 122578.9 tokens/sec on cuda:0
perplexity 1.568, 120465.7 tokens/sec on cuda:0
perplexity 1.105, 122215.3 tokens/sec on cuda:0
perplexity 1.064, 117252.2 tokens/sec on cuda:0
perplexity 1.051, 121475.4 tokens/sec on cuda:0
perplexity 1.055, 121212.1 tokens/sec on cuda:0
perplexity 1.044, 120371.6 tokens/sec on cuda:0


In [22]:
# Generate 50 characters with the trained GRU.
predict('time traveller', 50, model, vocab, device)


'time traveller for so it will be convenient to speak of himwas e'

# LSTM

In [23]:
# Single-layer LSTM; `model` is rebound, replacing the GRU model.
lstm_layer = nn.LSTM(len(vocab), num_hiddens)
model = RNNModel(lstm_layer, len(vocab))
model = model.to(device)
# Baseline sample from the untrained LSTM.
predict('time traveller', 50, model, vocab, device)


'time travellerbbeeeneneeneeneeneeneeneeneeneeneeneeneeneeneeneen'

In [24]:
# A fresh optimizer is needed because `model` has new parameters.
loss=nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr)
train(model, train_iter, vocab,loss,optimizer, num_epochs, device)

perplexity 11.172, 100491.4 tokens/sec on cuda:0
perplexity 8.553, 99860.4 tokens/sec on cuda:0
perplexity 6.553, 100810.6 tokens/sec on cuda:0
perplexity 4.553, 100541.7 tokens/sec on cuda:0
perplexity 2.436, 99911.6 tokens/sec on cuda:0
perplexity 1.348, 99873.9 tokens/sec on cuda:0
perplexity 1.132, 100040.1 tokens/sec on cuda:0
perplexity 1.066, 101240.2 tokens/sec on cuda:0
perplexity 1.042, 101301.9 tokens/sec on cuda:0
perplexity 1.042, 99982.3 tokens/sec on cuda:0


In [25]:
# Generate 50 characters with the trained LSTM.
predict('time traveller', 50, model, vocab, device)


'time traveller for so it will be convenient to speak of himwas e'

# Stacked LSTM 

In [26]:
# `num_stacks` is passed as the third positional argument of nn.LSTM (the number of stacked layers).
num_stacks=2
lstm_layer = nn.LSTM(len(vocab), num_hiddens,num_stacks)
model = RNNModel(lstm_layer, len(vocab))
model = model.to(device)
# Baseline sample from the untrained stacked LSTM.
predict('time traveller', 50, model, vocab, device)


'time travelleraaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'

In [27]:
loss=nn.CrossEntropyLoss()
# This rebinds the module-level `lr`; it stays at 2 until a later cell reassigns it.
lr=2 # larger learning rate for the deeper (2-layer) architecture
optimizer = torch.optim.SGD(model.parameters(), lr)
train(model, train_iter, vocab,loss,optimizer, num_epochs, device)

perplexity 14.547, 79689.4 tokens/sec on cuda:0
perplexity 9.229, 80013.5 tokens/sec on cuda:0
perplexity 5.512, 79797.5 tokens/sec on cuda:0
perplexity 1.752, 79467.6 tokens/sec on cuda:0
perplexity 1.108, 79289.1 tokens/sec on cuda:0
perplexity 1.053, 78335.3 tokens/sec on cuda:0
perplexity 1.039, 79101.7 tokens/sec on cuda:0
perplexity 1.035, 79150.1 tokens/sec on cuda:0
perplexity 1.024, 79693.8 tokens/sec on cuda:0
perplexity 1.029, 79261.3 tokens/sec on cuda:0


In [28]:
# Generate 50 characters with the trained stacked LSTM.
predict('time traveller', 50, model, vocab, device)


'time travelleryou can show black is white by argument said filby'

# Bidirectional LSTM

## Why you shouldn't use them for language modelling
One of the key features of a bidirectional RNN is that information from both ends of the sequence is used to estimate the output. That is, we use information from both future and past observations to predict the current one. In the case of next token prediction this is not quite what we want.


If we were to ignore all advice regarding the fact that bidirectional RNNs use past and future data and simply applied them to language modelling, we would get estimates with acceptable perplexity. Nonetheless, the ability of the model to predict future tokens is severely compromised, as the experiment below illustrates. **Despite reasonable perplexity, the model generates only gibberish even after many iterations. We include the code below as a cautionary example against using bidirectional RNNs in the wrong context.**

In [29]:
# Two-layer bidirectional LSTM. RNNModel doubles the input size of its linear layer
# for a bidirectional layer, so no other change is needed.
num_stacks=2
lstm_layer = nn.LSTM(len(vocab), num_hiddens,num_stacks,bidirectional=True)
model = RNNModel(lstm_layer, len(vocab))
model = model.to(device)
# Baseline sample from the untrained bidirectional LSTM.
predict('time traveller', 50, model, vocab, device)


'time travellerkdkdddkdddkdddkdddkdddkdddkdddkdddkdddkdddkdddkddd'

In [30]:
loss=nn.CrossEntropyLoss()
# Reset the module-level learning rate to 1 (the stacked-LSTM cell had raised it to 2).
lr=1
optimizer = torch.optim.SGD(model.parameters(), lr)
train(model, train_iter, vocab,loss,optimizer, num_epochs, device)

perplexity 3.377, 41536.6 tokens/sec on cuda:0
perplexity 1.290, 42050.4 tokens/sec on cuda:0
perplexity 1.202, 41665.1 tokens/sec on cuda:0
perplexity 1.175, 41928.9 tokens/sec on cuda:0
perplexity 1.158, 42151.3 tokens/sec on cuda:0
perplexity 1.138, 41602.1 tokens/sec on cuda:0
perplexity 1.134, 41856.9 tokens/sec on cuda:0
perplexity 1.138, 41990.1 tokens/sec on cuda:0
perplexity 1.113, 41987.4 tokens/sec on cuda:0
perplexity 1.107, 41585.2 tokens/sec on cuda:0


In [31]:
# Generate 50 characters with the trained bidirectional LSTM (expected to be gibberish; see the note above this section's code).
predict('time traveller', 50, model, vocab, device)


'time travellerererererererererererererererererererererererererer'