<a href="https://colab.research.google.com/github/sudotouchwoman/math-misc/blob/main/notebooks/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **RNN & LSTM implementation. Teterin Nikita, Spring 2022**

## **Building character-level model**

Training the network to predict words

In [None]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim

import numpy as np

In [None]:
# Training sequence: the models below are fitted to reproduce this string
# one character at a time.
word = 'ololoasdasddqweqw123456789'
# Shorter alternative sequence:
# word = 'hello'

In [None]:
def seed(seed=10):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed passed to both libraries (defaults to 10).
    """
    torch.manual_seed(seed)
    np.random.seed(seed)


# make the cells below reproducible from the start
seed()

## **Dataset and model**

In [None]:
# the dataset one-hot encodes the letters
# and creates an iterator over the word
# on each iteration, yield current letter as input
# and the next letter as target

class WordDataSet:
    """Character-level view of a single word.

    Every distinct character receives an integer id in order of first
    appearance. Iterating the dataset yields ``(current_id, next_id)``
    pairs, i.e. one pair fewer than there are characters, whereas
    ``len()`` reports the number of characters in the word.
    """

    def __init__(self, word):
        self.chars2idx = {}
        self.indices = []
        for char in word:
            # setdefault only inserts unseen characters; the default is the
            # vocabulary size before insertion, i.e. the next free id
            index = self.chars2idx.setdefault(char, len(self.chars2idx))
            self.indices.append(index)

        self.seq_len = len(word)
        self.vec_size = len(self.chars2idx)

    def get_one_hot(self, idx):
        """Return a ``vec_size``-long float tensor with a 1 at ``idx``."""
        one_hot = torch.zeros(self.vec_size)
        one_hot[idx] = 1
        return one_hot

    def __iter__(self):
        # zip stops at the shorter argument, so the last character is never
        # used as an input
        return zip(self.indices, self.indices[1:])

    def __len__(self):
        return self.seq_len

    def get_char_by_id(self, id):
        """Return the character mapped to ``id`` or ``None`` if there is none."""
        matches = (char for char, index in self.chars2idx.items() if id == index)
        return next(matches, None)

In [None]:
class VanillaRNN(nn.Module):
    """Plain recurrent cell with a linear read-out, evaluated one step at a time."""

    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super().__init__()
        self.x2hidden = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation = nn.Tanh()
        self.outweight = nn.Linear(in_features=hidden_size, out_features=out_size)

    def forward(self, x, prev_hidden, activate=True):
        """Advance the cell by one step.

        Args:
            x: Input for the current step.
            prev_hidden: Hidden state produced by the previous step.
            activate: When False the tanh non-linearity is skipped.

        Returns:
            Tuple ``(output, hidden)`` for the current step.
        """
        pre_activation = self.x2hidden(x) + self.hidden(prev_hidden)
        if activate:
            # without the squashing non-linearity, gradients might explode
            hidden = self.activation(pre_activation)
        else:
            hidden = pre_activation
        return self.outweight(hidden), hidden

## **Training**

In [None]:
# settings that do not depend on the data
e_cnt = 100
criterion = nn.CrossEntropyLoss()

# one-hot inputs and output logits both span the character vocabulary
ds = WordDataSet(word=word)
vocab_size = ds.vec_size
rnn = VanillaRNN(in_size=vocab_size, hidden_size=3, out_size=vocab_size)
optimizer = optim.SGD(rnn.parameters(), lr=0.1, momentum=0.9)

In [None]:
# When True, the training loops call clip_grad_norm_ on the model parameters.
CLIP_GRAD = True
# When True, the training loops print every parameter's gradient each epoch.
VERBOSE = False

In [None]:
seed()

for epoch in range(e_cnt):
    # every pass over the word starts from a zero hidden state
    hh = torch.zeros(rnn.hidden.in_features)
    optimizer.zero_grad()

    # sum the next-character loss over the whole sequence
    loss = .0
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target = torch.tensor([next_sample], dtype=torch.long)

        y, hh = rnn(x, hh)
        loss += criterion(y, target)

    loss.backward()

    report = epoch % 10 == 0
    if report:
        print(loss.data.item())
    if CLIP_GRAD:
        # reporting epochs clip to norm 5, all other epochs to norm 1
        grad_norm = torch.nn.utils.clip_grad_norm_(
            rnn.parameters(), max_norm=5 if report else 1
        )
        if report:
            print("Clip gradient : ", grad_norm)

    if VERBOSE:
        print("Params : ")
        num_params = 0
        for num_params, item in enumerate(rnn.parameters(), start=1):
            print(item.grad)
        print("NumParams :", num_params)
        print("Optimizer call")

    optimizer.step()

71.73495483398438
Clip gradient :  tensor(5.1487)
55.152652740478516
Clip gradient :  tensor(5.7516)
38.997867584228516
Clip gradient :  tensor(9.1299)
31.139690399169922
Clip gradient :  tensor(8.7716)
25.971769332885742
Clip gradient :  tensor(9.2260)
25.782806396484375
Clip gradient :  tensor(13.4613)
23.939054489135742
Clip gradient :  tensor(7.9822)
22.67205238342285
Clip gradient :  tensor(11.9995)
19.75225257873535
Clip gradient :  tensor(6.3413)
21.81279754638672
Clip gradient :  tensor(10.9281)


In [None]:
# Greedy decoding: start from the character with id 0 (the first character
# of the word, since ids are assigned in order of first appearance) and
# repeatedly feed the model its own most likely prediction.
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
char_id = 0  # named char_id so the builtin id() is not shadowed
softmax = nn.Softmax(dim=1)
predword = ds.get_char_by_id(char_id)

# no gradients are needed for inference, so skip building the autograd graph
with torch.no_grad():
    for _ in range(len(word) - 1):
        x = ds.get_one_hot(char_id).unsqueeze(0)
        y, hh = rnn(x, hh)
        y = softmax(y)
        _, best = torch.max(y, 1)
        # convert to a plain int before reusing it as an index / lookup key
        char_id = best.item()
        predword += ds.get_char_by_id(char_id)

print ('Prediction:\t' , predword)
print("Original:\t", word)
# the model is expected to reproduce the training word exactly
assert predword == word

Prediction:	 oaseqweqweqweqweqweqweqweq
Original:	 ololoasdasddqweqw123456789


AssertionError: ignored

## **LSTM model**

One can observe that a vanilla RNN is incapable of remembering long sequences. That is where LSTM and GRU come into play. The intuition behind LSTM is that on each step the model computes the extent to which it should remember (or forget) the history of previous steps and how much attention it should pay to the current input.

Basically, LSTM carries one extra state from step to step, called the cell state $c_t$. It keeps the gradient from vanishing by providing a path whose derivative stays non-zero (or almost non-zero) on each iteration. This is akin to the skip connections we have seen in ResNet, only applied to sequential data (e.g., time series or text).

However, the insides of LSTM are a bit more complex. At high level the math can be described as follows:

$$
  \begin{pmatrix}
  i\\
  f\\
  o\\
  g
  \end{pmatrix} = 
  \begin{pmatrix}
  \sigma\\
  \sigma\\
  \sigma\\
  \tanh
  \end{pmatrix} W
  \begin{pmatrix}
  h_{t-1}\\
  x_t
  \end{pmatrix}\\
  c_t = f \odot c_{t-1} + i \odot g\\
  h_t=o \odot \tanh(c_t)
$$

Elements of the output vector stand for the input, forget (sometimes referred to as remember), output and candidate gates (the latter is sometimes denoted by $c'_t$).

It is convenient to write the operations inside LSTM in concatenated matrix form (a single matrix made up of several submatrices is called a block matrix), with the corresponding functions applied afterwards.

In [None]:
class LSTMBase(nn.Module):
    """One LSTM layer evaluated for a single time step.

    ``proj_size == 1`` acts as the "no projection" marker: the hidden state
    then has ``hidden_size`` features and ``proj`` is an empty
    ``nn.Sequential``. Any other value adds a linear projection of the
    hidden state from ``hidden_size`` to ``proj_size`` features.
    """

    def __init__(self, in_size, hidden_size, proj_size=1, bias=True):
        super().__init__()

        use_projection = proj_size != 1

        self.input = in_size
        self.hidden = proj_size if use_projection else hidden_size
        self.cell = hidden_size
        self.out = proj_size

        # two linear maps, one for the regular input and one for the hidden
        # state; each produces the pre-activations of all four gates at once
        self.Wx = nn.Linear(self.input, 4 * self.cell, bias=bias)
        self.Wh = nn.Linear(self.hidden, 4 * self.cell, bias=bias)
        self.sigma = nn.Sigmoid()
        self.tanh = nn.Tanh()

        # optional projection of the hidden state
        if use_projection:
            self.proj = nn.Linear(hidden_size, proj_size, bias=bias)
        else:
            self.proj = nn.Sequential()
        self.weights_init()

    def weights_init(self):
        """Redraw every parameter uniformly from ``[-1/sqrt(cell), 1/sqrt(cell)]``."""
        bound = np.sqrt(1 / self.cell) if self.cell > 0 else 0
        for param in self.parameters():
            param.data.uniform_(-bound, bound)

    def forward(self, x, hc):
        """Run one step.

        Args:
            x: Input for the current step.
            hc: Tuple ``(h, c)`` with the previous hidden and cell states.

        Returns:
            ``(out, (out, c))`` where ``out`` is the (projected) hidden state
            and ``c`` is the new cell state.
        """
        prev_h, prev_c = hc

        pre_gates = self.Wx(x) + self.Wh(prev_h)
        in_gate, forget_gate, out_gate, candidate = pre_gates.chunk(4, dim=-1)
        in_gate = self.sigma(in_gate)
        forget_gate = self.sigma(forget_gate)
        out_gate = self.sigma(out_gate)
        candidate = self.tanh(candidate)

        c = forget_gate * prev_c + in_gate * candidate
        out = self.proj(out_gate * self.tanh(c))

        # output and hidden state are returned separately; the distinction
        # makes sense when several LSTMBase cells are stacked together
        return out, (out, c)

In [None]:
class LSTM(nn.Module):
    """Stack of ``LSTMBase`` cells applied one after another for a single step.

    Args:
        in_size: Number of input features of the first layer.
        hidden_size: Size of the cell state of every layer.
        proj_size: Size the hidden state is projected to; ``1`` means
            "no projection", in which case the hidden state has
            ``hidden_size`` features.
        num_layers: Number of stacked cells.
        bias: Whether the linear maps inside the cells use a bias term.
    """

    def __init__(self, in_size, hidden_size, proj_size=1, num_layers=1, bias=True):
        super(LSTM, self).__init__()
        self.input = in_size
        self.hidden = hidden_size if proj_size == 1 else proj_size
        self.cell = hidden_size

        layers = []

        for layer in range(num_layers):
            # every layer after the first consumes the previous layer's output
            layer_input = in_size if layer == 0 else self.hidden
            # Forward proj_size itself rather than self.hidden: with the
            # "no projection" marker (proj_size == 1) self.hidden equals
            # hidden_size, which the cell would read as a request for a
            # hidden_size -> hidden_size projection.
            layers += [LSTMBase(layer_input, self.cell, proj_size, bias=bias)]

        self.cells = nn.ModuleList(layers)

    def forward(self, x, hx):
        """Run one time step through all layers.

        Args:
            x: Input for the current step.
            hx: Tuple ``(h, c)`` of hidden and cell state.

        Returns:
            ``(output, (h, c))`` as produced by the last layer.
        """
        # NOTE(review): one (h, c) pair is threaded through all layers, so
        # each layer starts from the state the previous layer just produced
        # rather than from its own previous state - confirm this is intended.
        for cell in self.cells:
            x, hx = cell(x, hx)
        return x, hx

In [None]:
# First LSTM configuration (hidden_size=10). The following cell rebinds
# ds, lstm, criterion, e_cnt and optimizer with a larger model.
e_cnt = 1000
criterion = nn.CrossEntropyLoss()

ds = WordDataSet(word=word)
vocab_size = ds.vec_size
lstm = LSTM(in_size=vocab_size, hidden_size=10, proj_size=vocab_size, num_layers=2)
optimizer = optim.Adam(lstm.parameters(), lr=1e-3, weight_decay=5e-4)

In [None]:
# LSTM configuration with hidden_size=40 and a weaker weight decay
e_cnt = 1000
criterion = nn.CrossEntropyLoss()

ds = WordDataSet(word=word)
vocab_size = ds.vec_size
lstm = LSTM(in_size=vocab_size, hidden_size=40, proj_size=vocab_size, num_layers=2)
optimizer = optim.Adam(lstm.parameters(), lr=1e-3, weight_decay=1e-4)

In [None]:
# display the module tree of the model
lstm

LSTM(
  (cells): ModuleList(
    (0): LSTMBase(
      (Wx): Linear(in_features=17, out_features=160, bias=True)
      (Wh): Linear(in_features=17, out_features=160, bias=True)
      (sigma): Sigmoid()
      (tanh): Tanh()
      (proj): Linear(in_features=40, out_features=17, bias=True)
    )
    (1): LSTMBase(
      (Wx): Linear(in_features=17, out_features=160, bias=True)
      (Wh): Linear(in_features=17, out_features=160, bias=True)
      (sigma): Sigmoid()
      (tanh): Tanh()
      (proj): Linear(in_features=40, out_features=17, bias=True)
    )
  )
)

In [None]:
seed()

for epoch in range(e_cnt):
    # every pass over the word starts from zero hidden and cell states
    hh = torch.zeros(lstm.hidden)
    cc = torch.zeros(lstm.cell)

    loss = .0
    optimizer.zero_grad()

    # sum the next-character loss over the whole sequence
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze_(0)
        target = torch.LongTensor([next_sample])

        y, (hh, cc) = lstm(x, (hh, cc))
        # note the loss values
        loss += criterion(y, target)

    loss.backward()

    # clip_grad_norm_ rescales the gradients in place and returns their norm
    # before clipping; reporting epochs clip to 5, all others to 1
    if epoch % 10 == 0:
        print('Loss:\t', loss.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=5))
    else:
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=1)

    if VERBOSE:
      print("Params : ")
      num_params = 0
      for item in lstm.parameters():
          num_params += 1
          print(item.grad)
      print("NumParams :", num_params)
      print("Optimizer call")

    # The update must come after clipping: stepping first would apply the
    # unclipped gradients, and the clipped ones would be discarded by the
    # next zero_grad().
    optimizer.step()

Loss:	 70.54817962646484
Clip gradient :  tensor(2.6407)
Loss:	 69.99776458740234
Clip gradient :  tensor(2.4340)
Loss:	 69.34283447265625
Clip gradient :  tensor(2.2860)
Loss:	 68.42424774169922
Clip gradient :  tensor(1.9109)
Loss:	 67.38847351074219
Clip gradient :  tensor(2.4701)
Loss:	 65.09686279296875
Clip gradient :  tensor(5.6103)
Loss:	 56.96276092529297
Clip gradient :  tensor(16.7404)
Loss:	 53.17577362060547
Clip gradient :  tensor(32.7584)
Loss:	 51.35645294189453
Clip gradient :  tensor(55.3093)
Loss:	 48.9168586730957
Clip gradient :  tensor(10.5449)
Loss:	 47.04542541503906
Clip gradient :  tensor(49.0878)
Loss:	 45.27079772949219
Clip gradient :  tensor(12.3696)
Loss:	 43.58772659301758
Clip gradient :  tensor(21.7121)
Loss:	 41.91994857788086
Clip gradient :  tensor(11.0562)
Loss:	 40.462646484375
Clip gradient :  tensor(98.0084)
Loss:	 39.14867401123047
Clip gradient :  tensor(118.2272)
Loss:	 37.7833137512207
Clip gradient :  tensor(60.2380)
Loss:	 36.6358604431152

In [None]:
# Greedy decoding: start from the character with id 0 (the first character
# of the word, since ids are assigned in order of first appearance) and
# repeatedly feed the model its own most likely prediction.
lstm.eval()

hh = torch.zeros(lstm.hidden)
cc = torch.zeros(lstm.cell)
char_id = 0  # named char_id so the builtin id() is not shadowed
softmax = nn.Softmax(dim=-1)
predword = ds.get_char_by_id(char_id)

# no gradients are needed for inference, so skip building the autograd graph
with torch.no_grad():
    for _ in range(len(word) - 1):
        x = ds.get_one_hot(char_id).unsqueeze(0)
        y, (hh, cc) = lstm(x, (hh, cc))
        y = softmax(y)
        _, best = torch.max(y, 1)
        # convert to a plain int before reusing it as an index / lookup key
        char_id = best.item()
        predword += ds.get_char_by_id(char_id)

print ('Prediction:\t' , predword)
print("Original:\t", word)
# the model is expected to reproduce the training word exactly
assert predword == word

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


It is funny how the model only managed to overfit on the sequence when trained for 1000 epochs with tuned optimizer parameters. Also, the model only started converging when I substituted SGD with Adam (the impact was huge).
Note that I do not use batch normalization (BN).