In [15]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import numpy as np
import itertools
import pickle

%autosave 180

Autosaving every 180 seconds


### Recap:
In part one of this tutorial series, we demonstrated the matrix operations used to estimate the hidden states and outputs for the forward pass of a GRU. Based on the poor results we obvioulsy need to optimize our algorithm and test it on a test set to ensure generalizability. This is typically done using several steps/techniques. In this tutorial we will walkthrough what happens under the hood during optimization, specifically calculating the loss function and performing backpropagation through time to update the weights over several epochs. 

### Input text

In [16]:
# This will be our input ---> x
text = 'MathMathMathMathMath'

### Integer representation of inputs

In [28]:
character_list = list(set(text))   # get all of the unique letters in our text variable
vocabulary_size = len(character_list)   # count the number of unique elements
character_dictionary = {'h': 0, 'a': 1, 't': 2, 'M': 3}
# {char:e for e, char in enumerate(character_list)}  # create a dictionary mapping each unique char to a number
encoded_chars = [character_dictionary[char] for char in text] #integer representation of our vocabulary 

### One hot encode 

In [29]:
def one_hot_encode(encoded, vocab_size):
    result = torch.zeros((len(encoded), vocab_size))
    for i, idx in enumerate(encoded):
        result[i, idx] = 1.0
    return result

### Training data

In [30]:
# One hot encode our encoded charactes
batch_size = 2
seq_length = 3
num_samples = (len(encoded_chars) - 1) // seq_length # time lag of 1 for creating the labels
vocab_size = 4

data = one_hot_encode(encoded_chars[:seq_length*num_samples], vocab_size).reshape((num_samples, seq_length, vocab_size))
num_batches = len(data) // batch_size
X = data[:num_batches*batch_size].reshape((num_batches, batch_size, seq_length, vocab_size))
# swap batch_size and seq_length axis to make later access easier
X = X.transpose(1, 2)

### Label encoding

In [31]:
# +1 shift the labels by one so that given the previous letter the char we should predict would be or next char
labels = one_hot_encode(encoded_chars[1:seq_length*num_samples+1], vocab_size) 
y = labels.reshape((num_batches, batch_size, seq_length, vocab_size))
y = y.transpose(1, 2) # transpose the first and second index

### Intitialize weight matrices and bias vectors

In [32]:
torch.manual_seed(1) # reproducibility

####  Define the network parameters:
hiddenSize = 2 # network size, this can be any number (depending on your task)
numClass = 4 # this is the same as our vocab_size

#### Weight matrices for our inputs 
Wz = torch.randn(vocab_size, hiddenSize)
Wr = torch.randn(vocab_size, hiddenSize)
Wh = torch.randn(vocab_size, hiddenSize)

## Intialize the hidden state
# this is for demonstration purposes only, in the actual model it will be initiated during training a loop over the 
# the number of bacthes and updated before passing to the next GRU cell.
h_t_demo = torch.zeros(batch_size, hiddenSize) 

#### Weight matrices for our hidden layer
Uz = torch.randn(hiddenSize, hiddenSize)
Ur = torch.randn(hiddenSize, hiddenSize)
Uh = torch.randn(hiddenSize, hiddenSize)

#### bias vectors for our hidden layer
bz = torch.zeros(hiddenSize)
br = torch.zeros(hiddenSize)
bh = torch.zeros(hiddenSize)

#### Output weights
Wy = torch.randn(hiddenSize, numClass)
by = torch.zeros(numClass)

### Define network

In [33]:
def gru(x, h):
    outputs = []
    for i in range(num_batches):  # this loops over the batches 
        x = X[i]
        for i,sequence in enumerate(x): # iterates over the sequences in each batch
            z = torch.sigmoid(torch.matmul(sequence, Wz) + torch.matmul(h, Uz) + bz)
            r = torch.sigmoid(torch.matmul(sequence, Wr) + torch.matmul(h, Ur) + br)
            h_tilde = torch.tanh(torch.matmul(sequence, Wh) + torch.matmul(r * h, Uh) + bh)
            h = z * h + (1 - z) * h_tilde

            # Linear layer
            y_linear = torch.matmul(h, Wy) + by

            # Softmax activation function
            y_t = F.softmax(y_linear, dim=1)

            outputs.append(y_t)
        return torch.stack(outputs), h
    

### Sample to generate text

In [34]:
def sample(primer, length_chars_predict):
    
    word = primer

    primer_dictionary = [character_dictionary[char] for char in word]
    test_input = one_hot_encode(primer_dictionary, vocab_size)
    

    h = torch.zeros(1, hiddenSize)

    for i in range(length_chars_predict):
        outputs, h = gru(test_input, h)
        choice = np.random.choice(vocab_size, p=outputs[-1][0].numpy())
        word += character_list[choice]
        input_sequence = one_hot_encode([choice],vocab_size)
    return word

In [39]:
# h gets updated and then we calculate for the next 
h_t_1 = []
h = h_t_demo
for i,sequence in enumerate(X[0]):   # iterate over each sequence in the batch to calculate the hidden state h 
    z = torch.sigmoid(torch.matmul(sequence, Wz) + torch.matmul(h, Uz) + bz)
    r = torch.sigmoid(torch.matmul(sequence, Wr) + torch.matmul(h, Ur) + br)
    h_tilde = torch.tanh(torch.matmul(sequence, Wh) + torch.matmul(r * h, Uh) + bh)
    h = z * h + (1 - z) * h_tilde
    h_t_1.append(h)
    print(f'h{i}:{h}')
h_t_1 = torch.stack(h_t_1)

h0:tensor([[ 0.7565, -0.3472],
        [-0.1355, -0.2040]])
h1:tensor([[-0.1535, -0.5712],
        [ 0.7664, -0.5062]])
h2:tensor([[ 0.7495, -0.8616],
        [-0.2399, -0.6680]])


### Training loop

In [40]:
max_epochs = 1  # passes through the data
for e in range(max_epochs):
    h = torch.zeros(batch_size, hiddenSize)
    for i in range(num_batches):
        x_in = X[i]
        y_in = y[i]
        
        out, h = gru(x_in, h)
        print(f'h{h} at i{i}')


        print(sample('Ma',20))

htensor([[ 0.7495, -0.8616],
        [-0.2399, -0.6680]]) at i0
MatMhMhhahhthhMMtMhhhM
htensor([[ 0.7937, -0.8401],
        [-0.2344, -0.6548]]) at i1
MahMMaMtttththhtthaMaM
htensor([[ 0.7943, -0.8410],
        [-0.2364, -0.6575]]) at i2
MaMMMahtthMahhMhthaMth


### What happen here?
As we pointed out in the first tutorial the first couple of strings generated are a bit erratic, but after a few passes it seems to get at least the next two characters correct. However, in order to measure how inconsistent our predictions are versus the true labels, we need a metric. This metric is call the loss function, that measures how well the model is performing. It is a positive value that decreases as the network becomes more confident with it's predictions. This loss function for multiclass classification problems is defined as:

<h1><center>Cross Entropy = $-\frac{1}{N}\sum_{j}^M {y * log(\hat{y}})$<br>


$\text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right)
                   = -x[class] + \log\left(\sum_j \exp(x[j])\right)$

In [36]:
out

tensor([[[0.4662, 0.1189, 0.2620, 0.1529],
         [0.1668, 0.2208, 0.4420, 0.1704]],

        [[0.1353, 0.1514, 0.6092, 0.1041],
         [0.4362, 0.1577, 0.1986, 0.2074]],

        [[0.4401, 0.1330, 0.2603, 0.1666],
         [0.1831, 0.1812, 0.4930, 0.1426]]])

In [37]:
h

tensor([[ 0.7943, -0.8410],
        [-0.2364, -0.6575]])