# Homework 2 - Recurrent Neural Networks

In this part of the homework we are going to work with Recurrent Neural Networks, in particular GRU. One of the greatest things that Recurrent Neural Networks can do when working with sequences is retaining data from several timesteps in the past. We are going to explore that property by constructing an 'echo' Recurrent Neural Network.

The goal here is to make a model that given a sequence of letters or digits will output that same sequence, but with a certain delay. Let's say the input is a string 'abacaba', we want the model to not output anything for 3 steps (delay length), and then output the original string step by step, except the last 3 characters. So, target output is then 'XXXabac', where 'X' is empty output.

This is similar to [this notebook](https://github.com/Atcold/pytorch-Deep-Learning/blob/master/09-echo_data.ipynb) (which you should refer to when doing this assignment), except we're working not with a binary string, but with a sequence of integers between 0 and some N. In our case N is 26, which is the number of letters in the alphabet.

## Dataset

Let's implement the dataset. In our case, the data is basically infinite, as we can always generate more examples on the fly, so we don't need to load anything from disk.

In [None]:
import random
import string

import torch

# Max value of the generated integer. 26 is chosen because it's
# the number of letters in the English alphabet.
N = 26


def idx_to_onehot(x, k=N+1):
    """Convert a tensor of integer indices to one-hot vectors.

    Args:
        x: tensor of integer class indices of any shape; values must lie
            in ``[0, k)``.
        k: width of the one-hot encoding (defaults to ``N + 1`` so that
            index 0 can stand for "no output").

    Returns:
        A float tensor of shape ``x.shape + (k,)`` with a single 1 per
        index along the last dimension.
    """
    identity = torch.eye(k)
    # reshape (rather than view) so that non-contiguous inputs also work.
    flat_indices = x.reshape(-1).to(torch.int64)
    rows = identity.index_select(0, flat_indices)
    return rows.view(*x.shape, k)


class EchoDataset(torch.utils.data.IterableDataset):
    """Endless-style dataset of random sequences and their delayed echoes.

    Each sample is a pair ``(seq, result)`` of int64 tensors of length
    ``seq_length``. ``seq`` holds random integers in ``[1, N]``;
    ``result`` is ``seq`` shifted right by ``delay`` positions, padded
    with zeros at the front.
    """

    def __init__(self, delay=4, seq_length=15, size=1000):
        self.delay = delay
        self.seq_length = seq_length
        self.size = size

    def __len__(self):
        return self.size

    def __iter__(self):
        """Generate ``size`` random ``(seq, result)`` pairs on the fly.

        An iterable dataset doesn't have to implement ``__getitem__``;
        returning a generator from ``__iter__`` is enough.
        """
        # Number of leading symbols that are still visible after the shift.
        # Clamped at zero so that a delay longer than the sequence yields an
        # all-zero target of the right length instead of an over-long one.
        kept = max(self.seq_length - self.delay, 0)
        for _ in range(self.size):
            seq = torch.tensor(
                [random.randint(1, N) for _ in range(self.seq_length)],
                dtype=torch.int64,
            )
            result = torch.zeros(self.seq_length, dtype=torch.int64)
            result[self.delay:] = seq[:kept]
            yield seq, result

# Fixed echo delay (in time steps) used by the dataset below.
DELAY = 4
# Number of samples the dataset yields in one pass.
DATASET_SIZE = 200000
# seq_length is left at the EchoDataset default.
ds = EchoDataset(delay=DELAY, size=DATASET_SIZE)

## Model

Now, we want to implement the model. For our purposes, we want to use a GRU. The architecture consists of a GRU and a decoder. The decoder is responsible for decoding the GRU hidden state to yield a prediction for the next output. The parts you are responsible for filling in with your code are marked with `TODO`.

In [None]:
class GRUMemory(torch.nn.Module):
    """Single-layer GRU followed by a linear decoder.

    The GRU consumes one-hot symbols of width ``N + 1`` and the decoder
    maps every hidden state back to ``N + 1`` logits.
    """

    def __init__(self, hidden_size):
        """Build the network.

        Args:
            hidden_size: width of the GRU hidden state; also the input
                width of the decoder, so both layers must agree on it.
        """
        super().__init__()
        self.gru = torch.nn.GRU(
            input_size=N + 1,
            # Use the requested size instead of a hard-coded 50 so the GRU
            # output always matches the decoder input.
            hidden_size=hidden_size,
            num_layers=1,
            bias=True,
        )
        self.decoder = torch.nn.Linear(
            hidden_size, N + 1,
        )

    def forward(self, x):
        # inputs: x - input tensor of shape (seq_length, N+1)
        # returns:
        # logits (scores for softmax) of shape (seq_length, N + 1)
        hidden_states, _ = self.gru(x)
        logits = self.decoder(hidden_states)
        return logits

    @torch.no_grad()
    def test_run(self, s):
        """Echo a string through the network.

        Accepts one string ``s`` of lowercase characters a-z, one-hot
        encodes it, runs the network, and converts the output back to a
        string of the same length, with 0 mapped to ' ' and 1-26 mapped
        to a-z.
        """
        input_one_hot = letters_to_one_hot(s)
        output_one_hot = self.forward(input_one_hot)
        return one_hot_to_letters(output_one_hot)


In [None]:
def letter_to_num(letter: str) -> int:
    """Map a lowercase letter to its 1-based alphabet position.

    'a' -> 1, ..., 'z' -> 26. Every other character (space, uppercase,
    punctuation, non-ASCII) maps to 0, the "empty" symbol, so the result
    always fits in a one-hot vector of width 27.

    Raises:
        TypeError: if ``letter`` is not a single-character string
            (propagated from ``ord``).
    """
    position = ord(letter) - ord("a") + 1
    return position if 1 <= position <= 26 else 0


def num_to_letter(number: int) -> str:
    """Map a symbol index back to a character.

    The index is offset by 96 and turned into a character; anything that
    sorts before 'a' (index 0 in particular) is rendered as a space.
    """
    candidate = chr(number + 96)
    return candidate if candidate >= "a" else " "


def numbers_to_one_hot(numbers):
    """One-hot encode a sequence (or batch of sequences) of symbol indices.

    Args:
        numbers: list or tensor of indices in ``[0, N]``, either
            ``(seq_length,)`` or ``(batch, seq_length)``.

    Returns:
        float32 tensor of shape ``(seq_length, N + 1)`` or
        ``(batch, seq_length, N + 1)`` respectively.

    Raises:
        ValueError: if ``numbers`` is neither 1- nor 2-dimensional.
    """
    # Force an integer dtype: an empty list would otherwise become a float
    # tensor, which cannot be used as class indices.
    numbers = torch.as_tensor(numbers, dtype=torch.int64)
    if numbers.ndim not in (1, 2):
        raise ValueError(f"Unexpected number of dimensions: {numbers.ndim}")
    # one_hot appends the class dimension last, covering both layouts.
    encoded = torch.nn.functional.one_hot(numbers, num_classes=N + 1)
    return encoded.to(torch.float32)


def one_hot_to_numbers(one_hot):
    """Return the index of the largest entry along the last dimension."""
    return torch.argmax(one_hot, dim=-1)


def letters_to_one_hot(batch):
    """One-hot encode a string, or an iterable of strings.

    A single ``str`` is encoded as one sequence; any other argument is
    treated as a batch and each of its items is encoded as a sequence.
    """
    def encode(text):
        return [letter_to_num(char) for char in text]

    if isinstance(batch, str):
        numbers = encode(batch)
    else:
        numbers = [encode(text) for text in batch]
    return numbers_to_one_hot(numbers)


def one_hot_to_letters(one_hot):
    """Decode one-hot (or logit) vectors back into text.

    Returns a single string for ``(seq_length, N + 1)`` input and a list
    of strings for ``(batch, seq_length, N + 1)`` input; any other rank
    raises ``ValueError``.
    """
    numbers = one_hot_to_numbers(one_hot)

    def decode(indices):
        return "".join(map(num_to_letter, indices.ravel()))

    if numbers.ndim == 1:
        # (seq_length, N + 1) -> (seq_length)
        return decode(numbers)
    if numbers.ndim == 2:
        # (batch, seq_length, N + 1) -> (batch, seq_length)
        return [decode(row) for row in numbers]
    raise ValueError(f"Unexpected number of dimensions: {one_hot.ndim}")


## Training
Below you need to implement the training of the model. We give you more freedom regarding the implementation. The two limitations are that it has to execute within 10 minutes, and that the error rate should be below 1%.

In [None]:
def test_model(model, sequence_length=15):
    """
    Run 500 random lowercase strings (15 to 25 characters each) through
    ``model.test_run`` and return the fraction of characters echoed
    correctly after the global ``DELAY``.

    ``sequence_length`` is accepted but not read by this function.
    """
    lag = DELAY
    n_correct = 0
    n_total = 0
    for _ in range(500):
        length = random.randint(15, 25)
        text = ''.join(
            random.choice(string.ascii_lowercase) for _ in range(length)
        )
        echoed = model.test_run(text)
        assert lag > 0, 's[:-D] won\'t work for D=0'
        # Compare the input without its tail against the output without
        # the leading delay positions.
        n_correct += sum(
            int(expected == got)
            for expected, got in zip(text[:-lag], echoed[lag:])
        )
        n_total += len(text) - lag

    return n_correct / n_total

In [None]:
import time
import matplotlib.pyplot as plt

# Train the fixed-delay echo model one sequence at a time, evaluating
# every 100 iterations and stopping early once accuracy is stable.
start_time = time.time()

model = GRUMemory(hidden_size=50)
optimizer = torch.optim.RMSprop(model.parameters(), lr=3e-4)
criterion = torch.nn.CrossEntropyLoss()
losses = []
accuracies = []

for i, (sequence, result) in enumerate(ds):
    optimizer.zero_grad()
    one_hot_input = numbers_to_one_hot(sequence).to(dtype=torch.float32)
    one_hot_result = numbers_to_one_hot(result).to(dtype=torch.float32)
    prediction = model(one_hot_input)
    loss = criterion(prediction, one_hot_result)
    loss.backward()
    optimizer.step()
    if i % 100 == 0:
        losses.append(loss.item())
        accuracy = test_model(model)
        accuracies.append(accuracy)
        print(f"Iteration {i}, accuracy {accuracy:.3%}", end="\r")
        # The accuracy history only changes here, so the convergence test
        # only needs to run here: stop after five good evaluations in a row.
        if len(accuracies) >= 5 and min(accuracies[-5:]) >= 0.99:
            print(f"Model has converged after {i} iterations")
            break


end_time = time.time()

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
axes[0].plot(losses)
axes[0].set_ylabel('Training Loss')
axes[1].plot(accuracies)
axes[1].set_ylabel('Validation Accuracy')
axes[1].set_xlabel('Iteration / 100')

duration = end_time - start_time
accuracy = test_model(model)
print(f"Duration: {duration / 60:.1f} minutes")
print(f"Accuracy: {accuracy:.3%}")
# The f prefix must sit outside the quotes for the duration to be
# interpolated into the message.
assert duration < 600, f'execution took {duration:.2f} seconds, which longer than 10 mins'
assert accuracy > 0.99, f'accuracy is too low, got {accuracy}, need 0.99'
print('tests passed')

## Variable delay model

Now, to make this more complicated, we want to have a variable delay. So now the goal is to transform a sequence of pairs (character, delay) into a character sequence with the given delay. The delay stays constant within one sequence.

### Dataset
As before, we first implement the dataset:

In [None]:
import math

class VariableDelayEchoDataset(torch.utils.data.IterableDataset):
    """Random sequences paired with a random delay and the delayed echo.

    Each sample is ``(seq, delay, result)``: ``seq`` is an int64 tensor of
    ``seq_length`` random integers in ``[1, N]``, ``delay`` is an int drawn
    uniformly from ``[0, max_delay]``, and ``result`` is ``seq`` shifted
    right by ``delay`` positions with zeros in front.
    """

    def __init__(self, max_delay=8, seq_length=20, size=1000):
        self.max_delay = max_delay
        self.seq_length = seq_length
        self.size = size

    def __len__(self):
        return self.size

    def __iter__(self):
        """Yield this process's share of the ``size`` samples.

        When iterated inside DataLoader workers, the samples are split
        across workers so that all workers together yield exactly
        ``size`` samples.
        """
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            count = self.size
        else:
            # The first `remainder` workers take one extra sample each.
            share, remainder = divmod(self.size, worker_info.num_workers)
            count = share + (1 if worker_info.id < remainder else 0)
        for _ in range(count):
            seq = torch.tensor(
                [random.randint(1, N) for _ in range(self.seq_length)],
                dtype=torch.int64,
            )
            delay = random.randint(0, self.max_delay)
            # Clamp so a delay longer than the sequence still produces a
            # target of exactly seq_length entries (all zeros).
            kept = max(self.seq_length - delay, 0)
            result = torch.zeros(self.seq_length, dtype=torch.int64)
            result[delay:] = seq[:kept]
            yield seq, delay, result

### Model

And the model.

In [None]:
class VariableDelayGRUMemory(torch.nn.Module):
    """GRU echo model whose output delay is supplied with the input.

    The requested delay is one-hot encoded and appended to every time
    step of the input, so the GRU sees ``N + 1 + max_delay + 1`` features
    per step. A linear decoder maps hidden states to ``N + 1`` logits.
    """

    def __init__(self, max_delay, hidden_size=None, num_layers=1):
        """Build the network.

        Args:
            max_delay: largest delay the model can be asked for.
            hidden_size: GRU hidden width; defaults to ``2 * (N + 1)``.
            num_layers: number of stacked GRU layers.
        """
        super().__init__()
        self.max_delay = max_delay
        self.io_size = N + 1
        if hidden_size is None:
            hidden_size = 2 * self.io_size
        self.gru = torch.nn.GRU(
            input_size=self.io_size + self.max_delay + 1,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
        )
        self.decoder = torch.nn.Linear(
            hidden_size, self.io_size,
        )

    def forward(self, x, delay, verbose=False):
        # inputs:
        # x - tensor of shape (seq_length, N + 1) or
        #     (batch_size, seq_length, N + 1)
        # delay - int for a single sequence, or one delay per batch item
        # returns:
        # logits (scores for softmax) with the same leading dimensions as
        # x and N + 1 entries in the last dimension
        #
        # The delay is fed to the network by repeating its one-hot
        # encoding along the sequence and concatenating it to the input
        # as extra features. One-hot encoding seems to work better than
        # concatenating the raw delay value.
        delays = torch.zeros(
            x.size()[:-1] + (self.max_delay + 1,),
            dtype=x.dtype,
            # Create the extra features where x lives so the concatenation
            # below also works for inputs that are not on the default device.
            device=x.device,
        )
        if x.ndim == 2:
            delays[:, delay] = 1
        elif x.ndim == 3:
            batch_index = torch.arange(x.size(0), device=x.device)
            delays[batch_index, :, delay] = 1
        else:
            raise ValueError(f"Unexpected number of dimensions: {x.ndim}")
        x = torch.cat((x, delays), dim=-1)
        if verbose:
            print(delays)
            print(x)
            print()
        hidden_states, _ = self.gru(x)
        logits = self.decoder(hidden_states)
        return logits

    def test_run(self, s, delay, verbose=False):
        """Echo string ``s`` through the network with the given delay.

        ``s`` contains lowercase characters a-z. The result is a string
        of the same length, with 0 mapped to ' ' and 1-26 mapped to a-z.
        """
        input_numbers = [letter_to_num(char) for char in s]
        output_numbers = self.test_run_numbers(input_numbers, delay, verbose)
        return "".join(num_to_letter(num) for num in output_numbers.ravel())

    @torch.no_grad()
    def test_run_numbers(self, numbers, delay, verbose=False):
        """Same as ``test_run`` but on symbol indices instead of text.

        Returns the predicted index for every position, without tracking
        gradients.
        """
        input_one_hot = numbers_to_one_hot(numbers)
        output_one_hot = self.forward(input_one_hot, delay, verbose)
        return one_hot_to_numbers(output_one_hot)
    

### Train

As before, you're free to do what you want, as long as training finishes within 10 minutes and accuracy is above 0.99 for delays between 0 and 8.

In [None]:
def test_variable_delay_model(
    model, max_delay, seq_length=20, size=500, batch_size=64, num_workers=4,
):
    """
    Evaluate ``model`` on freshly generated variable-delay samples and
    return the fraction of positions predicted correctly.

    Every position of the target counts, including the leading zeros.
    The dataset is built with ``model.max_delay``; the ``max_delay``
    argument is accepted but not read.
    """
    loader = torch.utils.data.DataLoader(
        VariableDelayEchoDataset(
            max_delay=model.max_delay, seq_length=seq_length, size=size
        ),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )
    n_correct = 0
    n_total = 0
    for sequence, delay, target in loader:
        predicted = model.test_run_numbers(sequence, delay)
        assert predicted.size() == target.size()
        n_correct += (predicted == target).sum()
        n_total += target.numel()

    return n_correct / n_total

In [None]:
import time
# Train the variable-delay echo model on mini-batches, evaluating every
# 100 batches and stopping early once accuracy is stable.
start_time = time.time()

MAX_DELAY = 8
SEQ_LENGTH = 20
SIZE = 500_000
TEST_SIZE = 100
BATCH_SIZE = 64
NUM_WORKERS = 4

model = VariableDelayGRUMemory(
    max_delay=MAX_DELAY, hidden_size=500, num_layers=2
)
optimizer = torch.optim.RMSprop(model.parameters(), lr=3e-3)
criterion = torch.nn.CrossEntropyLoss()
training_dataset = VariableDelayEchoDataset(
    max_delay=MAX_DELAY, seq_length=SEQ_LENGTH, size=SIZE
)
dataloader = torch.utils.data.DataLoader(
    training_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS,
)
losses = []
accuracies = []

for i, (sequence, delay, result) in enumerate(dataloader):
    optimizer.zero_grad()
    one_hot_input = numbers_to_one_hot(sequence)
    one_hot_result = numbers_to_one_hot(result)
    prediction = model(one_hot_input, delay)
    loss = criterion(prediction, one_hot_result)
    loss.backward()
    optimizer.step()
    if i % 100 == 0:
        losses.append(loss.item())
        # Pass the batch size and worker count explicitly so evaluation
        # follows the constants above rather than the function defaults.
        accuracy = test_variable_delay_model(
            model, max_delay=MAX_DELAY, seq_length=SEQ_LENGTH, size=TEST_SIZE,
            batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
        )
        accuracies.append(accuracy)
        print(f"Batch iteration {i}, accuracy {accuracy:.3%}", end="\r")
    if len(accuracies) >= 5 and min(accuracies[-5:]) >= 0.99:
        print(f"Model has converged after {i} iterations")
        break
else:
    # Reached only when the loop finishes without the early-stop break.
    print()
    print("Dataset exhausted")


end_time = time.time()
duration = end_time - start_time

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
axes[0].plot(losses)
axes[0].set_ylabel('Training Loss')
axes[1].plot(accuracies)
axes[1].set_ylabel('Validation Accuracy')
# Points are recorded once every 100 batches, so label the axis accordingly.
axes[1].set_xlabel('Batch Iteration / 100')

# Evaluate once more so the printed accuracy is the value that is asserted,
# not the last checkpoint from inside the loop.
accuracy = test_variable_delay_model(
    model, max_delay=MAX_DELAY, seq_length=SEQ_LENGTH, size=TEST_SIZE,
    batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
)
print(f"Duration: {duration / 60:.1f} minutes")
print(f"Accuracy: {accuracy:.3%}")
assert duration < 600, 'executing took longer than 10 mins'
assert accuracy > 0.99, 'accuracy is too low'
print('tests passed')

In [None]:
# Spot check: run one string through the model with a delay of 8.
# The returned string is not assigned or printed here.
model.test_run('abcdefghijk', 8)