# RNN
While our previous model is training let's look into RNNs which seem like they would be a better fit for our problem.

Working off of https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from trainer.dataset import CrosswordClueAnswersDataset

PADDING_TOKEN_INDEX = 0
PAD_TO_SIZE = 10

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input):
        h0 = Variable(torch.zeros(1, input.size(0), self.hidden_size))
        output, hn = self.rnn(input, h0)
        output = self.fc(output[:, -1, :]) 
        return self.softmax(output)

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
        
def run(args):
    device = 'cpu'
    print(f'Running on:', device)

    # load data, split datasets, build vocabs
    #TODO: Parameterize this
    dataset = CrosswordClueAnswersDataset("cleaned_data/no_dupes_10_or_less_tokens.csv")
    train_size = int(0.01 * len(dataset))
    dev_size = int(0.001 * len(dataset))
    test_size = len(dataset) - train_size - dev_size
    g = torch.Generator().manual_seed(42) # this manual_seed is important to ensure that we consistently split the dataset
    train_dataset, test_dataset, dev_dataset = torch.utils.data.random_split(dataset, [train_size, test_size, dev_size], generator=g)

    # build vocab onlt off of training data (for now...)
    tokenizer = get_tokenizer('basic_english')

    clues_iter = map(lambda data: tokenizer(data[1]), train_dataset)
    answers_iter = map(lambda data: tokenizer(data[0]), train_dataset)
    
    clues_vocab = build_vocab_from_iterator(clues_iter, specials=['<pad>', '<unk>'])
    clues_vocab.set_default_index(1)

    answers_vocab = build_vocab_from_iterator(answers_iter, specials=['<unk>'])
    answers_vocab.set_default_index(0)

    print(f'{len(dataset)=}\n{len(train_dataset)=}\n{len(test_dataset)=}\n{len(dev_dataset)=}')
    print(f'{len(answers_vocab)=}\n{len(clues_vocab)=}')

    def collate_batch(batch):
        answer_list, clue_list = [], []

        for (answer, clue) in batch:
            clue_indicies = clues_vocab(tokenizer(clue))
            clue_indicies += [PADDING_TOKEN_INDEX] * (PAD_TO_SIZE - len(clue_indicies))
            clue_list.append(clue_indicies)

            answer_list.append(answers_vocab([answer])[0])

        answer_list = torch.tensor(answer_list).to(device)
        clue_list = torch.tensor(clue_list).to(device)

        return answer_list, clue_list

    # shuffle the training dataloader so we go through different batches each time
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_batch)
    dev_dataloader = DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_batch) 

    print(f'{len(train_dataloader)=}\n{len(dev_dataloader)=}')

    n_hidden = 128
    input_size = len(clues_vocab)
    n_categories = len(answers_vocab)
    rnn = RNN(input_size, n_hidden, n_categories)

    data_item = next(iter(train_dataloader))
    print(f'{data_item=}')
    clue = data_item[1]

    # Turn a clue into a <line_length x 1 x n_letters>,
    # or an array of one-hot letter vectors
    def clue_to_tensor(clue):
        tensor = torch.zeros(len(clue), 1, len(clues_vocab))
        for li, token_index in enumerate(clue):
            tensor[li][0][token_index] = 1
        return tensor

    learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn
    criterion = nn.NLLLoss()

    def train(answer_tensor, clue_tensor, optimizer):
        rnn.zero_grad()
        output = rnn(clue_tensor)
        loss = criterion(output, answer_tensor)
        loss.backward()

        optimizer.step()

        # Add parameters' gradients to their values, multiplied by learning rate
        # for p in rnn.parameters():
        #     p.data.add_(p.grad.data, alpha=-learning_rate)

        return output, loss.item()

    import time
    import math

    epochs = 10
    print_every = 1000
    plot_every = 100

    # Keep track of losses for plotting
    current_loss = 0
    all_losses = []

    optimizer = torch.optim.AdamW(rnn.parameters())

    def timeSince(since):
        now = time.time()
        s = now - since
        m = math.floor(s / 60)
        s -= m * 60
        return '%dm %ds' % (m, s)
    
    def categoryFromOutput(output):
        top_n, top_i = output.topk(1)
        category_i = top_i[0].item()
        return category_i

    start = time.time()

    for i in range(epochs):
        for idx, (answer, clue) in enumerate(train_dataloader):
            clue_tensor = clue_to_tensor(clue[0])
            output, loss = train(answer, clue_tensor, optimizer)
            current_loss += loss

            # Print iter number, loss, name and guess
            if idx % print_every == 0:
                guess_i = categoryFromOutput(output)
                correct = '✓' if guess_i == answer else '✗ (%s)' % answer
                print('%d %d%% (%s) %.4f / %s' % (idx, idx / len(train_dataloader) * 100, timeSince(start), loss, correct))

            # Add current loss avg to list of losses
            if idx % plot_every == 0:
                all_losses.append(current_loss / plot_every)
                current_loss = 0

    

    
    # print(one_hot[0][41])
    
    # answer = data_item[0]
    # print(f'{clue=}')
    # print(f'{answer=}')

    
    # output, next_hidden = rnn(one_hot, hidden)
    # print(f'{output.shape=}')
    # print(f'{next_hidden.shape=}')


    # model = SimpleCrosswordModel(
    #     vocab_size=len(clues_vocab),
    #     embed_dim=args.embedding_dimensions,
    #     input_size=PAD_TO_SIZE,
    #     hidden_size=args.hidden_layer_size,
    #     output_size=len(answers_vocab),
    #     device=device,
    #     hidden_depth=args.hidden_depth)

    # trainable_model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    # params_count = sum(p.numel() for p in model.parameters())
    # trainable_params_count = sum(p.numel() for p in trainable_model_parameters)
    # print(f'{params_count=}\n{trainable_params_count=}')

    # criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

    # train_dir = os.path.join('training_results', args.output_folder)
    # if not os.path.exists(train_dir):
    #     os.makedirs(train_dir)
    # print(f'Outputting to: {train_dir}')

    # log_interval = int(len(train_dataloader) / 5)
    # reporter = TrainingReporter(train_dir, log_interval)
    # training = Trainer(model=model, criterion=criterion, optimizer=optimizer, reporter=reporter, output_dir=train_dir)
    # training.start(num_epochs=args.num_epochs, train_dataloader=train_dataloader, test_dataloader=dev_dataloader)


In [10]:
from argparse import Namespace
args = Namespace(batch_size=1)
run(args)

Running on: cpu
len(dataset)=538925
len(train_dataset)=5389
len(test_dataset)=532998
len(dev_dataset)=538
len(answers_vocab)=4607
len(clues_vocab)=6848
len(train_dataloader)=5389
len(dev_dataloader)=538
data_item=(tensor([379]), tensor([[  48,   13, 1644,  975,    0,    0,    0,    0,    0,    0]]))


ValueError: Expected input batch_size (10) to match target batch_size (1).