### Step 0: Define Vocabularies and Create Datasets

In [14]:
from transformer import Transformer
from transformer import LayerNorm
import torch
import torch.nn as nn
import pytest
import math
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from dataset import RecursionDatasetCreator
from transformer import Tokenizer
from transformer import generate_masks_tokenized
import numpy as np

In [15]:
# Special tokens shared by the recursion (input) and solution (output) vocabularies.
START = '<sos>'
PAD = '<pad>'
END = '<eos>'

# NOTE(review): the two vocabularies list '0' and '9' in opposite order
# ('8', '0', '9' here vs. '8', '9', '0' below), so those two characters sit at
# different list positions in each. Each tokenizer gets its own vocabulary, so
# this may be harmless -- confirm it is intentional.
recursion_vocabulary = [START, 'a', '_', 'n', '+', '1', '=', '*', '/', '-', '^',
                        '2', '3', '4', '5', '6', '7', '8', '0', '9', '(', ')',' ',END, PAD]

solution_vocabulary = [START, 'a', '_', 'n', '+', '1', '=', '*', '/', '-', '^',
                        '2', '3', '4', '5', '6', '7', '8', '9', '0', '(', ')',' ', END, PAD]

# Build the (recursion, solution) dataset from the two text files.
# NOTE(review): the meaning of the positional arguments 100 and 10 is defined by
# RecursionDatasetCreator and is not visible in this notebook -- TODO confirm.
dataset_creator = RecursionDatasetCreator("./recursions.txt", "./solutions.txt", 100, 10, recursion_vocabulary, solution_vocabulary)
recursion_dataset = dataset_creator.create_recursion_dataset()

#Define the input and output tokenizers
recursion_tokenizer = Tokenizer(recursion_vocabulary, START, PAD, END)
solution_tokenizer = Tokenizer(solution_vocabulary, START, PAD, END)

# Look the padding position up by token instead of assuming PAD is the last
# vocabulary entry; the result is identical today (24 for both lists) but no
# longer breaks silently if a vocabulary is reordered or extended.
# Presumably the tokenizer maps each token to its position in the list, as the
# original `len(vocabulary) - 1` computation also assumed -- verify against Tokenizer.
recursion_padding_index = recursion_vocabulary.index(PAD)
solution_padding_index = solution_vocabulary.index(PAD)


4.0


In [16]:
# Training and model hyperparameters, passed positionally to Transformer below.
batch_size = 3
epochs = 1000
d_model = 10
num_heads = 2
chain_length = 2
max_sequence_length = 10
ffn_hidden = 10
drop_prob = 0.1
# The model's vocabulary sizes are taken directly from the vocabulary lists.
output_vocab_size = len(solution_vocabulary)
input_vocab_size = len(recursion_vocabulary)
#Define the transformer model: 
recursion_transformer = Transformer(d_model, num_heads, chain_length, max_sequence_length, ffn_hidden, drop_prob, 
                                    input_vocab_size, output_vocab_size)

# Shuffle the training data every epoch; without it each epoch replays exactly
# the same batches in the same order.
train = DataLoader(recursion_dataset, batch_size=batch_size, shuffle=True)

#### Define the loss function/optimizer

In [17]:
# Per-token cross-entropy: reduction='none' keeps one loss value per target
# position (no averaging here), and positions whose label equals the solution
# padding index are ignored.
criterion = nn.CrossEntropyLoss(
    reduction='none',
    ignore_index=solution_padding_index,
)

# Adam over every parameter of the recursion transformer.
optim = torch.optim.Adam(params=recursion_transformer.parameters(), lr=1e-4)

#### Full Training + Inference is documented here

Training:

The encoder input sentences are tokenized and padded with `start = False` and `end = False`. In essence, the input only has padding tokens appended at its end. The model can still make sense of the input because the embeddings of all non-padding tokens are positionally encoded.

For the decoder, the target sentences are encoded with `start = True` and `end = True` (the latter is less important). The start token matters because inference begins from the start token alone, so the model has to learn how the start token relates to the input sentence in order to begin generating text.

Inference:

The input is encoded with `start = False` and `end = False`, as in training.

The decoder is first given only the start token (`start = True`). Feed the input and the start token through the model and take the prediction at the first position (index 0); this becomes the second token (index 1) of the decoder sentence. Repeat this process, each time appending the next predicted token: if the decoder sentence currently has length k, use the prediction at index k - 1 to infer the next token. Stop when the maximum sequence length is reached or the `<eos>` token is produced.

#### Training Loop

In [11]:
# Train the recursion transformer for `epochs` passes over the `train` DataLoader.
for i in range(epochs):
    
    print("---------------------Epoch: {a}---------------------".format(a = i))
    # Switch back to training mode at the start of every epoch.
    recursion_transformer.train()
    
    # A DataLoader is iterable, so it can be enumerated directly each epoch.
    for batch_num , batch in enumerate(train):

        print("------Batch: {a}------".format(a = batch_num))
        
        recursion_sentences, solution_sentences = batch
        # Encoder input: padded without start/end tokens.
        recursion_tokenized = recursion_tokenizer.tokenize(recursion_sentences)
        recursion_tokenized = recursion_tokenizer.pad(recursion_tokenized, max_sequence_length, start = False, end = False)
        # Decoder input: padded with both start and end tokens.
        solution_tokenized = solution_tokenizer.tokenize(solution_sentences)
        solution_tokenized = solution_tokenizer.pad(solution_tokenized, max_sequence_length, start = True, end = True)

        #generate the masks to prevent look ahead for decoder and comparing similarity with padding tokens
        enc_mask, dec_mask, cross_mask = generate_masks_tokenized(recursion_tokenized, solution_tokenized,
                                        recursion_padding_index, solution_padding_index)
        
        
        optim.zero_grad()
        
        predictions = recursion_transformer(recursion_tokenized, solution_tokenized, enc_mask, dec_mask, cross_mask)
        print(predictions.size())
        
        # Labels are padded without the start token, so presumably label position t
        # holds the token that follows decoder-input position t -- verify against
        # Tokenizer.pad. The sentences are tokenized again rather than reusing the
        # earlier result because it is not visible here whether pad() mutates its input.
        labels = solution_tokenizer.tokenize(solution_sentences)
        labels = solution_tokenizer.pad(labels, max_sequence_length, start = False, end = True)
        
        # Flatten predictions to (tokens, vocab) and labels to (tokens,) for the
        # criterion. reshape() is used instead of view() so a non-contiguous
        # tensor does not raise.
        batch_loss = criterion(predictions.reshape(-1, output_vocab_size), labels.reshape(-1))
        
        #The batch loss is a vector with one loss value per predicted token; positions whose
        #label is the padding token are ignored by the criterion. To get a scalar that can be
        #back-propagated, sum the vector and divide by the number of non-padding labels.
        num_non_padding = (labels != solution_padding_index).sum().item()
        # Guard the division: a batch with only padding labels would otherwise
        # give 0 / 0 = NaN and push NaN gradients into the weights.
        loss = batch_loss.sum() / max(num_non_padding, 1)
        print("-----TOTAL LOSS: {a}-------".format(a = loss))
        loss.backward()
        optim.step()

    


---------------------Epoch: 0---------------------
------Batch: 0------
torch.Size([3, 10])
torch.Size([3, 10])
torch.Size([3, 10, 25])
torch.Size([3, 10])
-----TOTAL LOSS: 3.2341830730438232-------
------Batch: 1------
torch.Size([3, 10])
torch.Size([3, 10])
torch.Size([3, 10, 25])
torch.Size([3, 10])
-----TOTAL LOSS: 3.2328150272369385-------
------Batch: 2------
torch.Size([3, 10])
torch.Size([3, 10])
torch.Size([3, 10, 25])
torch.Size([3, 10])
-----TOTAL LOSS: 3.2370457649230957-------
------Batch: 3------
torch.Size([1, 10])
torch.Size([1, 10])
torch.Size([1, 10, 25])
torch.Size([1, 10])
-----TOTAL LOSS: 3.236971378326416-------
---------------------Epoch: 1---------------------
------Batch: 0------
torch.Size([3, 10])
torch.Size([3, 10])
torch.Size([3, 10, 25])
torch.Size([3, 10])
-----TOTAL LOSS: 3.2318789958953857-------
------Batch: 1------
torch.Size([3, 10])
torch.Size([3, 10])
torch.Size([3, 10, 25])
torch.Size([3, 10])
-----TOTAL LOSS: 3.235668182373047-------
------Batch:

#### Running Inference:

In [18]:
from inference import run_inference

# The training loop leaves the model in train mode, so switch to eval mode to
# disable dropout before decoding. Gradients are not needed for inference.
recursion_transformer.eval()
with torch.no_grad():
    out_sentences = run_inference(["12n4","a_n4","a_n4"], recursion_transformer, recursion_tokenizer, solution_tokenizer)

print(out_sentences)

torch.Size([3, 10])
torch.Size([3, 10])
['/6666)))--', '/6666)))--', '/6666)))--']


#### Beam Search Implemented

In [19]:
from inference import run_beam_search

In [20]:
# Reduced solution vocabulary for a smaller decoder output space.
small_solution_vocabulary = [START, 'a', '_', 'n', ' ', '=', END, PAD]
batch_size = 3
epochs = 1000
d_model = 10
num_heads = 2
chain_length = 2
max_sequence_length = 10
ffn_hidden = 10
drop_prob = 0.1
# These two keep describing the full vocabularies, since the training loop reads
# `output_vocab_size` when flattening the predictions of `recursion_transformer`.
output_vocab_size = len(solution_vocabulary)
input_vocab_size = len(recursion_vocabulary)

# Look the padding index up by token rather than assuming PAD is the last entry.
small_solution_padding_index = small_solution_vocabulary.index(PAD)
small_solution_tokenizer = Tokenizer(small_solution_vocabulary, START, PAD, END)

# The small model must emit logits over the small vocabulary. Sizing it with the
# full solution vocabulary would let it predict token ids that
# `small_solution_tokenizer` does not contain.
small_output_vocab_size = len(small_solution_vocabulary)

small_recursion_transformer = Transformer(d_model, num_heads, chain_length, max_sequence_length, ffn_hidden, drop_prob, 
                                    input_vocab_size, small_output_vocab_size)


In [21]:
# Decode in eval mode (dropout disabled) and without tracking gradients.
recursion_transformer.eval()
with torch.no_grad():
    beam_sentences = run_beam_search(["an+6"], recursion_transformer, recursion_tokenizer, solution_tokenizer, num_beams = 9)

# A bare last expression keeps the rich cell output of the original call.
beam_sentences

torch.Size([1, 10])
torch.Size([1, 10])


['<sos>/<pad><pad><pad><pad><pad><pad><pad><pad><eos>',
 '<sos>/<pad><pad>66-))--',
 '<sos>/<pad><pad>66)))--',
 '<sos>/<pad><pad>66)))--',
 '<sos>/<pad><pad>66-))--',
 '<sos>/<pad><pad>66)-)--',
 '<sos>/<pad><pad>66))---',
 '<sos>/<pad><pad>66))-)-',
 '<sos>/<pad><pad>66-))-)']