# Demo 3: Beatles lyrics

In this demo we will train a decoder-only transformer to generate lyrics in the style of The Beatles.
We will do this by training it on a corpus consisting of Beatles lyrics.

In [1]:
import os
import sys
import torch
from torch.utils.data import DataLoader

# flip to True when the notebook runs inside Google Colab
colab = False

if not colab:
    # local run: the current working directory is the project root
    root_dir = os.getcwd()
else:
    # Colab run: mount Google Drive and make the project folder importable
    from google.colab import drive
    drive.mount('/content/drive')
    root_dir = '/content/drive/MyDrive/Projects/Transformer'
    if root_dir not in sys.path:
        sys.path.append(root_dir)

# custom imports
from transformer import layers
from transformer.layers import Transformer
from transformer.nlp import Vocab, TokenizedDataset, InferenceSampler

# pick the compute device: the GPU when CUDA is usable, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print('CUDA is available. Using GPU.')
else:
    print('CUDA not available. Using CPU.')

CUDA not available. Using CPU.


We use the custom Vocab class to load the text corpus and initialize the vocabulary according to this corpus. We will utilize a subword tokenization scheme in which we split the corpus at boundaries between word and non-word characters. The Vocab class provides useful methods for converting strings to tokens and vice versa. 

In [2]:
# location of the lyrics corpus, relative to the project root
filepath = os.path.join(root_dir, 'data/beatles.txt')

# ids of the special tokens: start of sequence, unknown subword, padding
sos, unk, pad = -1, -3, -100

# zero-width regex matching every boundary between a word character and a
# non-word character (in either order); the corpus is split at these points
pattern = r'(?<=\w)(?=\W)|(?<=\W)(?=\w)'

# build the vocabulary from the corpus file
vocab = Vocab(filepath, sos, unk, pad, pattern)

We can now use the vocab to tokenize the corpus, using the method `tokenize_from_file`:

In [3]:
# tokenize the corpus file and store the token ids as a long tensor on `device`
corpus = torch.tensor(
    vocab.tokenize_from_file(filepath),  # token ids as a list
    dtype=torch.long,
    device=device,
)

print(f'Corpus: {corpus}')

Corpus: tensor([   0,    1,    2,  ..., 2978,    1,    5])


Let's split our corpus so that 90% goes into the training set and 10% into the test set. We use the custom class TokenizedDataset to define the dataset. This class creates training examples consisting of (input, target) pairs, where input is the same token sequence as target but right shifted by one position to allow space for the start-of-sequence (sos) token.

In [4]:
context_window = 32
batch_size = 32

# the first 90% of the tokens are used for training, the remaining 10% for testing
split_at = round(0.9 * len(corpus))
train_corpus, test_corpus = corpus[:split_at], corpus[split_at:]

# wrap each split in a dataset (training split first, then test split)
train_set, test_set = (
    TokenizedDataset(part, context_window, sos, pad, device)
    for part in (train_corpus, test_corpus)
)

# only the training batches are shuffled
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size)

Next we define the decoder-only transformer. It consists of the following layers:
- embedding and positional encoding
- decoder stack
- linear (de-embedding) layer
- softmax

Each decoder in the decoder stack consists of a masked multihead attention layer followed by a position-wise fully connected two-layer feed forward neural network.

In [5]:
# hyperparameters handed to the Transformer constructor
hparams = {
    'vocab': len(vocab),  # one entry per token in the vocabulary
    'd_model': 8,
    'num_heads': 2,
    'num_stacks': 1,
    'd_ff': 16,
    'dropout': 0.0,
}

# build the model and move it to the selected device
model = Transformer(**hparams).to(device)

Let's see how the untrained model performs. To do this we first define an InferenceSampler object using the model and vocab. The custom InferenceSampler class provides methods for autoregressively generating outputs given a prompt, according to different sampling strategies.

In [6]:
# sampler used to generate text from a prompt; it is given the model, the
# vocab, the context window length and the device
sampler = InferenceSampler(model, vocab, context_window, device)

The sampler allows for three sampling methods: greedy sampling, top_p sampling, and beam search. For example, greedy sampling simply outputs the most likely next token, given the previous tokens. Let's consider the input prompt 'You and me baby'. Greedy sampling gives the following output:

In [7]:
# prompt the (still untrained) model and generate a continuation with greedy sampling
prompt = 'You and me baby'
output = sampler.greedy(prompt)
print(output)

auditiongiveauditionauditionHeadingangelinsteadLizzynoteangelangelHeadingHeadinggivegiveauditiongiveangelawokendoodlethispostcardsanyoneanyoneoutside


In [8]:
# top_p sampling with temp=1 and p=0.0; the output printed for this cell is
# identical to the greedy output.
# NOTE(review): presumably p=0.0 keeps only the single most likely token — confirm in InferenceSampler
output = sampler.top_p(prompt, temp=1, p=0.0)
print(output)

auditiongiveauditionauditionHeadingangelinsteadLizzynoteangelangelHeadingHeadinggivegiveauditiongiveangelawokendoodlethispostcardsanyoneanyoneoutside


In [17]:
# generate a continuation with beam search
# NOTE(review): the second argument (10) is presumably the beam width — confirm in InferenceSampler
output = sampler.beam_search(prompt, 10)
print(output)

carouselsongshouldmeandershortcarouselcarouselpretendstartnotemeansangryphotographsLizzycalldeanyoneangelwhoooofromthisMichelleangelwinks...)



Unsurprisingly the untrained transformer's output doesn't make any sense. 

Now let's define the testing and training loops. We will use a cross entropy loss function and an Adam optimizer.

In [10]:
# loss: the project's cross entropy, with the padding id passed as ignore_index
# NOTE(review): presumably padded target positions are excluded from the loss — confirm in layers.CrossEntropyLoss
cost_fn = layers.CrossEntropyLoss(swap_dims=True, ignore_index=pad)

# optimizer: Adam over all model parameters
optim = torch.optim.Adam(params=model.parameters(), lr=1e-3)

def test(model):
    """Evaluate `model` on the test set and return the loss per token.

    Every batch of the global `test_loader` is run through the model with
    gradient tracking disabled; the global `cost_fn` is summed over the
    batches and divided by the number of tokens in the test corpus.

    The model is put in eval mode for the evaluation. Afterwards training
    mode is switched back on only if the model was in training mode when
    this function was called, so calling it on a model in eval mode no
    longer flips the model into training mode. The mode is restored even if
    evaluation raises.

    Args:
        model: the model to evaluate.

    Returns:
        The summed batch losses divided by `len(test_loader.dataset.corpus)`.
    """
    # remember the caller's mode so that evaluation does not clobber it
    was_training = model.training

    model.eval()
    try:
        total_loss = 0
        with torch.no_grad():
            # `inputs` rather than `input`, to avoid shadowing the builtin
            for inputs, target in test_loader:
                output = model(inputs)
                total_loss += cost_fn(output, target)
    finally:
        if was_training:
            model.train()

    num_tokens = len(test_loader.dataset.corpus)

    # return average loss per token
    # NOTE(review): this is a true per-token average only if cost_fn returns
    # a summed (not mean) loss per batch — confirm in layers.CrossEntropyLoss
    return total_loss / num_tokens

def train(model):
    """Train `model` for one pass over the global `train_loader`.

    For every batch the model output is computed, the global `cost_fn` is
    evaluated against the target, and the global optimizer `optim` takes one
    step. The test loss (see `test`) is printed periodically and after the
    final batch.

    Args:
        model: the model to train; updated in place through `optim`.

    Returns:
        None.
    """
    num_batches = len(train_loader)

    # report about 100 times per pass; clamp to 1 so that a loader with fewer
    # than 100 batches does not produce a period of 0 (modulo by zero below)
    print_period = max(1, num_batches // 100)

    # `inputs` rather than `input`, to avoid shadowing the builtin
    for batch, (inputs, target) in enumerate(train_loader):

        # output
        output = model(inputs)

        # loss
        loss = cost_fn(output, target)

        # backprop (clear the gradients of the previous step first)
        optim.zero_grad()
        loss.backward()

        # step
        optim.step()

        # print progress
        if batch % print_period == 0 or batch == num_batches - 1:
            test_loss = test(model)
            print(f'Batch {batch:3d}. Test loss = {test_loss:5.3}')