# Demo 3: Beatles lyrics

In this demo we will train a decoder-only transformer to generate lyrics in the style of The Beatles.
We will do this by training it on a corpus consisting of Beatles lyrics.

In [1]:
import os
import sys
import matplotlib.pyplot as plt
from IPython.display import clear_output
import torch
from torch.utils.data import DataLoader

# flip to True when the notebook runs inside Google Colab
colab = False

if not colab:
    # local run: the project root is the current working directory
    root_dir = os.getcwd()
else:
    # Colab run: mount Google Drive and make the project folder importable
    from google.colab import drive
    drive.mount('/content/drive')
    root_dir = '/content/drive/MyDrive/Projects/Transformer'
    sys.path.append(root_dir)

# custom imports
from transformer import layers
from transformer.layers import Transformer
from transformer.nlp import Vocab, TokenizedDataset, InferenceSampler

# use the first CUDA device when torch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'Device is {device}')

CUDA not available. Using CPU.


We use the custom Vocab class to load the text corpus and initialize the vocabulary according to this corpus. We will utilize a subword tokenization scheme in which we split the corpus at boundaries between word and non-word characters. The Vocab class provides useful methods for converting strings to tokens and vice versa. 

In [2]:
# path of the lyrics corpus, resolved against the project root
filepath = os.path.join(root_dir, 'data/beatles.txt')

# special tokens
# NOTE(review): the ids are negative, presumably so they cannot collide with
# regular token ids — confirm against Vocab
sos = -1   # start of sequence
unk = -3   # unknown subword
pad = -100 # padding

# pattern matching boundary between word and non-word characters
# (zero-width: a position preceded by \w and followed by \W, or the reverse)
pattern = r'(?<=\w)(?=\W)|(?<=\W)(?=\w)'

# build the vocabulary from the corpus file, the special tokens and the split pattern
vocab = Vocab(filepath, sos, unk, pad, pattern)

We can now use the vocab to tokenize the corpus, using the method `tokenize_from_file`:

In [3]:
# tokenize the corpus file, then place the token ids on the device as a long tensor
token_ids = vocab.tokenize_from_file(filepath)  # list
corpus = torch.tensor(token_ids, dtype=torch.long, device=device)  # tensor

print(f'Corpus: {corpus}')

Corpus: tensor([0, 1, 2,  ..., 2, 6, 0])


Let's split our corpus so that 90% goes into the training set and 10% into the test set. We use the custom class TokenizedDataset to define the dataset. This class creates training examples consisting of (input, target) pairs, where input is the same token sequence as target but shifted right by one position to make room for the start-of-sequence (sos) token.

In [4]:
context_window = 128
batch_size = 64

# the first 90% of the tokens are used for training, the remainder for testing
split_idx = round(0.9 * len(corpus))
train_corpus, test_corpus = corpus[:split_idx], corpus[split_idx:]

# both datasets are built with the same context window and special tokens
train_set, test_set = (
    TokenizedDataset(part, context_window, sos, pad, device)
    for part in (train_corpus, test_corpus)
)

# only the training batches are shuffled
train_loader = DataLoader(train_set, batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size)

print(f'Number of training batches: {len(train_loader)}')

Number of training batches: 1167


Next we define the decoder-only transformer. It consists of the following layers:
- embedding and positional encoding
- decoder stack
- linear (de-embedding) layer
- softmax

Each decoder in the decoder stack consists of a masked multihead attention layer followed by a position-wise fully connected two-layer feed forward neural network.

In [5]:
# Build the transformer and move it to the selected device.
# The vocabulary size is taken from the vocab built on the corpus.
# NOTE(review): d_model=512, num_heads=8, num_stacks=6, d_ff=2048 and dropout=0.1
# coincide with the base configuration of "Attention Is All You Need"; the meaning
# of n_pe is not visible here (presumably related to the positional encoding) —
# confirm in transformer.layers.
model = Transformer(
    vocab=len(vocab),
    n_pe=10000,
    d_model=512,
    num_heads=8,
    num_stacks=6,
    d_ff=2048,
    dropout=0.1
).to(device)

Let's see how the untrained model performs. To do this we first define an InferenceSampler object using the model and vocab. The custom InferenceSampler class provides methods for autoregressively generating outputs given a prompt, according to different sampling strategies.

In [6]:
# sampler bound to this model and vocab, using the same context window as the datasets
sampler = InferenceSampler(model, vocab, context_window, device)

The sampler allows for three sampling methods: greedy sampling, top_p sampling, and beam search. For example, greedy sampling simply outputs the most likely next token, given the previous tokens. Let's consider the input prompt 'You and me baby'. Greedy sampling gives the following output:

In [7]:
# continue the prompt with greedy sampling and print the result
prompt = 'You and me baby'
output = sampler.greedy(prompt)
print(output)

captaincaptaincaptainPostmancaptainPostmanokayDown,
PeopleDown,
kneesmirrormirrortriesmirror,
ToweringDrippingfloorDownDrippingDownmirrorasworseworseworsemirrorworseworseworseworseworseworseworseburstdeserveDrippingdeserveangryniceniceDoradoDrippingwhoaDrippingmirrorweektimeworsematterworseThinkwhooLivingPostmanDrippingwhoaDrippingDrippingToweringcrowdlendToweringreturningdeserveworseangrymirrormirror,
Towering,
Working,
WorkingoctopusfloorToweringsorrypineappleFBISoSamedeservewhoobrightPostmantimelendunpleasantBBCgrinningAlthoughcarPostmanfishingreturningfranticcarfollowSoceilingKrishnaDrippingaccidentsAlthoughpreparationDrippingfuseToweringangryDrippingDripping,
justperfectly


Unsurprisingly, the untrained transformer's output doesn't make any sense.

Now let's define the testing and training loops. We will use a cross entropy loss function and an Adam optimizer.

In [10]:
# project-local cross entropy loss; the padding token id is passed as ignore_index
# NOTE(review): swap_dims=True presumably rearranges the output dimensions into the
# layout the loss expects — confirm in transformer.layers
cost_fn = layers.CrossEntropyLoss(ignore_index=pad, swap_dims=True)
# Adam over all model parameters with a learning rate of 1e-3
optim = torch.optim.Adam(model.parameters(), lr=0.001)

def test(model):
    """Return the mean loss of ``model`` over the batches of ``test_loader``.

    The model is switched to eval mode and gradients are disabled while the
    loss is accumulated. It is always switched back to training mode before
    the function exits, even if evaluation raises or is interrupted, because
    this function is called from inside the training loop.

    Args:
        model: module called as ``model(inputs)``; its output is scored
            against the targets with the module-level ``cost_fn``.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: if ``test_loader`` contains no batches.
    """
    num_batches = len(test_loader)
    if num_batches == 0:
        # previously this surfaced as an AttributeError on int.item()
        raise ValueError('test_loader is empty; cannot compute an average loss')

    total_loss = 0.0
    model.eval()
    try:
        with torch.no_grad():
            for inputs, targets in test_loader:
                total_loss += cost_fn(model(inputs), targets).item()
    finally:
        # never leave the model stuck in eval mode
        model.train()

    # return average loss
    return total_loss / num_batches

def train(model, epochs, test_period, starting_patience):
    """Train ``model`` on ``train_loader`` with periodic evaluation and early stopping.

    Every batch is one optimisation step using the module-level ``cost_fn``
    and ``optim``. The test loss is measured with ``test`` on every
    ``test_period``-th batch of an epoch and on the epoch's last batch.
    From epoch 1 onwards, an evaluation resets the patience counter only if
    the test loss is below 99% of the best loss recorded so far; otherwise
    the counter is decremented, and once it drops below zero the weights are
    saved to ``checkpoints/final.pt`` and training stops. A checkpoint named
    ``checkpoints/e{epoch}b{batch}.pt`` is written after each completed epoch.

    Args:
        model: network to optimise; called as ``model(inputs)``.
        epochs: maximum number of passes over ``train_loader``.
        test_period: evaluate on the test set every this many batches.
        starting_patience: value the patience counter is (re)set to.

    Returns:
        tuple: ``(train_losses, test_losses)``, two lists of ``(step, loss)``
        pairs, where ``step`` counts training batches from 0 across epochs.

    Raises:
        ValueError: if ``train_loader`` contains no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # previously this surfaced as a NameError when writing the epoch checkpoint
        raise ValueError('train_loader is empty; nothing to train on')

    # torch.save does not create missing directories
    checkpoint_dir = os.path.join(root_dir, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)

    train_losses = []
    test_losses = []

    # early stopping variables
    patience = starting_patience
    min_loss = float('inf')

    step = -1

    # make sure the first step runs in training mode, whatever mode the
    # caller left the model in
    model.train()

    for epoch in range(epochs):

        for batch, (inputs, targets) in enumerate(train_loader):

            step += 1

            # forward pass and loss
            output = model(inputs)
            loss = cost_fn(output, targets)
            train_losses.append((step, loss.item()))

            # backprop and parameter update
            optim.zero_grad()
            loss.backward()
            optim.step()

            # evaluate periodically and on the last batch of the epoch
            if batch % test_period != 0 and batch != num_batches - 1:
                continue

            test_loss = test(model)
            test_losses.append((step, test_loss))

            # print progress
            clear_output(wait=True)
            print(f'Epoch {epoch}, batch {batch}/{num_batches}. '
                  f'Test loss = {test_loss:.3}')

            # early stopping is only active after epoch 0
            if epoch == 0:
                continue

            if test_loss < 0.99 * min_loss:
                # improvement of more than 1%: remember it and reset patience
                min_loss = test_loss
                patience = starting_patience
                continue

            patience -= 1
            if patience < 0:
                # save checkpoint
                path = os.path.join(checkpoint_dir, 'final.pt')
                torch.save(model.state_dict(), path)

                print('Ran out of patience.')
                return train_losses, test_losses

        # checkpoint at the end of every completed epoch
        path = os.path.join(checkpoint_dir, f'e{epoch}b{batch}.pt')
        torch.save(model.state_dict(), path)

    return train_losses, test_losses
    
# train for at most 5 epochs, evaluating every 10 batches with a patience budget of 20;
# losses is the (train_losses, test_losses) pair returned by train
losses = train(model, epochs=5, test_period=10, starting_patience=20)

KeyboardInterrupt: 

In [None]:
# losses holds (train_losses, test_losses); each is a list of (step, loss) pairs
x_train, y_train = zip(*losses[0])
x_test, y_test = zip(*losses[1])

fig, ax = plt.subplots()

# draw both curves against the shared step axis
for xs, ys, label in (
    (x_train, y_train, 'train loss'),
    (x_test, y_test, 'test loss'),
):
    ax.plot(xs, ys, label=label)

ax.set_title('Average cross entropy loss')
ax.set_xlabel('step')
ax.legend()
plt.show()

In [None]:
# repeat the greedy sample on the same prompt as before training, for comparison
prompt = 'You and me baby'
output = sampler.greedy(prompt)
print(output)

In [None]:
# beam search on the prompt defined in the previous cell
# NOTE(review): the second argument is presumably the beam width — confirm in InferenceSampler
output = sampler.beam_search(prompt, 10)
print(output)