# Advanced Methods in Text Analytics
# Exercise 6: Transformers - Part 2
### Daniel Ruffinelli
## FSS 2025

## Language Models with Transformers

In [1]:
# set device to "cpu" if you don't have a GPU
DEVICE="cpu"

### Question (a)

In [2]:
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
            )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1).to(DEVICE)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    

### Questions (b) to (e)

In [3]:
class TransformerModel(nn.Module):
    def __init__(self, ntoken, d_model, nhead, d_hid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model, nhead, d_hid, dropout
            )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers, nlayers
            )
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

    # mask for language modeling
    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        # change all the zeros to negative infinity and all the ones to zeros 
        # as follows:
        mask = mask.float().masked_fill(
            mask == 0, float('-inf')
            ).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        ### WRITE YOUR CODE HERE ###

        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output


### Questions (e) to (f)

In [19]:
# our whitespace tokenizer that removes punctuation
def tokenize(text):
    """ 
    Given text, returns all words separated by white space after removing all
    punctuation, except full stops.

    Args:
        text: string with text to tokenize

    Returns:
        list of tokens
    """

    import string

    # separate punctuation symbols with whitespaces
    for symbol in string.punctuation:
        text = text.replace(symbol, " " + symbol + " ")
    text_split = text.split()

    return text_split

In [20]:
# as before, we work with embeddings now, not just strings
from collections import defaultdict as ddict

# these are our splits
shakespeare_splits = {
    "train": "shakespeare_train.txt", 
    "valid": "shakespeare_valid.txt", 
    "text": "shakespeare_test.txt"
}

# we create a vocabulary dict of the form {token: ID}
shakespeare_vocab = {}
for text_file in shakespeare_splits.values():
    with open(text_file) as f:
        split_text = f.read()
        tokenized_split = tokenize(split_text)
        for token in tokenized_split:
            if token not in shakespeare_vocab:
                shakespeare_vocab[token] = len(shakespeare_vocab)
# we add the padding symbol to our vocabulary
shakespeare_vocab["<s>"] = len(shakespeare_vocab)
print("Size of vocabulary:", len(shakespeare_vocab))

# we turn our splits into sequences of token IDs
shakespeare_splits_ids = ddict(list)
for split_id, split_file in shakespeare_splits.items():
    with open(split_file) as f:
            tokenized_split = tokenize(f.read())
    for token in tokenized_split:
        shakespeare_splits_ids[split_id].append(shakespeare_vocab[token])

Size of vocabulary: 29245


In [21]:
# our torch dataset object
from torch.utils.data import Dataset

class SelfSupervisedTextDataset(Dataset):

    def __init__(self, tokenized_text, example_length):
        """
        Dataset to process text examples constructed with self-supervision.

        Args:
            tokenized_text (string): list of tokens to construct examples
            example_length (int): length of inputs strings for model
        """

        # we divide tokenized text into subsequences of (equal) example_length
        # we ignore leftover tokens at the end
        self._examples = []

        for i in range(0, len(tokenized_text), example_length):
            self._examples.append(tokenized_text[i:i + example_length])
        if len(self._examples[-1]) < example_length:
               self._examples.pop()

    def __len__(self):
        return len(self._examples)
    
    def __getitem__(self, idx):
        return self._examples[idx]

In [22]:
# create shakespeare dataset
max_input_length = 64
training_dataset = SelfSupervisedTextDataset(shakespeare_splits_ids["train"], 
                                             max_input_length)
print(training_dataset)

<__main__.SelfSupervisedTextDataset object at 0x7fe3c81b8340>


In [23]:
# our collate function
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Function to construct labeled example from given batch.

    Args:
        batch (tensor): tensor of size batch_size x sentence_length with tokens
    """

    # we create two lists for our training examples: inputs and corresponding 
    # targets    
    inputs = []
    targets = []

    for example in batch:
        inputs.append(torch.tensor(example[:-1]))
        targets.append(torch.tensor(example[1:]))

    return torch.stack(inputs, dim=0), torch.stack(targets, dim=0)

In [24]:
# our dataloader for training data
batch_size = 128
training_dataloader = DataLoader(training_dataset, 
                                 collate_fn=collate_fn,
                                 batch_size=batch_size, 
                                 shuffle=True, 
                                 num_workers=0)
print(training_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7fe3c81b3fd0>


In [29]:
# our training loop
import time, math

def train(
        model, dataloader, num_tokens, num_epochs=10, print_batch_stats=False
        ):
    """
    Training loop

    Args:
        model: some LM implemented in PyTorch
        dataloader: dataloader that returns sentences as examples
        num_epochs (int): number of epochs to train 
    """

    # set training hyperparameters
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.1
    optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)

    # set model to train mode
    model.train()

    # we iterate over epochs
    num_batches = len(dataloader)
    for epoch in range(num_epochs):
        total_loss = 0.
        start_time = time.time()

        # we iterate over batches
        for batch_num, (inputs, targets) in enumerate(dataloader):

            # move inputs and targets to device
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)

            # forward pass
            output = model(inputs)
            loss_value = loss_fn(output.view(-1, num_tokens), targets.view(-1))

            # backward pass
            # PyTorch sums up gradients that are computed in sequence
            # so unless we "erase" those gradients after every update, 
            # we will backpropagate through different batches
            # so we set them to zero every time
            optimizer.zero_grad()
            # here we compute gradients
            loss_value.backward()
            # here we update model weights
            optimizer.step()

            total_loss += loss_value.item()

            # log batch stats    
            if print_batch_stats:
                print(f"| Batch {batch_num+1:6d}/{num_batches:6d} "
                      f"| Loss {loss_value:6.4f} "
                      f"| Batch PPL {math.exp(loss_value):8.2f}")
        
        # compute avg loss per batch
        avg_loss = total_loss / num_batches
        
        # compute perplexity where avg loss is likelihood (empirical risk)
        ppl = math.exp(avg_loss)

        # compute epoch time
        epoch_time = time.time() - start_time

        # log epoch stats
        print(
            f"| Epoch {epoch+1:2d}/{num_epochs:2d} | Epoch Time {epoch_time:5f} "
            f"| Avg Loss {avg_loss:6.4f} | PPL {ppl:8.2f}"
        )

        # reset total loss and timer
        total_loss = 0.
        start_time = time.time()


In [87]:
# instantiate your transformer
num_tokens = len(shakespeare_vocab)
token_size = 16 
hidden_size = 16
num_layers = 2
num_heads = 2
transformer_lm = TransformerModel(num_tokens,token_size,num_heads,hidden_size,num_layers).to(DEVICE)
print(transformer_lm)

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=16, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=16, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (encoder): Embedding(29245, 16)
  (decoder): Linear(in_features=16, out_features=29245, bias=True)
)


In [88]:
# train your transformer
train(transformer_lm, training_dataloader, num_tokens, num_epochs=1)

| Epoch  1/ 1 | Epoch Time 5.123611 | Avg Loss 6.4762 | PPL   649.48


In [89]:
# our evaluation loop
def evaluate(model, dataloader, num_tokens, print_batch_stats=False):
    """
    Evaluate model on given dataset.

    Args:
        model: some LM implemented in PyTorch
        dataloader: dataloader that returns sentences as examples
    """

    # we use cross entropy so we can compute perplexity from this
    # we sum loss up, to then divide by number of examples
    loss_fn = nn.CrossEntropyLoss(reduction="sum")

    # set model to eval mode (turns off dropout, etc.)
    model.eval()

    num_batches = len(dataloader)
    num_examples = 0
    total_loss = 0.

    with torch.no_grad():
        # we iterate over batches
        start_time = time.time()
        for batch_num, (inputs, targets) in enumerate(dataloader):

            # move inputs and targets to device
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)

            # add up number of examples
            num_examples += len(inputs)

            # compute loss
            output = model(inputs)
            loss_value = loss_fn(output[:, -1, :], targets.view(-1))

            # add up loss
            total_loss += loss_value.item()

            # log batch stats    
            if print_batch_stats:
                print(f"| Batch {batch_num+1:6d}/{num_batches:6d} "
                        f"| Loss {loss_value:6.4f} "
                        f"| Batch PPL {math.exp(loss_value):8.2f}")
        
        # compute avg loss per batch
        avg_loss = total_loss / num_examples
        
        # compute perplexity where avg loss is likelihood (empirical risk)
        ppl = math.exp(avg_loss)

        # compute epoch time
        total_time = time.time() - start_time

        # log epoch stats
        print(
            f"| Run Time {total_time:5f} "
            f"| Avg Loss {avg_loss:6.4f} "
            f"| PPL {ppl:8.2f}"
        )


In [90]:
# our collate function for validation data
# we use the same as with the FNN and RNN for comparable results
# this one has a single target per input sequence, unlike the one used for
# teacher forcing during training
def validation_collate_fn(batch):
    """
    Function to construct labeled example from given batch.

    Args:
        batch (tensor): tensor of size batch_size x sentence_length with tokens
    """

    # we create two lists for our training examples: inputs and corresponding 
    # targets    
    inputs = []
    targets = []

    for example in batch:
        inputs.append(torch.tensor(example[:-1]))
        targets.append(torch.tensor(example[-1]))

    return torch.stack(inputs, dim=0), torch.stack(targets, dim=0)

In [91]:
# our validation dataset
validation_dataset = SelfSupervisedTextDataset(shakespeare_splits_ids["valid"], 
                                               max_input_length)
print(validation_dataset)

<__main__.SelfSupervisedTextDataset object at 0x7fe3c1bd6df0>


In [92]:
# our dataloader for validation
batch_size = 128 
validation_dataloader = DataLoader(validation_dataset, 
                                 collate_fn=validation_collate_fn,
                                 batch_size=batch_size, 
                                 shuffle=True, 
                                 num_workers=0)

In [93]:
# evaluate transformer
# we use the same dataloader as with our model model, so performance is 
# comparable, but the evaluate function must be able to handle both models
evaluate(transformer_lm, validation_dataloader, num_tokens)

| Run Time 0.073762 | Avg Loss 6.1747 | PPL   480.45


# PPL on Shakespeare's Validation Split

* N-Gram: ~16K (different construction of validation examples, i.e. perhaps not comparable)
* FNN: ~5K
* RNN: ~400
* TF1: ~500 (2 layers, 2 attention heads each)

Traning and evaluation conditions and are the same for the RNN and TF1, so based on these numbers, RNN wins.
But more can and should be done.


### Question (g)

In [110]:
# let's train it for longer to see if it converges
num_tokens = len(shakespeare_vocab)
token_size = 16 
hidden_size = 16
num_layers = 2
num_heads = 2
tf1 = TransformerModel(num_tokens,token_size,num_heads,hidden_size,num_layers).to(DEVICE)
print(tf1)

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=16, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=16, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (encoder): Embedding(29245, 16)
  (decoder): Linear(in_features=16, out_features=29245, bias=True)
)


In [111]:
# train 
train(tf1, training_dataloader, num_tokens, num_epochs=20)

| Epoch  1/20 | Epoch Time 5.098485 | Avg Loss 6.4761 | PPL   649.44
| Epoch  2/20 | Epoch Time 5.067635 | Avg Loss 6.0588 | PPL   427.88
| Epoch  3/20 | Epoch Time 5.197321 | Avg Loss 5.9272 | PPL   375.09
| Epoch  4/20 | Epoch Time 5.199514 | Avg Loss 5.8511 | PPL   347.62
| Epoch  5/20 | Epoch Time 5.200850 | Avg Loss 5.8010 | PPL   330.64
| Epoch  6/20 | Epoch Time 5.218910 | Avg Loss 5.7631 | PPL   318.34
| Epoch  7/20 | Epoch Time 5.123864 | Avg Loss 5.7347 | PPL   309.43
| Epoch  8/20 | Epoch Time 5.054551 | Avg Loss 5.7117 | PPL   302.38
| Epoch  9/20 | Epoch Time 5.040707 | Avg Loss 5.6938 | PPL   297.01
| Epoch 10/20 | Epoch Time 5.039849 | Avg Loss 5.6782 | PPL   292.43
| Epoch 11/20 | Epoch Time 5.029094 | Avg Loss 5.6638 | PPL   288.24
| Epoch 12/20 | Epoch Time 5.067154 | Avg Loss 5.6527 | PPL   285.05
| Epoch 13/20 | Epoch Time 5.078906 | Avg Loss 5.6426 | PPL   282.19
| Epoch 14/20 | Epoch Time 5.056858 | Avg Loss 5.6336 | PPL   279.68
| Epoch 15/20 | Epoch Time 5.03452

In [112]:
# evaluate 
evaluate(tf1, validation_dataloader, num_tokens)

| Run Time 0.039761 | Avg Loss 5.9963 | PPL   401.92


In [113]:
# let's try decreasing depth
num_tokens = len(shakespeare_vocab)
token_size = 16 
hidden_size = 16
num_layers = 1
num_heads = 2
tf2 = TransformerModel(num_tokens,token_size,num_heads,hidden_size,num_layers).to(DEVICE)
print(tf2)

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=16, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=16, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (encoder): Embedding(29245, 16)
  (decoder): Linear(in_features=16, out_features=29245, bias=True)
)


In [114]:
# train 
train(tf2, training_dataloader, num_tokens, num_epochs=20)

| Epoch  1/20 | Epoch Time 4.701346 | Avg Loss 6.4598 | PPL   638.92
| Epoch  2/20 | Epoch Time 4.888820 | Avg Loss 6.0790 | PPL   436.61
| Epoch  3/20 | Epoch Time 4.950058 | Avg Loss 5.9554 | PPL   385.84
| Epoch  4/20 | Epoch Time 4.842011 | Avg Loss 5.8824 | PPL   358.66
| Epoch  5/20 | Epoch Time 4.785655 | Avg Loss 5.8323 | PPL   341.13
| Epoch  6/20 | Epoch Time 4.818069 | Avg Loss 5.7980 | PPL   329.65
| Epoch  7/20 | Epoch Time 4.924874 | Avg Loss 5.7694 | PPL   320.33
| Epoch  8/20 | Epoch Time 4.792286 | Avg Loss 5.7468 | PPL   313.17
| Epoch  9/20 | Epoch Time 4.782013 | Avg Loss 5.7294 | PPL   307.78
| Epoch 10/20 | Epoch Time 4.815236 | Avg Loss 5.7138 | PPL   303.01
| Epoch 11/20 | Epoch Time 4.833667 | Avg Loss 5.6987 | PPL   298.47
| Epoch 12/20 | Epoch Time 4.810474 | Avg Loss 5.6867 | PPL   294.91
| Epoch 13/20 | Epoch Time 4.842332 | Avg Loss 5.6763 | PPL   291.87
| Epoch 14/20 | Epoch Time 5.184415 | Avg Loss 5.6671 | PPL   289.19
| Epoch 15/20 | Epoch Time 5.34312

In [115]:
# evaluate 
evaluate(tf2, validation_dataloader, num_tokens)

| Run Time 0.036302 | Avg Loss 6.0790 | PPL   436.61


In [117]:
# let's try increasing depth
num_tokens = len(shakespeare_vocab)
token_size = 16 
hidden_size = 16
num_layers = 3
num_heads = 2
tf3 = TransformerModel(num_tokens,token_size,num_heads,hidden_size,num_layers).to(DEVICE)
print(tf3)

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=16, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=16, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (encoder): Embedding(29245, 16)
  (decoder): Linear(in_features=16, out_features=29245, bias=True)
)




In [118]:
# train 
train(tf3, training_dataloader, num_tokens, num_epochs=20)

| Epoch  1/20 | Epoch Time 5.400482 | Avg Loss 6.5141 | PPL   674.56
| Epoch  2/20 | Epoch Time 5.381609 | Avg Loss 6.0816 | PPL   437.71
| Epoch  3/20 | Epoch Time 5.315792 | Avg Loss 5.9446 | PPL   381.68
| Epoch  4/20 | Epoch Time 5.308254 | Avg Loss 5.8695 | PPL   354.09
| Epoch  5/20 | Epoch Time 5.294880 | Avg Loss 5.8195 | PPL   336.81
| Epoch  6/20 | Epoch Time 5.505810 | Avg Loss 5.7831 | PPL   324.75
| Epoch  7/20 | Epoch Time 5.493141 | Avg Loss 5.7546 | PPL   315.63
| Epoch  8/20 | Epoch Time 5.524028 | Avg Loss 5.7316 | PPL   308.46
| Epoch  9/20 | Epoch Time 5.367386 | Avg Loss 5.7119 | PPL   302.45
| Epoch 10/20 | Epoch Time 5.316175 | Avg Loss 5.6956 | PPL   297.57
| Epoch 11/20 | Epoch Time 5.361366 | Avg Loss 5.6813 | PPL   293.34
| Epoch 12/20 | Epoch Time 5.316273 | Avg Loss 5.6677 | PPL   289.38
| Epoch 13/20 | Epoch Time 5.427605 | Avg Loss 5.6566 | PPL   286.18
| Epoch 14/20 | Epoch Time 5.351125 | Avg Loss 5.6464 | PPL   283.28
| Epoch 15/20 | Epoch Time 5.36386

In [119]:
# evaluate 
evaluate(tf3, validation_dataloader, num_tokens)

| Run Time 0.045038 | Avg Loss 5.9818 | PPL   396.16


In [128]:
# let's try increasing depth even more
num_tokens = len(shakespeare_vocab)
token_size = 16 
hidden_size = 16
num_layers = 6
num_heads = 2
tf4 = TransformerModel(num_tokens,token_size,num_heads,hidden_size,num_layers).to(DEVICE)
print(tf4)

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=16, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=16, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (encoder): Embedding(29245, 16)
  (decoder): Linear(in_features=16, out_features=29245, bias=True)
)




In [129]:
# train 
train(tf4, training_dataloader, num_tokens, num_epochs=20)

| Epoch  1/20 | Epoch Time 6.254042 | Avg Loss 6.6811 | PPL   797.21
| Epoch  2/20 | Epoch Time 6.256685 | Avg Loss 6.5647 | PPL   709.58
| Epoch  3/20 | Epoch Time 6.228310 | Avg Loss 6.5253 | PPL   682.17
| Epoch  4/20 | Epoch Time 6.303316 | Avg Loss 6.3468 | PPL   570.67
| Epoch  5/20 | Epoch Time 6.451703 | Avg Loss 6.2024 | PPL   493.95
| Epoch  6/20 | Epoch Time 6.476623 | Avg Loss 6.1223 | PPL   455.93
| Epoch  7/20 | Epoch Time 6.303729 | Avg Loss 6.0588 | PPL   427.84
| Epoch  8/20 | Epoch Time 6.258466 | Avg Loss 6.0063 | PPL   405.97
| Epoch  9/20 | Epoch Time 6.419229 | Avg Loss 5.9625 | PPL   388.57
| Epoch 10/20 | Epoch Time 6.438334 | Avg Loss 5.9225 | PPL   373.36
| Epoch 11/20 | Epoch Time 6.477327 | Avg Loss 5.8882 | PPL   360.77
| Epoch 12/20 | Epoch Time 6.480778 | Avg Loss 5.8557 | PPL   349.22
| Epoch 13/20 | Epoch Time 6.481721 | Avg Loss 5.8287 | PPL   339.90
| Epoch 14/20 | Epoch Time 6.456977 | Avg Loss 5.8051 | PPL   331.98
| Epoch 15/20 | Epoch Time 6.40960

In [130]:
# evaluate 
evaluate(tf4, validation_dataloader, num_tokens)

| Run Time 0.056360 | Avg Loss 6.0392 | PPL   419.54


In [135]:
# how about training with a learning rate scheduler
import time, math

def train_with_scheduler(
        model, 
        dataloader, 
        num_tokens, 
        num_epochs=10,
        lr=1.0,
        print_batch_stats=False
        ):
    """
    Training loop

    Args:
        model: some LM implemented in PyTorch
        dataloader: dataloader that returns sentences as examples
        num_epochs (int): number of epochs to train 
        lr (float): learning rate
    """

    # set training hyperparameters
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = lr
    optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    # set model to train mode
    model.train()

    # we iterate over epochs
    num_batches = len(dataloader)
    for epoch in range(num_epochs):
        total_loss = 0.
        start_time = time.time()

        # we iterate over batches
        for batch_num, (inputs, targets) in enumerate(dataloader):

            # move inputs and targets to device
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)

            # get current learning rate
            learning_rate = scheduler.get_last_lr()[0]

            # forward pass
            output = model(inputs)
            loss_value = loss_fn(output.view(-1, num_tokens), targets.view(-1))

            # backward pass
            # PyTorch sums up gradients that are computed in sequence
            # so unless we "erase" those gradients after every update, 
            # we will backpropagate through different batches
            # so we set them to zero every time
            optimizer.zero_grad()
            # here we compute gradients
            loss_value.backward()
            # here we update model weights
            optimizer.step()

            total_loss += loss_value.item()

            # log batch stats    
            if print_batch_stats:
                print(f"| Batch {batch_num+1:6d}/{num_batches:6d} "
                      f"| LR {learning_rate:6.4f} "
                      f"| Loss {loss_value:6.4f} "
                      f"| Batch PPL {math.exp(loss_value):8.2f}")
        
        # compute avg loss per batch
        avg_loss = total_loss / num_batches
        
        # compute perplexity where avg loss is likelihood (empirical risk)
        ppl = math.exp(avg_loss)

        # compute epoch time
        epoch_time = time.time() - start_time

        # log epoch stats
        print(
            f"| Epoch {epoch+1:2d}/{num_epochs:2d} | Epoch Time {epoch_time:5f} "
            f"| Avg Loss {avg_loss:6.4f} | PPL {ppl:8.2f}"
        )

        # reset total loss and timer
        total_loss = 0.
        start_time = time.time()


In [144]:
#  we'll train this one with a learning rate scheduler
m_tokens = len(shakespeare_vocab)
token_size = 16 
hidden_size = 16
num_layers = 3
num_heads = 2
lr = 0.1
tf5 = TransformerModel(num_tokens,token_size,num_heads,hidden_size,num_layers).to(DEVICE)
print(tf5)

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=16, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=16, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (encoder): Embedding(29245, 16)
  (decoder): Linear(in_features=16, out_features=29245, bias=True)
)


In [145]:
# train 
train_with_scheduler(tf5, training_dataloader, num_tokens, num_epochs=20, lr=lr)

| Epoch  1/20 | Epoch Time 5.361673 | Avg Loss 6.5346 | PPL   688.59
| Epoch  2/20 | Epoch Time 5.328326 | Avg Loss 6.1103 | PPL   450.46
| Epoch  3/20 | Epoch Time 5.340834 | Avg Loss 5.9639 | PPL   389.13
| Epoch  4/20 | Epoch Time 5.381404 | Avg Loss 5.8837 | PPL   359.13
| Epoch  5/20 | Epoch Time 5.356963 | Avg Loss 5.8315 | PPL   340.88
| Epoch  6/20 | Epoch Time 5.362206 | Avg Loss 5.7950 | PPL   328.67
| Epoch  7/20 | Epoch Time 5.439713 | Avg Loss 5.7629 | PPL   318.27
| Epoch  8/20 | Epoch Time 5.450390 | Avg Loss 5.7386 | PPL   310.64
| Epoch  9/20 | Epoch Time 5.351116 | Avg Loss 5.7202 | PPL   304.97
| Epoch 10/20 | Epoch Time 5.383507 | Avg Loss 5.7011 | PPL   299.20
| Epoch 11/20 | Epoch Time 5.360246 | Avg Loss 5.6869 | PPL   294.97
| Epoch 12/20 | Epoch Time 5.376559 | Avg Loss 5.6735 | PPL   291.06
| Epoch 13/20 | Epoch Time 5.373191 | Avg Loss 5.6632 | PPL   288.07
| Epoch 14/20 | Epoch Time 5.380653 | Avg Loss 5.6522 | PPL   284.93
| Epoch 15/20 | Epoch Time 5.35445

In [146]:
# evaluate 
evaluate(tf5, validation_dataloader, num_tokens)

| Run Time 0.045606 | Avg Loss 5.9262 | PPL   374.72
