# Advanced Methods in Text Analytics
# Exercise 6: Transformers - Part 2
### Daniel Ruffinelli
## FSS 2025

## Language Models with Transformers

### Question (a)

In [None]:
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position encodings to its input, then applies dropout.

    The encodings are precomputed once for positions 0..max_len-1 and stored
    in the (non-trainable) buffer `pe` of shape (max_len, 1, d_model).

    Args:
        d_model (int): size of the last dimension of the inputs; both even and
            odd values are supported
        dropout (float): dropout probability applied after adding encodings
        max_len (int): number of positions for which encodings are precomputed
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        # imported here so the class does not depend on a module-level
        # `import math` having been executed elsewhere in the file
        import math

        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        # even columns hold sines, odd columns hold cosines
        pe[:, 0::2] = torch.sin(angles)
        # for odd d_model there is one odd column fewer than even columns,
        # so we drop the surplus cosine column instead of failing on shapes
        pe[:, 1::2] = torch.cos(angles)[:, :d_model // 2]
        # insert a singleton dim 1 so the encodings broadcast over dim 1 of x
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: tensor whose dim 0 indexes positions and whose last dim has
                size d_model; encodings are broadcast over dim 1

        Returns:
            tensor of the same shape as x
        """
        # only the first x.size(0) positions are needed
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

### Question (b) to (d)

In [None]:
class TransformerModel(nn.Module):
    """
    Skeleton of a transformer-encoder language model (exercise template).

    The constructor wires up a token embedding (`encoder`), positional
    encodings (`pos_encoder`), a stack of `nlayers` encoder layers
    (`transformer_encoder`) and a linear output layer (`decoder`) that maps
    d_model features to ntoken scores. `forward` currently only prepares the
    language-modeling mask; the rest is left to be implemented.

    Args:
        ntoken (int): vocabulary size
        d_model (int): embedding size
        nhead (int): number of attention heads per encoder layer
        d_hid (int): third positional argument of nn.TransformerEncoderLayer
            (its feed-forward dimension)
        nlayers (int): number of encoder layers
        dropout (float): dropout used by the positional encoding and the
            encoder layers
    """

    def __init__(self, ntoken, d_model, nhead, d_hid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # mask is built lazily in forward and reused while the size matches
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        single_layer = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(single_layer, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

    # mask for language modeling
    def _generate_square_subsequent_mask(self, sz):
        """
        Returns a (sz, sz) float mask with 0.0 on and below the diagonal and
        -inf strictly above it.
        """
        blocked = torch.full((sz, sz), float('-inf'))
        # keep -inf only strictly above the diagonal, everything else becomes 0
        return torch.triu(blocked, diagonal=1)

    def forward(self, src):
        # NOTE(review): len(src) is the size of dim 0 of src; the collate
        # functions in this file stack examples along dim 0, so confirm that
        # dim 0 really is the sequence dimension here
        first_dim = len(src)
        mask_is_stale = self.src_mask is None or self.src_mask.size(0) != first_dim
        if mask_is_stale:
            self.src_mask = self._generate_square_subsequent_mask(first_dim).to(src.device)

        ### WRITE YOUR CODE HERE ###


### Question (e) to (g)

In [None]:
# our whitespace tokenizer that keeps punctuation symbols as separate tokens
def tokenize(text):
    """
    Given text, returns all whitespace-separated words, with every punctuation
    symbol (any character in string.punctuation) split off as its own token.
    Punctuation is kept, not removed.

    Example: "Hello, world." -> ["Hello", ",", "world", "."]

    Args:
        text: string with text to tokenize

    Returns:
        list of tokens
    """

    import string

    # surround every punctuation symbol with whitespace in a single pass over
    # the text, so that splitting on whitespace isolates each symbol
    spaced_punctuation = {
        ord(symbol): " " + symbol + " " for symbol in string.punctuation
    }

    return text.translate(spaced_punctuation).split()

In [None]:
# as before, we work with embeddings now, not just strings
from collections import defaultdict as ddict

# these are our splits, of the form {split name: file with the split's text}
shakespeare_splits = {
    "train": "shakespeare_train.txt",
    "valid": "shakespeare_valid.txt",
    "test": "shakespeare_test.txt",
}

# we create a vocabulary dict of the form {token: ID}; IDs are assigned in
# order of first appearance while going through the splits in the order above
shakespeare_vocab = {}
# at the same time, we turn our splits into sequences of token IDs, so every
# file is read and tokenized only once
shakespeare_splits_ids = ddict(list)
for split_id, split_file in shakespeare_splits.items():
    with open(split_file) as f:
        tokenized_split = tokenize(f.read())
    for token in tokenized_split:
        # setdefault returns the existing ID, or registers the next free one
        token_id = shakespeare_vocab.setdefault(token, len(shakespeare_vocab))
        shakespeare_splits_ids[split_id].append(token_id)
# we add the padding symbol to our vocabulary
shakespeare_vocab["<s>"] = len(shakespeare_vocab)
print("Size of vocabulary:", len(shakespeare_vocab))

In [None]:
# our torch dataset object
from torch.utils.data import Dataset

class SelfSupervisedTextDataset(Dataset):
    """
    Dataset of fixed-length, non-overlapping examples cut from one long
    sequence of tokens.
    """

    def __init__(self, tokenized_text, example_length):
        """
        Dataset to process text examples constructed with self-supervision.

        Args:
            tokenized_text (sequence): tokens (or token IDs) to construct
                examples from; may be empty or shorter than example_length,
                in which case the dataset has no examples
            example_length (int): length of each example, must be >= 1

        Raises:
            ValueError: if example_length is smaller than 1
        """

        if example_length < 1:
            raise ValueError(
                f"example_length must be at least 1, got {example_length}"
            )

        # we divide tokenized text into subsequences of (equal) example_length
        # we only start an example where a full one still fits, so leftover
        # tokens at the end are ignored
        last_start = len(tokenized_text) - example_length
        self._examples = [
            tokenized_text[start:start + example_length]
            for start in range(0, last_start + 1, example_length)
        ]

    def __len__(self):
        return len(self._examples)

    def __getitem__(self, idx):
        return self._examples[idx]

    def __repr__(self):
        return f"{type(self).__name__}(num_examples={len(self)})"

In [None]:
# create the training dataset from the token IDs of the "train" split,
# cut into examples of max_input_length tokens each
max_input_length = 64
training_dataset = SelfSupervisedTextDataset(
    tokenized_text=shakespeare_splits_ids["train"],
    example_length=max_input_length,
)
print(training_dataset)

In [None]:
# our collate function
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Function to construct labeled examples from given batch.

    Every example contributes one input sequence (all tokens but the last)
    and one target sequence (all tokens but the first), i.e. the target at
    each position is the token that follows in the example.

    Args:
        batch: sequence of examples, each an equally long sequence of token IDs

    Returns:
        tuple (inputs, targets) of tensors, each with one row per example and
        one column fewer than the example length
    """

    # inputs drop the last token of every example
    inputs = torch.stack([torch.tensor(tokens[:-1]) for tokens in batch], dim=0)
    # targets drop the first token, so they are the inputs shifted by one
    targets = torch.stack([torch.tensor(tokens[1:]) for tokens in batch], dim=0)

    return inputs, targets

In [None]:
# our dataloader for training data: shuffled batches of examples that are
# turned into (inputs, targets) pairs by collate_fn
batch_size = 128
training_dataloader = DataLoader(
    dataset=training_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn,
)
print(training_dataloader)

In [None]:
# our training loop
import time, math

def train(model, dataloader, num_tokens, num_epochs=10, print_batch_stats=False):
    """
    Training loop

    Trains the model with cross entropy and Adagrad (learning rate 0.1) and
    prints average loss and perplexity after every epoch.

    Args:
        model: some LM implemented in PyTorch; its output for a batch must
            have num_tokens as size of the last dimension and one score
            vector per target token
        dataloader: dataloader that returns (inputs, targets) batches
        num_tokens (int): size of the vocabulary
        num_epochs (int): number of epochs to train
        print_batch_stats (bool): whether to also print loss and perplexity
            of every batch

    Raises:
        ValueError: if the dataloader has no batches
    """

    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader has no batches, nothing to train on")

    # set training hyperparameters
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.1
    optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)

    # set model to train mode
    model.train()

    # we iterate over epochs
    for epoch in range(num_epochs):
        total_loss = 0.
        start_time = time.time()

        # we iterate over batches
        for batch_num, (inputs, targets) in enumerate(dataloader):

            # forward pass
            output = model(inputs)
            # reshape (unlike view) also handles model outputs that are not
            # contiguous in memory, e.g. after a transpose
            loss_value = loss_fn(output.reshape(-1, num_tokens), targets.reshape(-1))

            # backward pass
            # PyTorch sums up gradients that are computed in sequence
            # so unless we "erase" those gradients after every update,
            # we will backpropagate through different batches
            # so we set them to zero every time
            optimizer.zero_grad()
            # here we compute gradients
            loss_value.backward()
            # here we update model weights
            optimizer.step()

            batch_loss = loss_value.item()
            total_loss += batch_loss

            # log batch stats
            if print_batch_stats:
                print(f"| Batch {batch_num+1:6d}/{num_batches:6d} "
                      f"| Loss {batch_loss:6.4f} "
                      f"| Batch PPL {math.exp(batch_loss):8.2f}")

        # compute avg loss per batch
        avg_loss = total_loss / num_batches

        # compute perplexity where avg loss is likelihood (empirical risk)
        ppl = math.exp(avg_loss)

        # compute epoch time
        epoch_time = time.time() - start_time

        # log epoch stats
        print(
            f"| Epoch {epoch+1:2d}/{num_epochs:2d} | Epoch Time {epoch_time:5f} "
            f"| Avg Loss {avg_loss:6.4f} | PPL {ppl:8.2f}"
        )


In [None]:
# instantiate your transformer: a small model with 2 layers and 2 heads,
# embeddings of size 16 and 16 as hidden size of the encoder layers
num_tokens = len(shakespeare_vocab)
token_size = 16
hidden_size = 16
num_layers = 2
num_heads = 2
transformer_lm = TransformerModel(
    ntoken=num_tokens,
    d_model=token_size,
    nhead=num_heads,
    d_hid=hidden_size,
    nlayers=num_layers,
)
print(transformer_lm)

In [None]:
# train your transformer for a single epoch on the training data
train(
    model=transformer_lm,
    dataloader=training_dataloader,
    num_tokens=num_tokens,
    num_epochs=1,
)

In [None]:
# our evaluation loop
def evaluate(model, dataloader, num_tokens, print_batch_stats=False):
    """
    Evaluate model on given dataset.

    Only the model's output at the last position of every input sequence is
    scored against the single target of that sequence. Prints run time,
    average loss per example and the corresponding perplexity.

    Args:
        model: some LM implemented in PyTorch; its output is indexed as
            output[:, -1, :] to get one score vector per example
        dataloader: dataloader that returns (inputs, targets) batches with a
            single target per input sequence
        num_tokens (int): size of the vocabulary (not used here, kept so the
            signature matches the training loop)
        print_batch_stats (bool): whether to also print average loss and
            perplexity of every batch

    Raises:
        ValueError: if the dataloader yields no examples
    """

    # we use cross entropy so we can compute perplexity from this
    # we sum loss up, to then divide by number of examples
    loss_fn = nn.CrossEntropyLoss(reduction="sum")

    # set model to eval mode (turns off dropout, etc.)
    model.eval()

    num_batches = len(dataloader)
    num_examples = 0
    total_loss = 0.

    with torch.no_grad():
        # we iterate over batches
        start_time = time.time()
        for batch_num, (inputs, targets) in enumerate(dataloader):

            # add up number of examples
            batch_examples = len(inputs)
            num_examples += batch_examples

            # compute loss (summed over the examples of the batch)
            output = model(inputs)
            loss_value = loss_fn(output[:, -1, :], targets.reshape(-1))

            # add up loss
            batch_loss = loss_value.item()
            total_loss += batch_loss

            # log batch stats
            if print_batch_stats:
                # the loss is a sum over the batch, so we average it per
                # example first: perplexity is the exponential of the average
                # loss, and the exponential of the sum would overflow
                batch_avg_loss = batch_loss / batch_examples
                print(f"| Batch {batch_num+1:6d}/{num_batches:6d} "
                        f"| Loss {batch_avg_loss:6.4f} "
                        f"| Batch PPL {math.exp(batch_avg_loss):8.2f}")

        if num_examples == 0:
            raise ValueError("dataloader yielded no examples, nothing to evaluate")

        # compute avg loss per example
        avg_loss = total_loss / num_examples

        # compute perplexity where avg loss is likelihood (empirical risk)
        ppl = math.exp(avg_loss)

        # compute total run time
        total_time = time.time() - start_time

        # log run stats
        print(
            f"| Run Time {total_time:5f} "
            f"| Avg Loss {avg_loss:6.4f} "
            f"| PPL {ppl:8.2f}"
        )


In [None]:
# our collate function for validation data
# we use the same as with the FNN and RNN for comparable results
# this one has a single target per input sequence, unlike the one used for
# teacher forcing during training
def validation_collate_fn(batch):
    """
    Function to construct labeled examples from given batch.

    Every example contributes one input sequence (all tokens but the last)
    and a single target (the last token).

    Args:
        batch: sequence of examples, each an equally long sequence of token IDs

    Returns:
        tuple (inputs, targets) of tensors: inputs has one row per example,
        targets has one entry per example
    """

    # inputs drop the last token of every example
    inputs = torch.stack([torch.tensor(tokens[:-1]) for tokens in batch], dim=0)
    # the dropped last token is the only target of the example
    targets = torch.stack([torch.tensor(tokens[-1]) for tokens in batch], dim=0)

    return inputs, targets

In [None]:
# our validation dataset, built from the token IDs of the "valid" split with
# the same example length as the training dataset
validation_dataset = SelfSupervisedTextDataset(
    tokenized_text=shakespeare_splits_ids["valid"],
    example_length=max_input_length,
)
print(validation_dataset)

In [None]:
# our dataloader for validation: batches are turned into (inputs, target)
# pairs with a single target per input by validation_collate_fn
batch_size = 128
validation_dataloader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=validation_collate_fn,
)

In [None]:
# evaluate transformer
# we use the validation dataloader with a single target per input sequence,
# so performance is comparable across models, but the evaluate function must
# be able to handle the outputs of each of them
evaluate(
    model=transformer_lm,
    dataloader=validation_dataloader,
    num_tokens=num_tokens,
)