# Chapter 5: Full Code Implementation for Exercises

Sarah Marvin

### Exercise 5.1:

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import tiktoken

# Define a small vocabulary
vocab = {
    "closer": 0, "every": 1, "effort": 2, "forward": 3,
    "inches": 4, "moves": 5, "pizza": 6, "toward": 7, "you": 8
}
inverse_vocab = {v: k for k, v in vocab.items()}

# Example logits for the next token prediction
next_token_logits = torch.tensor([4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79])

# Temperature-scaled softmax function
def softmax_with_temperature(logits, temperature):
    """
    Applies temperature scaling to the logits and computes softmax.
    """
    scaled_logits = logits / temperature
    return torch.softmax(scaled_logits, dim=0)

# Sampling function to print sampled tokens based on probabilities
def print_sampled_tokens(probas):
    """
    Sample tokens from the probability distribution and print their frequencies.
    """
    torch.manual_seed(123)  # Ensure reproducibility
    sample = [torch.multinomial(probas, num_samples=1).item() for _ in range(1000)]
    sampled_ids = torch.bincount(torch.tensor(sample))
    for i, freq in enumerate(sampled_ids):
        print(f"{freq} x {inverse_vocab[i]}")

# Set of temperatures to experiment with
temperatures = [1, 0.1, 5]  # Normal, lower randomness, higher randomness
scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]

# Print sampled tokens for each temperature
for i, probas in enumerate(scaled_probas):
    print(f"Temperature: {temperatures[i]}")
    print_sampled_tokens(probas)
    print("\n")

# Check the actual probability of "pizza" at temperature 5
temp5_idx = 2
pizza_idx = 6
pizza_prob = scaled_probas[temp5_idx][pizza_idx]
print(f"Actual probability of 'pizza': {pizza_prob:.4f}")


Temperature: 1
71 x closer
2 x every
0 x effort
544 x forward
2 x inches
1 x moves
0 x pizza
376 x toward
4 x you


Temperature: 0.1
0 x closer
0 x every
0 x effort
992 x forward
0 x inches
0 x moves
0 x pizza
8 x toward


Temperature: 5
153 x closer
68 x every
55 x effort
223 x forward
102 x inches
50 x moves
43 x pizza
218 x toward
88 x you


Actual probability of 'pizza': 0.0430


### Exercise 5.2:

In [34]:
# Top-k sampling function
def apply_top_k_sampling(logits, top_k):
    """
    Applies top-k sampling to logits to limit the options to the top-k probable tokens.
    """
    values, indices = torch.topk(logits, top_k)
    probs = torch.softmax(values, dim=0)
    return torch.zeros_like(logits).scatter_(0, indices, probs)

# Apply top-k sampling and temperature scaling
logits = apply_top_k_sampling(next_token_logits, top_k=3)  # Using top-3 tokens
logits = softmax_with_temperature(logits, temperature=1.0)
print_sampled_tokens(logits)


111 x closer
85 x every
117 x effort
169 x forward
106 x inches
94 x moves
93 x pizza
144 x toward
81 x you


### Exercise 5.3:


In [36]:
import torch

# Dummy logits for demonstration (replace with your actual logits)
next_token_logits = torch.tensor([4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79])

# Softmax with temperature and clipping for numerical stability
def softmax_with_temperature(logits, temperature):
    """
    Apply temperature scaling to logits and compute softmax with numerical stability.
    """
    logits = logits / temperature if temperature != 0 else logits
    # Clip logits to avoid extreme values
    logits = torch.clamp(logits, min=-100, max=100)
    return torch.softmax(logits, dim=-1)

def apply_top_k_sampling(logits, top_k=1):
    """
    Apply top-k sampling to logits to keep only the top-k most probable tokens.
    """
    if top_k > 1:
        values, indices = torch.topk(logits, top_k)
        logits = torch.zeros_like(logits).scatter_(-1, indices, values)
    return logits

def print_sampled_tokens(probas):
    """
    Sample tokens from the probability distribution and print their frequencies.
    """
    torch.manual_seed(123)  # Ensure reproducibility
    if torch.any(torch.isnan(probas)) or torch.any(torch.isinf(probas)):
        print("Invalid probability values found!")
        return

    sample = [torch.multinomial(probas, num_samples=1).item() for _ in range(1000)]
    sampled_ids = torch.bincount(torch.tensor(sample))
    for i, freq in enumerate(sampled_ids):
        print(f"{freq} x Token {i}")

# Set top_k to 1 and temperature to 0 for deterministic output
def generate_deterministic_text(logits):
    """
    Generate deterministic text by setting top_k=1 and temperature=0.
    """
    logits = apply_top_k_sampling(logits, top_k=1)  # Choose the most probable token
    logits = softmax_with_temperature(logits, temperature=0.0)  # Set temperature to 0 for deterministic behavior
    print_sampled_tokens(logits)

# Call the function with the sample logits
generate_deterministic_text(next_token_logits)

71 x Token 0
2 x Token 1
0 x Token 2
544 x Token 3
2 x Token 4
1 x Token 5
0 x Token 6
376 x Token 7
4 x Token 8


### Exercise 5.4:

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


# Define GPT model (simplified version)
class GPTModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, seq_length):
        super(GPTModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)  # Embedding layer
        self.transformer = nn.Transformer(d_model=hidden_size, nhead=8, num_encoder_layers=num_layers)
        self.fc_out = nn.Linear(hidden_size, vocab_size)  # Output layer

    def forward(self, src, tgt=None):
        """
        Forward pass through the GPT model.
        Args:
            src: Source sequence (input tokens)
            tgt: Target sequence (output tokens, for training purposes)
        """
        # Embedding source and target sequences
        src = self.embed(src)
        if tgt is not None:
            tgt = self.embed(tgt)

        # Ensure the shape (seq_len, batch_size, embed_dim) for transformer
        src = src.permute(1, 0, 2)  # (batch_size, seq_len, embed_dim) -> (seq_len, batch_size, embed_dim)
        if tgt is not None:
            tgt = tgt.permute(1, 0, 2)  # Same for target

        # Pass through the transformer (encoder-decoder)
        if tgt is not None:
            output = self.transformer(src, tgt)  # Training mode, both src and tgt provided
        else:
            output = self.transformer(src)  # Generation mode, only src is provided

        # Convert back to (batch_size, seq_len, hidden_size)
        output = output.permute(1, 0, 2)

        # Output layer to map to vocab size
        logits = self.fc_out(output)
        return logits

    def generate_text(self, input_ids, max_length=50):
        """
        Generate text given an initial sequence of tokens (input_ids).
        """
        self.eval()  # Set model to evaluation mode
        generated = input_ids
        for _ in range(max_length):
            logits = self(generated, generated)  # Using the generated text as input and target
            next_token = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(-1)
            generated = torch.cat((generated, next_token), dim=-1)
        return generated


# Create mock input and output tensors for model training
input_tensor = torch.tensor([[0, 1, 2, 3, 4], [1, 2, 3, 4, 5]], dtype=torch.long)
output_tensor = torch.tensor([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]], dtype=torch.long)  # Dummy target tensor

# Model training loop
model = GPTModel(vocab_size=9, embed_size=128, hidden_size=128, num_layers=2, seq_length=5)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Example training loop
for epoch in range(5):
    model.train()
    optimizer.zero_grad()
    output = model(input_tensor, output_tensor)  # Pass both source and target
    loss = criterion(output.view(-1, 9), output_tensor.view(-1))  # Flatten and compute loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

### Exercise 5.5:

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained GPT-2 model and tokenizer from Hugging Face
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set pad_token_id explicitly, as GPT2 does not have a pad token by default
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
model.config.pad_token_id = tokenizer.pad_token_id  # Update model config with pad_token_id

# Tokenize input text
start_context = "Every effort moves you"
input_ids = tokenizer.encode(start_context, return_tensors="pt", padding=True, truncation=True, max_length=50)

# Generate the attention mask (1 for actual tokens, 0 for padding tokens)
attention_mask = torch.ones(input_ids.shape, device=input_ids.device)

# Generate text using the model
output = model.generate(input_ids, attention_mask=attention_mask, max_length=50)  # Generate a sequence of tokens
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)  # Decode back to text

print(generated_text)

### Exercise 5.6:

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

start_context = "Every effort moves you"
input_ids = tokenizer.encode(start_context, return_tensors="pt", padding=True, truncation=True, max_length=50)
attention_mask = torch.ones(input_ids.shape, device=input_ids.device)

output = model.generate(input_ids, attention_mask=attention_mask, max_length=50)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)