In [55]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass

In [56]:
# def create_mask(n_context: int) -> Float[torch.Tensor, "n_context n_context"]:
def create_mask(n_context: int) -> torch.Tensor:
    mask = torch.zeros(n_context, n_context)
    indices = torch.triu_indices(n_context, n_context, offset=1)
    mask[indices[0], indices[1]] = float('-inf') 
    return mask

In [57]:

# Configuration class for our transformer
@dataclass
class GPTConfig:
	d_vocab: int = 10_000 
	d_model: int = 256
	d_mlp: int = 512
	n_heads: int = 4
	d_head: int = 32
	n_layers: int = 6
	max_ctx: int = 512


class AttentionHead(nn.Module):

	def __init__(self, cfg: GPTConfig):
		super().__init__()

		# Define learned attention matrices
		self.W_Q = nn.Linear(cfg.d_model, cfg.d_head)
		self.W_K = nn.Linear(cfg.d_model, cfg.d_head)
		self.W_O = nn.Linear(cfg.d_head, cfg.d_model)
		self.W_V = nn.Linear(cfg.d_model, cfg.d_head)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
		M = create_mask(x.size(0)).to(x.device) # Make sure mask is on same device as input tensor
		return F.softmax(self.W_Q(x) @ self.W_K(x).T + M) @ self.W_O(self.W_V(x)) # Attention equation
		


class MultiHeadedAttention(nn.Module):

	def __init__(self, cfg: GPTConfig):
		super().__init__()
		self.cfg = cfg

		# List of attention heads
		self.heads = nn.ModuleList([AttentionHead(cfg) for i in range(cfg.n_heads)])

	def forward(self, x: torch.Tensor) -> torch.Tensor:
		
		
		
		head_outputs = [head(x) for head in self.heads]  # List of tensors
		sum_output = sum(head_outputs) # Adds all head outputs together

		return sum_output




class MLP(nn.Module):

	def __init__(self, cfg: GPTConfig):
		super().__init__()
		self.cfg = cfg

		# Define layers of MLP
		self.Hidden = nn.Linear(cfg.d_model, cfg.d_mlp)
		self.Output = nn.Linear(cfg.d_mlp, cfg.d_model)

		# Using GELU activation function
		self.gelu = nn.GELU()
		
	def forward(self, x: torch.Tensor) -> torch.Tensor:

		# It's an MLP
		return self.gelu(self.Output(self.gelu(self.Hidden(x))))



class Transformer(nn.Module):

	def __init__(self, cfg: GPTConfig):
		super().__init__()
		self.cfg = cfg

		# Creates positional embedding matrix that covers all possible context lengths
		self.pos_embedding = nn.Embedding(cfg.max_ctx, cfg.d_model)

		# Embedding and unembedding matrices
		self.embedding = nn.Embedding(cfg.d_vocab, cfg.d_model)
		self.unembedding = nn.Linear(cfg.d_model, cfg.d_vocab)	

		# Layernorm
		self.norm = nn.LayerNorm(cfg.d_model)

		# Creates dictionary of attention heads and mlps depending on transformer depth
		self.layers = nn.ModuleList(
			nn.ModuleDict({
				'attn': MultiHeadedAttention(cfg),
				'mlp': MLP(cfg)
			}) for _ in range(cfg.n_layers)
		)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
  
		x = self.embedding(x) # shape (n_context, d_model)
		
		positions = torch.arange(x.size(0), device=x.device)  # (n_context, 1)
		pos_emb = self.pos_embedding(positions)  # (n_context, d_model)

		x = x + pos_emb

		for layer in self.layers:
			
			# Residual connections for each layer
			x = x + layer['attn'](self.norm(x)) 
			x = x + layer['mlp'](self.norm(x))
		
		return self.unembedding(x)

In [58]:

from get_books import get_many_books

# Import some books and pull them all into one giant string
book_ids = [15, 63, 16, 41, 14, 76, 99, 209, 110, 9]
dataset = get_many_books(book_ids, data_temp="./data/gutenberg_data")
rawtext = ""
for book in dataset:
    rawtext += book

Getting book 15...
	1218778 characters read
Getting book 63...
	103137 characters read
Getting book 16...
	255644 characters read
Getting book 41...
	69837 characters read
Getting book 14...
	1897512 characters read
Getting book 76...
	571137 characters read
Getting book 99...
	45748 characters read
Getting book 209...
	228531 characters read
Getting book 110...
	841270 characters read
Getting book 9...
	21311 characters read


In [59]:
import json

from huffman_encoder import huffman_scramble, huffman_unscramble

with open("huffman_codebook.json", "r") as f:
    loaded_dict = json.load(f)

codebook = loaded_dict

# Reverse the codebook to map binary codes to characters
reverse_codebook = {v: k for k, v in codebook.items()}

In [60]:
with open("tokenizer_dict.json", "r") as f:
    loaded_dict = json.load(f)

tokenizer = loaded_dict

# Create reverse tokenizer for decoding token ids
reverse_tokenizer = {v: k for k, v in tokenizer.items()}

# Find the maximum length of the keys
max_key_length = max(len(key) for key in tokenizer)

print(max_key_length)

def tokenize(text, vocab = tokenizer, max_token_length = max_key_length):

    unk_token = "This shouldn't ever happen"

    tokens = []
    i = 0
    while i < len(text):
        match = None
        # Try to find the longest matching token up to max_token_length
        for j in range(min(i + max_token_length, len(text)), i, -1):
            sub = text[i:j]
            if sub in vocab:
                match = sub
                break
        if match:
            tokens.append(vocab[match])  # Append the token ID from vocab
            i += len(match)  # Move by the length of the matched token
        else:
            tokens.append(vocab[unk_token])  # Use unknown token if no match
            i += 1  # Move by one character if no match
    return tokens

def untokenize(token_ids, vocab = reverse_tokenizer):
    text = ""
    for token_id in token_ids:
        text += vocab[token_id]
    return text

75


In [61]:
scrambled_text = huffman_scramble(rawtext, codebook)
print(scrambled_text[0:100])
token_ids = tokenize(scrambled_text)
print(token_ids)

Õ}$ÃÚ+KúD+õV=×ýV7}Q^Õ_~·<5è¦(Ç!^ç@BÃÙJµ¶°öþ½Ò¼Uctðö¤èéV²6ÀZñUÐoÃÚTT# oÖíx¡ÓªçlZ
[8086, 7, 214, 8719, 1429, 152, 4152, 9835, 20653, 4153, 139, 3467, 219, 3251, 9094, 6189, 926, 4885, 31390, 2038, 44313, 3162, 235, 19101, 39, 44314, 40, 5634, 142, 22449, 63, 34653, 8, 19102, 138, 152, 27300, 13905, 4693, 97, 12982, 1275, 38, 42638, 7153, 102, 30534, 260, 12982, 31784, 38, 42638, 2761, 15081, 104, 44, 71, 160, 27302, 170, 10257, 5, 20654, 45331, 7561, 20655, 20656, 11221, 6803, 95, 11757, 24631, 1376, 19, 45332, 19103, 4624, 34654, 4154, 2464, 132, 34655, 13738, 27301, 9434, 5997, 19102, 138, 236, 23177, 4155, 9435, 197, 176, 4886, 20656, 12983, 17599, 134, 9435, 292, 10731, 15506, 145, 8, 12984, 161, 19104, 12320, 3468, 24632, 19102, 11222, 968, 6580, 852, 156, 27303, 177, 17686, 9436, 1275, 38, 1638, 89, 15507, 3959, 1324, 177, 17686, 9436, 31784, 38, 11758, 31596, 13, 1456, 3469, 34657, 4625, 21, 6386, 40226, 132, 42641, 37423, 4278, 6803, 19, 1514, 2070, 34657, 13738,

In [62]:
token_ids = token_ids[0:14520]

token_ids = torch.tensor(token_ids)
# Reorganize tokens into lengths of chunk_size
chunk_size = 100
to_remove = len(token_ids) % chunk_size
new_shape = len(token_ids) // chunk_size
input_ids = token_ids[:-to_remove].reshape(new_shape, chunk_size)

print(input_ids)

tensor([[ 8086,     7,   214,  ..., 17599,   134,  9435],
        [  292, 10731, 15506,  ...,  5819, 34659,   200],
        [19106,   725,  9836,  ...,    84, 15506, 22437],
        ...,
        [   47,  4918,  4770,  ...,  2387, 34988,  2055],
        [ 6444, 13869, 13544,  ...,   194,    18, 13870],
        [  127,  7820, 40269,  ..., 10843,  1328,     3]])


In [63]:
from torch.utils.data import DataLoader, TensorDataset

# Format tokens for dataloader and load them in
tensor = TensorDataset(input_ids)
dataloader = DataLoader(tensor, batch_size=8, shuffle=True)

In [64]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Torch built with CUDA:", torch.version.cuda)

Torch version: 2.5.1+cu121
CUDA available: True
Torch built with CUDA: 12.1


In [65]:
import torch.optim as optim

import plotly.graph_objects as go

from IPython.display import clear_output

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model_cfg = GPTConfig(d_vocab=vocab_size) # Configure the model for GPT2 vocab size
model = Transformer(model_cfg).to(device)

# Set up optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Put model into training mode to store derivatives (important to do this after it is on gpu)
model.train()

# Pre‐define layout so each redraw looks identical
layout = go.Layout(
    title="Training Loss Over Time",
    xaxis=dict(title="Step"),
    yaxis=dict(title="Loss"),
)

losses = []

n_epochs = 1
print_interval = 10
for epoch in range(n_epochs):
    for step, batch in enumerate(dataloader):
        # [batch_size, seq_len]
        input_ids_batch = batch[0].to(device)

        # We'll accumulate the losses for each sequence in this mini-batch
        total_loss = 0.0
        batch_size = input_ids_batch.size(0)
        
        # Process each sequence individually
        for i in range(batch_size):
            # Extract a single sequence of shape [seq_len]
            seq_ids = input_ids_batch[i]

            # Next-token language modeling: input is all but last token, target is all but first
            inp = seq_ids[:-1]   # shape [seq_len - 1]
            targ = seq_ids[1:]    # shape [seq_len - 1]

            # Forward pass
            # Your model returns logits of shape [seq_len-1, d_vocab]
            logits = model(inp)

            # Compute loss across this sequence
            # CrossEntropyLoss expects [batch, vocab], so we can pass [seq_len-1, d_vocab] vs. [seq_len-1]
            loss = criterion(logits, targ)

            # Accumulate
            total_loss += loss

        # Average across all sequences in the batch
        total_loss = total_loss / batch_size

        # Backprop and update
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Record and push update to Plotly trace
        losses.append(total_loss.item())

        # Print progress
        if (step + 1) % print_interval == 0:
            # redraw the figure
            clear_output(wait=True)
            fig = go.Figure(
                data=[go.Scatter(x=list(range(len(losses))),
                                y=losses,
                                mode="lines",
                                name="Training Loss")],
                layout=layout
            )
            fig.show(renderer="notebook")  # or omit renderer in many cases
            print(f"Epoch {epoch+1}, Step {step+1}, Loss: {total_loss.item():.4f}")

# The loss initially drops fast, and then it becomes more gradual over time. 
# Due to the nature of sgdm, the loss doesn't always decrease after every batch.

cuda


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Move the model back to the cpu for inference
model.to('cpu')
model.eval()

# Generate function
def generate(input_text: str, output_tokens: int, model) -> str:

    scrambled_text = huffman_scramble(input_text, codebook)
    token_ids = tokenize(scrambled_text)

    for new_token in range(output_tokens):

        to_model = torch.tensor(token_ids)
        with torch.no_grad():
            out_probs = F.softmax(model(to_model), dim=-1) 
        samples = torch.multinomial(out_probs, 1)
        output_id = samples[-1][0].item()
        token_ids.append(output_id)

    scrambled_text = untokenize(token_ids)
    return huffman_unscramble(scrambled_text, reverse_codebook)

print(generate("Give me a home where the buffalo roam", 5, model))

Give me a home where the buffalo roam”n ieelde



Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.

