
## Baby Transformer GPT


In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F


In [2]:

# === Config ===
# Every tunable size lives in this cell; later cells only read these names.
seed = 0           # global RNG seed: fixes the random token IDs and weight inits
batch_size = 2     # sequences per batch
seq_len = 10       # tokens per sequence (also the size of the positional table)
vocab_size = 1000  # number of distinct token IDs
embed_dim = 64     # width of every token / position vector
ff_dim = 128       # hidden width of the feed-forward layer

# Without a fixed seed every "Restart & Run All" draws different tokens and
# different initial weights, so no displayed value could be reproduced.
# The trailing ';' keeps the returned Generator object out of the cell output.
torch.manual_seed(seed);


In [3]:

# === Synthetic input batch ===
# Integer token IDs sampled uniformly from [0, vocab_size); they stand in for
# real tokenised text so the rest of the pipeline has something to consume.
tokens = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))  # [2, 10]
tokens


tensor([[251, 528, 243,  43, 851, 227, 169, 821, 314, 854],
        [185, 719, 518, 847, 244, 180, 661,  95, 118, 687]])

In [4]:

# === Embedding tables ===
# token_embed: one learnable vector per vocabulary entry.
# pos_embed:   one learnable vector per position 0..seq_len-1.
token_embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
pos_embed = nn.Embedding(num_embeddings=seq_len, embedding_dim=embed_dim)


In [5]:

# Look up the content and position vectors for every token in the batch.

# Content: one vector per token ID.
x_token = token_embed(tokens)  # [2, 10, 64]

# Position: indices 0..seq_len-1, broadcast (without copying) to every sequence.
positions = torch.arange(seq_len)[None, :].expand(batch_size, -1)  # [2, 10]
x_pos = pos_embed(positions)  # [2, 10, 64]


In [6]:

# Model input: element-wise sum of the content and position vectors.
x = torch.add(x_token, x_pos)  # [2, 10, 64]
x.shape


torch.Size([2, 10, 64])

In [7]:

# === Self-Attention: projection layers ===
# Three independent embed_dim -> embed_dim linear maps producing the queries,
# keys and values. They are constructed in the order Wq, Wk, Wv.
Wq, Wk, Wv = (nn.Linear(embed_dim, embed_dim) for _ in range(3))


In [8]:

# Project the same input three ways: queries, keys, values (each [2, 10, 64]).
Q, K, V = Wq(x), Wk(x), Wv(x)

K.shape


torch.Size([2, 10, 64])

In [9]:

# Scaled dot-product attention with a causal mask (GPT-style, decoder-only).
# Dividing by sqrt(embed_dim) keeps the dot products in a range where softmax
# is not saturated.
attn_scores = Q @ K.transpose(-2, -1) / (embed_dim ** 0.5)  # [2, 10, 10]

# Position i may only attend to positions <= i. Without this mask each token
# would see the very tokens an autoregressive model is supposed to predict.
causal_mask = torch.tril(
    torch.ones(Q.size(-2), K.size(-2), dtype=torch.bool, device=attn_scores.device)
)  # [10, 10], True on and below the diagonal
attn_scores = attn_scores.masked_fill(~causal_mask, float("-inf"))

# -inf scores become exactly 0 after softmax. Every row keeps at least its
# diagonal entry, so no row is all -inf (which would produce NaN).
attn_weights = F.softmax(attn_scores, dim=-1)  # [2, 10, 10]

attn_weights.shape


torch.Size([2, 10, 10])

In [10]:

# Mix the value vectors: each output position is the attention-weighted
# average of V over the sequence axis.
attn_output = torch.matmul(attn_weights, V)  # [2, 10, 64]

attn_output.shape


torch.Size([2, 10, 64])

In [11]:


# === Position-wise feed-forward layers ===
# ff1 expands embed_dim -> ff_dim, ff2 projects ff_dim -> embed_dim.
ff1 = nn.Linear(in_features=embed_dim, out_features=ff_dim)
ff2 = nn.Linear(in_features=ff_dim, out_features=embed_dim)


In [12]:

# Expand, apply ReLU, project back: the same two-layer MLP at every position.
ff_output = ff2(torch.relu(ff1(attn_output)))  # [2, 10, 64]
ff_output.shape


torch.Size([2, 10, 64])

In [13]:

# === Output head ===
# Map every position's embed_dim vector to one unnormalised score per
# vocabulary entry.
to_vocab = nn.Linear(in_features=embed_dim, out_features=vocab_size)
logits = to_vocab(ff_output)  # [2, 10, vocab_size]



In [14]:


# Sanity check: one score per vocabulary entry at every position of every sequence.
print("Logits shape:", logits.size())  # [2, 10, 1000]


Logits shape: torch.Size([2, 10, 1000])
