In [1]:
# https://brunomaga.github.io/GPT-lite

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# set the random seed, for reproducibility
torch.manual_seed(42)

# device: where to execute computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# how often to do an evaluation step
eval_interval = 100

# number of training iterations
max_iters = 500

# optimizer's learning rate
learning_rate=1e-4

# minibatch size, how many inputs to 'pack' per iteration 
batch_size = 3

In [3]:

# block size is the maximum sequence length used as input.
# E.g. for block_size 4 and input ABCD, we have training samples A->B, AB->C, ABC->C, ABCD->E
block_size = 4

# size of the embeddings
n_embd = 16

# number of attention heads in Multi-Attention mechanism (the Nx in the GPT decoder diagram)
n_head = 6

# depth of the network as number of decoder blocks.
# Each block contains a normalization, an attention and a feed forward unit
n_layer = 6

# dropout rate (variable p) for dropout units
dropout = 0.2

In [4]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open("input.txt") as f:
    text = f.read()

--2024-05-31 22:47:13--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt'

     0K .......... .......... .......... .......... ..........  4% 4.80M 0s
    50K .......... .......... .......... .......... ..........  9% 14.7M 0s
   100K .......... .......... .......... .......... .......... 13% 13.4M 0s
   150K .......... .......... .......... .......... .......... 18% 54.5M 0s
   200K .......... .......... .......... .......... .......... 22% 21.5M 0s
   250K .......... .......... .......... .......... .......... 27% 48.5M 0s
   300K .......... .......... .......... .......... .......... 32% 18.5M 0s
   350K .......... ..

In [5]:
# collect sorted list of input characters and create 
# string-to-int (stoi) and int-to-string (itos) representations:
chars = sorted(list(set(text)))
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

# define encode and decode functions that convert strings to arrays of tokens and vice-versa
encode = lambda x: torch.tensor([stoi[ch] for ch in x], dtype=torch.long) #encode text to integers
decode = lambda x: ''.join([itos[i] for i in x]) #decode integers to text
vocab_size = len(stoi)

In [7]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4")
print(enc.n_vocab)
print(enc.encode("Hello world"))

100277
[9906, 1917]


In [8]:
token_embedding_table = nn.Embedding(vocab_size, n_embd)    # from tokens to embedding
position_embedding_table = nn.Embedding(block_size, n_embd) # from position to embedding

In [9]:
data = encode(text)  #use any encoder here
n = int(0.9*len(data))
train_data, valid_data = data[:n], data[n:]

In [10]:
def get_batch(source):
  """ get batch of size block_size from source """
  
  # generate `batch_size` random offsets on the data 
  ix = torch.randint(len(source)-block_size, (batch_size,) )
  # collect `batch_size` subsequences of length `block_size` from source, as data and target
  x = torch.stack([source[i:i+block_size] for i in ix])
  # target is just x shifted right (ie the predicted token is the next in the sequence)
  y = torch.stack([source[i+1:i+1+block_size] for i in ix])
  return x.to(device), y.to(device)


# test get_batch()
xb, yb = get_batch(train_data)
print("input:\n",xb)
print("target:\n",yb)

for b in range(batch_size): #for every batches
  print(f"\n=== batch {b}:")
  for t in range(block_size): #for each sequence in block
    context = xb[b,:t+1]
    target = yb[b,t]
    print(f"for input {context.tolist()} target is {target.tolist()}")

input:
 tensor([[52,  1, 39, 50],
        [56, 56, 63,  6],
        [56, 50,  8,  0]], device='cuda:0')
target:
 tensor([[ 1, 39, 50, 50],
        [56, 63,  6,  1],
        [50,  8,  0, 16]], device='cuda:0')

=== batch 0:
for input [52] target is 1
for input [52, 1] target is 39
for input [52, 1, 39] target is 50
for input [52, 1, 39, 50] target is 50

=== batch 1:
for input [56] target is 56
for input [56, 56] target is 63
for input [56, 56, 63] target is 6
for input [56, 56, 63, 6] target is 1

=== batch 2:
for input [56] target is 50
for input [56, 50] target is 8
for input [56, 50, 8] target is 0
for input [56, 50, 8, 0] target is 16


In [11]:
class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key   = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)
    #Note: this dropout randomly prevents some tokens from communicating with each other

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x) #shape (B,T, head_size)
    q = self.query(x) #shape (B,T, head_size)
    v = self.value(x) #shape (B,T, head_size)

    #compute self-attention scores
    wei = q @ k.transpose(-2, -1) #shape (B,T, head_size) @ (B,head_size,T) --> (B,T,T)
    wei *= C**-0.5 #scale by sqrt(d_k) as per paper, so that variance of the wei is 1
    wei = wei.masked_fill(self.tril[:T,:T]==0, float('-inf')) # (B,T,T)
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    wei = self.dropout(wei)

    #perform weighted aggregation of values
    out = wei @ v # (B, T, T) @ (B, T, head_size) --> (B, T, head_size)
    return out

In [12]:
class MultiHeadAttention(nn.Module):
  """ Multi-head attention as a collection of heads with concatenated outputs."""
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj  = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([head(x) for head in self.heads], dim=-1)
    out = self.proj(out)
    out = self.dropout(out)
    return out

In [13]:
class FeedForward(nn.Module):
  """ the feed forward network (FFN) in the paper"""

  def __init__(self, n_embd):
    super().__init__()
    # Note: in the paper (section 3.3) we have d_{model}=512 and d_{ff}=2048.
    # Therefore the inner layer is 4 times the size of the embedding layer
    self.net = nn.Sequential(
        nn.Linear(n_embd, n_embd*4),
        nn.ReLU(),
        nn.Linear(n_embd*4, n_embd),
        nn.Dropout(dropout)
      )

  def forward(self, x):
    return self.net(x)

In [14]:
class Block(nn.Module):
  """ Transformer block: comunication (attention) followed by computation (FFN) """

  def __init__(self, n_embd, n_head):
    # n_embd: embedding dimension
    # n_heads : the number of heads we'd like to use
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

In [15]:

class GPTlite(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    
    # vocabulary embedding and positional embedding
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)

    #sequence of attention heads and feed forward layers
    self.blocks = nn.Sequential( *[Block(n_embd, n_head) for _ in range(n_layer)])

    #one layer normalization layer after transformer blocks and before linear layer that outputs the vocabulary
    self.ln = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
  

  def forward(self, idx):
    """ call the model with idx and targets (training) or without targets (generation)"""

    #idx and targets are both of shape (B,T)
    B, T = idx.shape
    tok_emb = self.token_embedding_table(idx) #shape (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) #shape (T,C)
    x = tok_emb + pos_emb #shape (B,T,C)
    x = self.blocks(x)
    x = self.ln(x)
    logits = self.lm_head(x) #shape (B,T,C)
    logits = torch.swapaxes(logits, 1, 2) #shape (B,C,T) to comply with CrossEntropyLoss
    return logits


  def generate(self, idx, max_new_tokens):
    """ given a context idx, generate max_new_tokens tokens and append them to idx """
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:] #we can never have any idx longer than block_size
      logits = self(idx_cond) #call fwd without targets
      logits = logits[:, :, -1] # take last token. shape (B, C)
      #convert logits to probabilities
      probs = F.softmax(logits, dim=-1) # shape (B, C)
      #randomly sample the next tokens, 1 for each of the previous probability distributions
      #(one could take instead the argmax, but that would be deterministic and boring)
      idx_next = torch.multinomial(probs, num_samples=1) # shape (B, 1)
      #append next token ix to the solution sequence so far
      idx = torch.cat([idx, idx_next], dim=-1) # shape (B, T+1)
    return idx  

In [16]:
m  = GPTlite(vocab_size).to(device)

In [17]:
# train the model
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)
for steps in range(max_iters):
  idx, targets = get_batch(train_data)   #get a batch of training data
  logits = m(idx)   #forward pass
  loss = F.cross_entropy(logits, targets)
  loss.backward()   #backward pass
  optimizer.step()   #update parameters
  optimizer.zero_grad(set_to_none=True)  #sets to None instead of 0, to save memory

  #print progress
  if steps % 100 == 0: print(f"step {steps}, loss {loss.item():.2f}")
    
  @torch.no_grad()
  # eval loop: no backprop on this data, to avoid storing all intermediatte variables
  def eval_loss():
    idx, targets = get_batch(valid_data)   #get a batch of validation data
    logits = m(idx)   #forward pass
    loss = F.cross_entropy(logits, targets)
    print(f"step {steps}, eval loss {loss.item():.2f}")
    return loss
  
  if steps % eval_interval == 0: eval_loss().item()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (12x12 and 16x16)

In [None]:
#a 1x1 tensor with batch size 1 and sequence length 1 and starting value 0 (0 is the \n character)
idx = torch.zeros((1,1), dtype=torch.long, device=device)

# test the same generate() function, now with the trained model
print(decode(m.generate(idx, max_new_tokens=500).tolist()[0]))