https://github.com/Infatoshi/fcc-intro-to-llms/blob/main/bigram.ipynb

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [2]:
# Read the whole corpus into memory as one string.
# NOTE(review): the vocabulary printed further down contains '\ufeff', i.e. the file
# starts with a byte-order mark; encoding='utf-8-sig' would strip it — confirm before changing,
# since it shifts the vocabulary size.
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()


In [3]:
# Corpus size, counted two ways (as a string and as a list of characters),
# followed by a short preview of the opening text.
num_chars = len(text)
num_list_items = len(list(text))
print(num_chars, num_list_items)
print(text[:100])


232310 232310
﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ


In [4]:
# Vocabulary: every distinct character in the corpus, in sorted (code point) order.
chars = list(set(text))
chars.sort()
print(len(chars))
print(chars)


81
['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [5]:
vocab_size = len(chars)

# Lookup tables between a character and its integer id (its position in `chars`).
string_to_int = dict(zip(chars, range(vocab_size)))
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not in the vocabulary.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`.

    Raises KeyError if an id is not in the vocabulary.
    """
    return ''.join(int_to_string[i] for i in l)


print(encode('hello'))
print(decode([61, 58, 65, 65, 68]))


[61, 58, 65, 65, 68]
hello


In [6]:
# Encode the whole corpus as one 1-D tensor of integer (int64) token ids, one per character.
data = torch.tensor(encode(text), dtype=torch.long)


In [7]:
# Peek at the first 100 token ids (the encoded form of the text preview printed earlier).
data[:100]


tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])

In [7]:
# Hold out the final 20% of the token stream for validation; the first 80% is for training.
train_len = int(len(data) * 0.8)
train_data, val_data = data[:train_len], data[train_len:]

print(len(train_data), len(val_data))


185848 46462


In [9]:
# Illustration of block size: one block of `block_size` tokens yields `block_size`
# training examples, because every prefix of the block is a context whose target
# is the token that immediately follows it.

block_size = 5

x = train_data[:block_size]
y = train_data[1:block_size+1]     # same window shifted one token to the right

for end in range(1, block_size + 1):
    print(f"Context: {x[:end]}, target: {y[end - 1]}")


Context: tensor([80]), target: 1
Context: tensor([80,  1]), target: 1
Context: tensor([80,  1,  1]), target: 28
Context: tensor([80,  1,  1, 28]), target: 39
Context: tensor([80,  1,  1, 28, 39]), target: 42


In [8]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)


cuda


In [9]:
block_size = 8      # number of tokens in each training window (context length)
batch_size = 4      # number of windows sampled per batch

# 80/20 train/validation split of the encoded corpus.
n = int(0.8*len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of tensors of shape (batch_size, block_size) on `device`.
        Row k of `y` is row k of `x` shifted one token to the right, so y[k, t]
        is the token that follows x[k, t] in the source data.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# Draw one training batch and show the inputs next to their shifted targets.
x, y = get_batch('train')
for label, tensor in (('inputs:', x), ('targets:', y)):
    print(label)
    print(tensor)


inputs:
tensor([[58, 54, 71,  1, 73, 61, 58,  1],
        [72, 62, 57, 58, 72, 23,  1, 73],
        [67, 60,  1, 54, 73,  1, 54,  1],
        [76, 61, 68, 71, 72, 58,  1, 62]], device='cuda:0')
targets:
tensor([[54, 71,  1, 73, 61, 58,  1, 72],
        [62, 57, 58, 72, 23,  1, 73, 61],
        [60,  1, 54, 73,  1, 54,  1, 73],
        [61, 68, 71, 72, 58,  1, 62, 72]], device='cuda:0')


In [18]:
class BigramLanguageModel(nn.Module):
    """Bigram language model over integer token ids.

    Each token id selects one row of a (vocab_size, vocab_size) embedding table,
    and that row is used directly as the logits for the next token. A prediction
    therefore depends only on the token at the same position, not on earlier ones.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; also the number of logits per token.
        """
        super().__init__()
        # Row i holds the unnormalised next-token scores given current token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) integer tensor of token ids (B = batch, T = time steps).
            targets: optional (B, T) integer tensor of the expected next tokens.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and loss
            is None. With targets, logits is flattened to (B * T, C), the layout
            F.cross_entropy expects, and loss is the mean cross-entropy.
        """
        logits = self.token_embedding_table(index)   # (B, T, C)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) scores and (N,) class ids, so fold B and T together.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building autograd graphs
    def generate(self, index, max_new_tokens):
        """Extend each sequence in `index` by sampling `max_new_tokens` tokens.

        Args:
            index: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Call the module rather than .forward() so registered hooks run.
            logits, _loss = self(index)

            # Focus only on the last time step
            logits = logits[:, -1, :]   # (B, C)

            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)   # (B, C)

            # Sample one next token per sequence from the distribution
            next_index = torch.multinomial(probs, num_samples=1)   # (B, 1)

            index = torch.cat((index, next_index), dim=1)   # (B, T+1)

        return index
  

In [19]:
# Build the model on the chosen device; `m` is a second name for the same module.
model = BigramLanguageModel(vocab_size).to(device)
m = model

# Start from a single sequence holding token id 0 and sample 500 more tokens.
context = torch.zeros(1, 1, dtype=torch.long, device=device)   # tensor([[0]])
output = m.generate(context, 500)
output.shape


torch.Size([1, 501])

In [17]:
# Token ids of the single generated sequence (row 0 of the batch returned by generate).
output[0]


tensor([ 0, 20, 17, 35, 76, 58, 72, 63, 79, 60, 17, 12, 48, 56, 50, 14, 25, 33,
         9, 66, 14, 35, 17, 66, 49, 62, 67, 10,  7, 50,  7, 10, 13,  3, 10, 41,
        49, 45, 36, 49, 76, 17,  5, 55, 40, 44, 50, 30, 24, 19, 54, 72, 74,  9,
         5, 14, 48, 21,  6, 51, 53, 50, 68, 30, 42, 46,  5, 79, 19, 70, 71, 28,
        57, 53, 61, 16, 80, 53, 75, 24, 44, 53, 44, 42, 46, 26, 22, 21, 64, 62,
        37, 28, 47, 31, 47, 35, 76, 75, 50, 34, 75, 25, 33, 35, 76, 49, 66, 42,
        74, 78, 77, 47, 69, 52, 39, 56, 48, 34, 39, 45,  4, 23, 42,  6, 46, 52,
        73, 35, 19, 13, 34, 39, 20, 42,  4, 51, 75, 11, 22, 66, 54,  5, 31,  7,
        65, 11, 48, 78, 77,  1, 58, 55, 21, 70,  8, 69, 20, 42, 68, 29, 64, 29,
        50, 14, 77,  5, 48, 56, 65, 78, 70, 44, 38, 39, 28, 31, 51, 25, 34, 56,
        38,  4, 17, 64, 25, 75, 80, 41, 67, 79, 24, 17, 70, 41, 17,  1, 47, 49,
        76, 47, 56, 42, 79, 62, 77, 17, 44, 24, 63, 37, 67, 49, 80, 29, 24, 12,
        76, 75, 28, 27, 76,  7, 24, 59, 

In [18]:
# Turn the generated token ids back into text.
# The id list gets its own name instead of rebinding `output`: overwriting the tensor with a
# list made this cell fail on a second run (an int has no .tolist()) and changed what the
# preceding `output[0]` cell displays.
generated_ids = output[0].tolist()
generated_chars = decode(generated_ids)
print(generated_chars)



85Kwesjzg50XcZ2AI,m2K5mYin-)Z)-1"-QYULYw5'bPTZF?7asu,'2X9([_ZoFRV'z7qrDd_h4﻿_v?T_TRVB:9kiMDWGWKwvZJvAIKwYmRuyxWp]OcXJOU&;R(V]tK71JO8R&[v.:ma'G)l.Xyx eb9q*p8RoEkEZ2x'XclyqTNODG[AJcN&5kAv﻿Qnz?5qQ5 WYwWcRzix5T?jMnY﻿E?0wvDCw)?fzy_17E,dVVF.i1C  AJynWd6PG6zzTD(Epw7WC:"M;xS,I)?qK)jkAwiRRoQ64MgNtPI,bFsK8bm,I
5gvasdoWH&9OS]_ Ws*6sH'h]_HVWCgV6
j3xpwB5v.ESznP
w C0uPqcBYU:tj,Ut?:R_J[Y4x"&mq0fkO.fd:a(jlb0oQE;rC
"oF
t[ZD
:8w):tj!L,d68z;?:s7rh;4fmROJo_[wCqckJT?qT4ZVX&"Y6RV:'1,N?i
vfJc1Klkz4K:a(qV!0a0N?:ZS,A4r'


In [20]:
# Training hyperparameters.
# max_iters: number of optimisation steps run by the training loop.
max_iters = 1000
# eval_interval = 2500
# learning_rate: step size handed to the optimizer below.
learning_rate = 3e-4
# eval_iters: number of batches averaged by estimate_loss(); the training loop
# also uses it as the number of steps between evaluations.
eval_iters = 100

# Create a pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    """Estimate the validation loss as a mean over `eval_iters` random batches.

    The model is put in eval mode for the measurement and returned to whatever
    mode it was in before the call, so callers do not have to restore it.

    Returns:
        0-dim tensor holding the mean loss over the sampled batches.
    """
    losses = torch.zeros(eval_iters)

    was_training = model.training
    model.eval()
    try:
        for k in range(eval_iters):
            # get_batch treats every split name other than 'train' as validation data
            X, Y = get_batch('validation')
            _logits, loss = model(X, Y)
            losses[k] = loss.item()
    finally:
        # Restore the previous mode even if a batch raises
        model.train(was_training)

    return losses.mean()


In [21]:
# Training loop
# The evaluation cadence reuses `eval_iters`: the validation loss is reported every
# `eval_iters` steps, and each report averages `eval_iters` validation batches.
eval_interval = eval_iters

# `step` (not `iter`) so the loop variable does not shadow the builtin iter()
# in the notebook's global namespace once the cell has run.
for step in range(max_iters):

    # Sample a batch of data
    xb, yb = get_batch('train')

    # estimate_loss() switches the model to eval mode, so re-enable training mode every step
    model.train()

    # Forward pass and evaluate the loss
    logits, loss = model(xb, yb)    # OR model.forward(xb, yb)

    # Clear gradients left over from any earlier backward pass before computing new ones
    optimizer.zero_grad(set_to_none=True)

    # Backpropagate: compute the gradient of the loss w.r.t. every model parameter
    loss.backward()

    # Update the parameters using those gradients and the learning rate
    optimizer.step()

    if step % eval_interval == 0:
        val_loss = estimate_loss()
        print(f"Step: {step}, Training Loss: {loss.item():.3f}, Validation Loss: {val_loss:.3f}")

print(loss.item())


Step: 0, Training Loss: 4.771, Validation Loss: 4.767
Step: 100, Training Loss: 4.493, Validation Loss: 4.769
Step: 200, Training Loss: 4.643, Validation Loss: 4.729
Step: 300, Training Loss: 4.664, Validation Loss: 4.742
Step: 400, Training Loss: 4.476, Validation Loss: 4.674
Step: 500, Training Loss: 4.785, Validation Loss: 4.659
Step: 600, Training Loss: 4.928, Validation Loss: 4.649
Step: 700, Training Loss: 4.888, Validation Loss: 4.624
Step: 800, Training Loss: 4.520, Validation Loss: 4.602
Step: 900, Training Loss: 4.626, Validation Loss: 4.561
4.600480556488037


### Positional Encoding

In [2]:
seq_len = 6       # number of positions to encode
d_model = 100     # size of the encoding vector for each position

# Positional-encoding matrix, one row per position: shape (seq_len, d_model), all zeros for now
pe = torch.zeros((seq_len, d_model))
pe.shape

torch.Size([6, 100])

In [3]:
# POS or position - the indices 0 .. seq_len-1 as floats, arranged as a column
# so they broadcast against the per-dimension division term.
position = torch.arange(seq_len, dtype=torch.float)[:, None]     # (seq_len, 1)
position.shape


torch.Size([6, 1])

In [4]:
import math

# Division term in the formula - one value per even embedding index 2i,
# equal to 10000 ** (-2i / d_model) and computed through exp/log.
even_index = torch.arange(0, d_model, 2).float()    # 0, 2, 4, ...
log_scale = -math.log(10000.0) / d_model
div_term = torch.exp(even_index * log_scale)  # (d_model / 2)
div_term.shape

torch.Size([50])

In [None]:
# Apply sine to even indices
# (seq_len, 1) * (d_model / 2,) broadcasts to (seq_len, d_model / 2), filling every even column.
pe[:, 0::2] = torch.sin(position * div_term)    # sin(position / 10000 ** (2i / d_model))


In [None]:
# Apply cos to odd indices
# Same angles as the sine columns; the cosine of each fills the neighbouring odd column.
pe[:, 1::2] = torch.cos(position * div_term)    # cos(position / 10000 ** (2i / d_model))


In [None]:
# Add a batch dimension to positional encoding
# (1, seq_len, d_model)
# Reshaping from the last two dimensions keeps this cell idempotent: unsqueeze(0) would add
# one more leading dimension every time the cell is re-run.
pe = pe.reshape(1, *pe.shape[-2:])
