In [None]:
#%%

!curl https://raw.githubusercontent.com/karpathy/ng-video-lecture/refs/heads/master/input.txt -o input_data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  2043k      0 --:--:-- --:--:-- --:--:-- 2039k


In [2]:
# %%

# Read the corpus from the location the download cell writes to
# (`input_data` in the working directory), so the notebook runs top to bottom
# on a fresh checkout.
with open('input_data', 'r', encoding='utf-8') as f:
    text = f.read()

print("length of dataset in chars:", len(text))
print("first thousand chars: \n", text[:1000])

length of dataset in chars: 1115394
first thousand chars: 
 First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I


# Defining our tokenizer

In [3]:
# %%

# character-level vocabulary: every distinct character of the corpus, sorted
vocab = sorted(set(text))
vocab_size = len(vocab)
print('vocab size: ', vocab_size)
print('vocab: ', ''.join(vocab))

vocab size:  65
vocab:  
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [4]:
# %%

# tokenize the input text by characters: each character maps to its index in `vocab`

str2int = {ch: i for i, ch in enumerate(vocab)}
int2str = {i: ch for i, ch in enumerate(vocab)}


def encoder(s):
    """Return the list of integer token ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not in `vocab`.
    """
    return [str2int[c] for c in s]


def decoder(l):
    """Return the string spelled by the sequence of integer token ids `l`.

    Raises KeyError if an id is not a key of `int2str`.
    """
    return ''.join(int2str[i] for i in l)


# The misspelled name is kept as an alias so existing calls keep working.
decorder = decoder

print(encoder("my name is rawamily"))
print(decorder(encoder("my name is rawamily")))

[51, 63, 1, 52, 39, 51, 43, 1, 47, 57, 1, 56, 39, 61, 39, 51, 47, 50, 63]
my name is rawamily


In [6]:
# %%
import torch
# tokenize the whole corpus and keep the ids in an int64 tensor
data = torch.tensor(encoder(text), dtype=torch.int64)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

# Data prep

In [7]:
# %%

# first 90% of the tokens are used for training, the remaining 10% for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [8]:
#%%

block_size = 8
train_data[:block_size+1]  # not echoed: only a cell's last expression is displayed

# A chunk of block_size+1 tokens holds block_size training examples:
# each prefix of x is a context, and the token that follows it is the target.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"when input is {context}, the target is {target}")

when input is tensor([18]), the target is 47
when input is tensor([18, 47]), the target is 56
when input is tensor([18, 47, 56]), the target is 57
when input is tensor([18, 47, 56, 57]), the target is 58
when input is tensor([18, 47, 56, 57, 58]), the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [9]:
# %%

# batching for parallel processing of blocks
# (batch_size and block_size are module-level settings that get_batch reads when it is called)

torch.manual_seed(1337)
batch_size = 4 # number of independent sequences processed in parallel
block_size = 8 # maximum context length for prediction

def get_batch(split):
    """Sample a random batch of input/target blocks from one data split.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. Returns `(x, y)`, each stacked from `batch_size` slices of
    length `block_size`, where `y` is the same window as `x` shifted one
    position later (the next-token targets).
    """
    source = train_data if split == 'train' else val_data
    # random start offsets, bounded so the shifted target window still fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x, y

# draw one training batch and print the inputs followed by their one-step-shifted targets
xb, yb = get_batch('train')
print('inputs: \n', xb.shape, "\n", xb)
print('targets: \n', yb.shape, "\n", yb)

inputs: 
 torch.Size([4, 8]) 
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
 torch.Size([4, 8]) 
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [10]:
#%%

# spell out every (context, target) pair contained in the batch
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t+1], yb[b, t]
        print(context)
        print(f"when the input is {context.tolist()}, the target: {target}")

tensor([24])
when the input is [24], the target: 43
tensor([24, 43])
when the input is [24, 43], the target: 58
tensor([24, 43, 58])
when the input is [24, 43, 58], the target: 5
tensor([24, 43, 58,  5])
when the input is [24, 43, 58, 5], the target: 57
tensor([24, 43, 58,  5, 57])
when the input is [24, 43, 58, 5, 57], the target: 1
tensor([24, 43, 58,  5, 57,  1])
when the input is [24, 43, 58, 5, 57, 1], the target: 46
tensor([24, 43, 58,  5, 57,  1, 46])
when the input is [24, 43, 58, 5, 57, 1, 46], the target: 43
tensor([24, 43, 58,  5, 57,  1, 46, 43])
when the input is [24, 43, 58, 5, 57, 1, 46, 43], the target: 39
tensor([44])
when the input is [44], the target: 53
tensor([44, 53])
when the input is [44, 53], the target: 56
tensor([44, 53, 56])
when the input is [44, 53, 56], the target: 1
tensor([44, 53, 56,  1])
when the input is [44, 53, 56, 1], the target: 58
tensor([44, 53, 56,  1, 58])
when the input is [44, 53, 56, 1, 58], the target: 46
tensor([44, 53, 56,  1, 58, 46])


# Defining our torch model

In [11]:
# %%

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; the row
    looked up for a token is used directly as the logits for the token that
    follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            `(logits, loss)`. Without targets, `logits` is (B, T, C) and
            `loss` is None. With targets, `logits` is returned flattened to
            (B*T, C) and `loss` is the cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:  # inference: nothing to score against
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy wants the class dimension second, so flatten to 2D
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, target=targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids for the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the input context followed by the
            sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get predictions for the whole context
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # keep only the last time step, (B,C)
            probs = F.softmax(logits, dim=-1)  # logits -> probabilities
            next_idx = torch.multinomial(probs, num_samples=1)  # sample, (B,1)
            idx = torch.cat((idx, next_idx), dim=1)  # (B, T+1)
        return idx
    
# build the model and score it on the sampled batch
m = BigramLanguageModel(vocab_size=vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# sample 100 new tokens, starting from a single token with id 0, and decode them
start_idx = torch.zeros((1, 1), dtype=torch.long)
sample_ids = m.generate(idx=start_idx, max_new_tokens=100)[0].tolist()
print(decorder(sample_ids))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


# Training using an Adam optimizer

In [13]:
# %%

# get pytorch optimizer: AdamW over all parameters of `m`, learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [16]:
#%%

# training loop: 10000 optimizer steps on random batches of 32 sequences
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')  # fresh random batch

    logits, loss = m(xb, yb)  # forward pass and loss on this batch
    optimizer.zero_grad(set_to_none=True)  # drop gradients left from the previous step
    loss.backward()
    optimizer.step()

    # only every 1000th step is reported (the loss of that single batch)
    if steps % 1000 != 0:
        continue
    print(f"loss at step {steps}: {loss.item()}")

loss at step 0: 2.4636545181274414
loss at step 1000: 2.390366792678833
loss at step 2000: 2.5757007598876953
loss at step 3000: 2.584615468978882
loss at step 4000: 2.4348089694976807
loss at step 5000: 2.449105739593506
loss at step 6000: 2.4397647380828857
loss at step 7000: 2.4875221252441406
loss at step 8000: 2.3294029235839844
loss at step 9000: 2.418365716934204


In [None]:
# %%

# sample 500 new tokens from `m`, starting from a single token with id 0, and decode them
start_idx = torch.zeros((1, 1), dtype=torch.long)
print(decorder(m.generate(idx=start_idx, max_new_tokens=500)[0].tolist()))



Ofows ht IUS:
S:

ING flvenje ssutefr,
M:
War cl igagimous pray whars:
Panalit I It aithit terised thevermenghau buaror VOubed spo mng as chathab llll:
Ware,

ee her,
Thooured aly y hindr's.
Fashat--
MNGes s, share hathure Anfaneof f s llon!

ICLiroushange

Then
Magend cugss, be jollrty

AROUFLom, ifay wil wher, gheatalloult llats howheprshakengayoref f f abighine ck orors n s?
NGABerd Foutheig vemy.
NG t isoststor hnor 'myougorme whe s'car n r toun t pridie are he of t ad
BY:
Hatamethat vint i


# Mathematical trick of self-attention!

In [2]:
import torch

# small random input used by the averaging examples
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn((B, T, C))
x.shape


torch.Size([4, 8, 2])

In [3]:
# information should only flow from past context to the current position
# We want xbow[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B, T, C))  # "bag of words": running average over the previous context
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]  # positions 0..t of sequence b
        xbow[b, t] = xprev.mean(dim=0)  # average over time, per channel


x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [4]:
# running averages for the first sequence; row t is the mean of rows 0..t of x[0]
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [7]:
# use matrix multiplication for efficiency: the running average as one matmul
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))  # lower-triangular ones: row i selects rows 0..i of b
a = a / a.sum(dim=1, keepdim=True)  # rows sum to 1, so the weighted sum becomes a mean
b = torch.randint(0, 10, (3, 2)).float()

c = a @ b  # row i of c is the average of rows 0..i of b

for label, mat in (('a', a), ('b', b), ('c', c)):
    print(f'{label}=\n', mat)


a=
 tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
 tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
 tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [8]:
# apply to the attention average mechanism
# weighted aggregation: the lower-triangular shape means each position only pulls context from the past
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)  # normalise each row so the weighted sum is a mean
xbow2 = wei @ x  # (T,T) is broadcast over the batch: (B,T,T) @ (B,T,C) -> (B,T,C)

In [10]:
# the matrix-multiply version should match the loop-based running average
torch.allclose(xbow, xbow2)

True

In [12]:
# using softmax
from torch.nn import functional as F
tril = torch.tril(torch.ones(T, T))
# start from all-zero affinities and set future positions to -inf; after softmax
# those get weight 0 and the remaining positions share the weight equally
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # normalize each row
xbow3 = wei @ x  # weighted aggregation
torch.allclose(xbow, xbow3)

True

# Self Attention

In [13]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn((B, T, C))

# baseline: uniform causal averaging (all affinities are zero before masking)
tril = torch.tril(torch.ones(T, T))
# future positions are set to -inf so softmax gives them zero weight
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # normalize each row
out = wei @ x  # weighted aggregation

In [19]:
# every single token produces a query, a key and a value vector
# query vector: what information am I looking for from previous tokens?
# key vector: what information do I contain?
# value vector: what gets aggregated once the weights are known
# affinity = dot product of query and key vector

# create a single head of self-attention
head_size = 16
key = torch.nn.Linear(C, head_size, bias=False)
query = torch.nn.Linear(C, head_size, bias=False)
value = torch.nn.Linear(C, head_size, bias=False)

# all tokens in x produce a key, a query and a value, each of size B,T,16
k, q, v = key(x), query(x), value(x)

# raw (unscaled) affinities: B,T,16 multiplies B,16,T to produce B,T,T
wei = q @ k.transpose(-2, -1)

# tokens from the future are set to -inf so softmax gives them zero weight
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)
out = wei @ v  # weighted aggregation of the values


In [17]:
# NOTE(review): `wei` is reassigned by several cells, so what is shown here depends on
# which cell ran last; the intent appears to be the attention-head weights.
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.9456, 0.0544, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.8952, 0.0486, 0.0562, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0223, 0.0651, 0.1234, 0.7892, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0711, 0.0019, 0.0034, 0.0080, 0.9155, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.1353, 0.2752, 0.0972, 0.4712, 0.0141, 0.0000, 0.0000],
         [0.1561, 0.1033, 0.1465, 0.0880, 0.0698, 0.3634, 0.0728, 0.0000],
         [0.4031, 0.0104, 0.0134, 0.0060, 0.5049, 0.0365, 0.0194, 0.0064]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0180, 0.9820, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3105, 0.2458, 0.4437, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0730, 0.3275, 0.2227, 0.3769, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1411, 0.5030, 0.0321, 0.2000, 0.1238, 0.0000, 0.0000, 0.0000],
         [0.0613, 0.555

In [18]:
# NOTE(review): `out` is assigned both by the uniform-averaging cell and by the
# attention-head cell, so what is shown here depends on which one ran last.
out

tensor([[[ 0.1808, -0.0700, -0.3596,  ..., -0.8016,  1.5236,  2.5086],
         [ 0.1349, -0.0798, -0.2852,  ..., -0.6747,  1.5283,  2.3503],
         [ 0.0827, -0.0413, -0.2757,  ..., -0.6677,  1.3859,  2.2030],
         ...,
         [-1.4614,  0.8593,  0.7174,  ..., -0.1654,  0.6562, -0.6048],
         [-0.7645,  0.0571,  0.2274,  ...,  0.7557,  0.4655,  0.5263],
         [-1.0257,  0.8860,  0.5548,  ..., -0.5685,  1.4220,  0.6832]],

        [[ 0.4562, -1.0917, -0.8207,  ...,  0.0512, -0.6576, -2.5729],
         [ 0.0288,  0.9683, -1.2415,  ...,  0.7727, -1.1413,  1.2434],
         [ 1.1232, -0.1891, -0.3211,  ..., -0.0769, -0.0063, -0.8308],
         ...,
         [ 0.2722,  0.8824, -0.5814,  ...,  0.5520, -0.6591,  0.6601],
         [ 0.4160,  0.3460,  0.5326,  ...,  0.1628, -0.0332, -0.0418],
         [ 0.7334,  0.5238,  0.5093,  ..., -0.0491,  0.1059,  0.1174]],

        [[-0.6067,  1.8328,  0.2931,  ...,  1.0041,  0.8656,  0.1688],
         [-0.5217,  1.3545,  0.2291,  ...,  0

## notes on the self attention block
- attention is a communication mechanism. It can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- no notion of space, as opposed to convolutions, attention acts over a set of vectors. positional encodings add in that information
- examples across batch dimensions are processed independently and never communicate  
- the reason for not allowing communication between current tokens and future tokens is the language generation use case. If this were sentiment analysis, we could allow all tokens to communicate with all other tokens to capture the full meaning. In this use case the next-token prediction can only depend on past tokens, so future tokens are masked out in the `wei = wei.masked_fill(tril == 0, float('-inf'))` step
- in the 'Attention is All You Need' paper, the following is the formula:
### Attention(Q, K, V) = softmax((Q*K.transpose)/sqrt(head_size))*V
which is what we have implemented, except that it's missing the scaling. "Scaled" attention additionally multiplies wei by 1/sqrt(head_size) (equivalently, divides it by sqrt(head_size)). When the inputs Q and K (query and key) are unit variance, this keeps wei unit variance too, so softmax stays diffuse and does not saturate too much:

In [21]:
# unit-variance keys and queries: compare the variance of the raw and the scaled dot products
k = torch.randn((B, T, head_size))
q = torch.randn((B, T, head_size))
wei = q @ k.transpose(-2, -1)  # raw affinities

print(k.var())
print(q.var())
print(wei.var())
wei = wei * head_size**-0.5  # scale the same affinities by 1/sqrt(head_size)
print("scaled means the variance goes down to 1: ", wei.var())


tensor(1.0104)
tensor(1.0204)
tensor(17.6841)
tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
scaled means the variance goes down to 1:  tensor(1.1053)
tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])


In [24]:
# if wei has high variance, softmax converges towards a one-hot vector
scores = torch.tensor([.1, -.2, .3, -.2, .5])
print("low variance softmax: ", torch.softmax(scores, dim=-1))
print("high variance softmax: ", torch.softmax(scores*9, dim=-1))
# with high variance, softmax sharpens towards the highest value, and every token then aggregates
# information from essentially a single node (the maximum), which is not what we want when trying
# to build context from a string of tokens

low variance softmax:  tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
high variance softmax:  tensor([0.0228, 0.0015, 0.1382, 0.0015, 0.8359])
