In [1]:
# Read the whole training corpus into one in-memory string and report its length in characters.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print(len(text))

1115394


In [3]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [4]:
# Lookup tables between characters and integer ids; ids follow the order of `chars`.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a list of integer ids back to the string they spell."""
    return "".join(itos[i] for i in l)


encode("hello world")

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]

In [14]:
import torch
# Use the MPS backend when this PyTorch build reports it as available; otherwise stay on the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# Encode the whole corpus once and keep it on the selected device as int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long, device=device)

In [15]:
# Train/validation split: the first 90% of the tokens train the model, the remaining 10% are held out.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [16]:
block_size = 8
# Targets are the inputs shifted one position ahead, so y[t] is the token that follows x[:t+1].
x = train_data[:block_size]
y = train_data[1:block_size+1]
# One window of block_size tokens therefore yields block_size (context, target) pairs.
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f'Context: {context}, Target: {target}')


Context: tensor([18], device='mps:0'), Target: 47
Context: tensor([18, 47], device='mps:0'), Target: 56
Context: tensor([18, 47, 56], device='mps:0'), Target: 57
Context: tensor([18, 47, 56, 57], device='mps:0'), Target: 58
Context: tensor([18, 47, 56, 57, 58], device='mps:0'), Target: 1
Context: tensor([18, 47, 56, 57, 58,  1], device='mps:0'), Target: 15
Context: tensor([18, 47, 56, 57, 58,  1, 15], device='mps:0'), Target: 47
Context: tensor([18, 47, 56, 57, 58,  1, 15, 47], device='mps:0'), Target: 58


In [17]:
torch.manual_seed(1337)
batch_size = 4  # number of sequences sampled per batch
block_size = 8  # number of tokens in each sequence


def get_batch(split):
    """Draw a random batch from the training data for 'train', else from the validation data.

    Returns a pair (inputs, targets), each of shape (batch_size, block_size);
    targets holds the same windows as inputs shifted one token ahead.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # randint's upper bound is exclusive, so the shifted target window always fits in `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
    return inputs, targets


xb, yb = get_batch('train')
xb, yb

(tensor([[24, 43, 58,  5, 57,  1, 46, 43],
         [44, 53, 56,  1, 58, 46, 39, 58],
         [52, 58,  1, 58, 46, 39, 58,  1],
         [25, 17, 27, 10,  0, 21,  1, 54]], device='mps:0'),
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
         [53, 56,  1, 58, 46, 39, 58,  1],
         [58,  1, 58, 46, 39, 58,  1, 46],
         [17, 27, 10,  0, 21,  1, 54, 39]], device='mps:0'))

In [22]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one (vocab_size, vocab_size) embedding table; the row
    looked up for a token holds the unnormalized scores of every possible
    next token.
    """

    def __init__(self, vocab_size, device=None):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct tokens; used for both table dimensions.
            device: device on which to allocate the table. None (the default)
                lets PyTorch use its default device.
        """
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, vocab_size, device=device)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer token ids with B*T elements.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C), the
            layout F.cross_entropy expects, and loss is the mean cross-entropy.
        """
        logits = self.embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Tensor of shape (B, T + max_new_tokens) containing idx followed by
            the sampled continuation.
        """
        # Sampling never needs gradients; disabling autograd avoids building
        # a throwaway graph at every step.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, _loss = self(idx)
                # Only the last position predicts the next token.
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Instantiate the untrained model and push one batch through it as a sanity check.
m = BigramLanguageModel(vocab_size, device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Generate 100 tokens starting from a single token with id 0 and decode them to text.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
sample = m.generate(idx, max_new_tokens=100)
print(decode(sample[0].tolist()))

torch.Size([32, 65])
tensor(4.7951, device='mps:0', grad_fn=<NllLossBackward0>)

tasU3;UdYMfPfxcVkPwQtRe'
LOOUfZE&e'zyjQQ-Vak,VCOIi'eg Kbgqu
pyDrrvMLVOANyDXOfHdng&?of!OA3rFoMZyL:DL&


In [23]:
# Adam optimizer over every parameter of the model `m`, with a learning rate of 1e-3.
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [25]:
batch_size = 32  # get_batch reads this global, so training batches are larger than the earlier demo batch
for steps in range(10000):
    # Sample a fresh random training batch.
    xb, yb = get_batch('train')

    # Clear old gradients, then run forward, backward and the parameter update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# Loss of the final batch only, not an average over the run.
print(loss.item())

2.4112563133239746


In [27]:

# Sample 300 new tokens from the trained model, starting from `idx`, and decode them to text.
generated = m.generate(idx, max_new_tokens=300)
print(decode(generated[0].tolist()))


ARICUCK:
IONTouly me Edjerk mondrn itheland's oe, oghithet f, badogienthofathatey foueD: wad,
ureisold array n
ICoyockind m murs, in mamybalorthyongmyoorord Vofetthindy st
HBy l brveseay alsteanerm to, oupomp! wee d pre h, gavitfithrerean thoms lathind my d erouerse IOLUEDid nghathicerire.F n IS:
Yo


In [28]:
# self attention
# Toy input for the averaging demos below: B sequences, T positions each, C channels per position.
B = 4
T = 8
C = 2
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [33]:
# Loop-based reference: xbow[b, t] is the mean of x[b, 0..t], so every position
# averages itself with all positions before it.
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]
        xbow[b, t] = xprev.mean(dim=0)

In [32]:
torch.manual_seed(42)
# Row i of a lower-triangular matrix of ones selects elements 0..i of whatever it multiplies.
# Dividing each row by its sum turns that selection into an average, so `a @ b` yields
# the running means of b's rows: row 0 alone, rows 0-1, then all three rows.
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)
a, b, c

(tensor([[1.0000, 0.0000, 0.0000],
         [0.5000, 0.5000, 0.0000],
         [0.3333, 0.3333, 0.3333]]),
 tensor([[2., 7.],
         [6., 4.],
         [6., 5.]]),
 tensor([[2.0000, 7.0000],
         [4.0000, 5.5000],
         [4.6667, 5.3333]]))

In [35]:
# Same running average as the loop version, done as one matrix multiply:
# the (T, T) weights broadcast across the batch dimension of x.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)
torch.allclose(xbow, xbow2)

True

In [46]:
# self attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# attention head: three bias-free projections of the same input
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
# value is what each token passes forward once the attention weights are known
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x)

# Affinity of every query with every key: how much each token matters to each other token.
# These become the weights of a weighted average over the values.
# Scaling by 1/sqrt(head_size) keeps the scores near unit variance, so the softmax stays
# diffuse instead of being pushed towards 0/1 weights.
scores = torch.matmul(q, k.transpose(-2, -1))  # (B, T, 16) @ (B, 16, T) -> (B, T, T)
wei = scores * head_size**-0.5

# Mask out the strict upper triangle so a position cannot attend to later positions.
# (Starting from wei = torch.zeros(T, T) instead of the scores would give a uniform average.)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
out = torch.matmul(wei, v)



Attention is position-agnostic, so we have to add positional encodings
Attention is a flow of information between nodes, aggregate info via a weighted sum
In autoregressive self-attention, each node is pointed to by its previous nodes and itself
But attention could happen in any such directed graph
Each example in the batch is processed independently; this means the computation parallelizes very well
Models like BERT let every token attend to every other token, including future ones (encoder block): just drop the masking step that zeroes out the upper triangle
But obviously you can't use that to generate tokens (decoder block)
cross attention: keys, queries, and values produced from different places, instead of all from the same place
Encoder-decoder transformers take keys and values from other places like other encoder blocks

In [47]:
# Attention weights of the first batch element: row t holds the weights position t assigns to
# positions 0..t; later positions were masked to -inf before the softmax, so they get weight 0.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [52]:

class LayerNorm1d:
    """Layer normalization over the last (feature) dimension.

    Each feature vector is shifted to zero mean and scaled to unit variance
    independently of the other samples, then scaled and shifted by the
    per-feature parameters gamma and beta.

    Note: the variance uses torch's default (unbiased) estimator, whereas
    torch.nn.LayerNorm uses the biased one, so results differ slightly.
    """

    def __init__(self, dim, device, eps=1e-5, momentum=0.1):
        """
        Args:
            dim: size of the feature dimension being normalized.
            device: device on which gamma, beta and the buffers are created.
            eps: small constant added to the variance for numerical stability.
            momentum: unused; accepted only so existing callers keep working.
        """
        self.eps = eps

        # params
        self.gamma = torch.ones(dim, device=device)
        self.beta = torch.zeros(dim, device=device)

        # running vals: never read or updated by __call__ (layer norm keeps no
        # running statistics); retained so existing attribute access keeps working
        self.running_mean = torch.zeros(dim, device=device)
        self.running_var = torch.ones(dim, device=device)

    def __call__(self, x):
        """Normalize x of shape (..., dim) along its last dimension and return the result."""
        # Reduce over the last axis, the one gamma/beta broadcast against.
        # For 2-D input this is the same as axis 1; for (B, T, C) input it
        # avoids normalizing across the T axis.
        xmean = x.mean(-1, keepdim=True)
        xvar = x.var(-1, keepdim=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        return self.out

    def parameters(self):
        """Return the trainable tensors [gamma, beta]."""
        return [self.gamma, self.beta]

# Smoke test: normalize a random (32, 100) batch with the LayerNorm1d class defined in this cell.
# (The previous name, BatchNorm1d, is not defined anywhere in this notebook and raised
# NameError on a fresh kernel.)
module = LayerNorm1d(100, device)
x = torch.randn(32, 100, device=device)
x = module(x)
 

In [53]:
# Mean/std of feature 0 across the 32 samples: not standardized, because the normalization runs per row.
x[:,0].mean(), x[:,0].std()

(tensor(0.1834, device='mps:0'), tensor(1.0442, device='mps:0'))

In [54]:
# Mean/std of sample 0 across its 100 features: ~0 and ~1, since each row is normalized independently.
x[0,:].mean(), x[0,:].std()

(tensor(4.7684e-09, device='mps:0'), tensor(1.0000, device='mps:0'))