In [57]:
# Download the tiny Shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [58]:
# Load the whole tiny Shakespeare corpus into memory as a single string
with open("input.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [59]:
# Number of characters
len(text)

1115394

In [60]:
print(text[:300])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us


In [61]:
# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


So the model will be a character-level language model.

In [62]:
for i, ch in enumerate(chars):
    print(i, ' - ', ch)

0  -  

1  -   
2  -  !
3  -  $
4  -  &
5  -  '
6  -  ,
7  -  -
8  -  .
9  -  3
10  -  :
11  -  ;
12  -  ?
13  -  A
14  -  B
15  -  C
16  -  D
17  -  E
18  -  F
19  -  G
20  -  H
21  -  I
22  -  J
23  -  K
24  -  L
25  -  M
26  -  N
27  -  O
28  -  P
29  -  Q
30  -  R
31  -  S
32  -  T
33  -  U
34  -  V
35  -  W
36  -  X
37  -  Y
38  -  Z
39  -  a
40  -  b
41  -  c
42  -  d
43  -  e
44  -  f
45  -  g
46  -  h
47  -  i
48  -  j
49  -  k
50  -  l
51  -  m
52  -  n
53  -  o
54  -  p
55  -  q
56  -  r
57  -  s
58  -  t
59  -  u
60  -  v
61  -  w
62  -  x
63  -  y
64  -  z


In [63]:
# Lookup tables between characters and integer ids (the id is the
# character's position in `chars`)
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

In [64]:
for c in 'Hello World':
    print(stoi[c])

20
43
50
50
53
1
35
53
56
50
42


In [65]:
# Creating the encoder and the decoder
def encode(s):
    """Map a string to a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of integer token ids back to the string they spell.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)

In [66]:
print(encode('Hello World'))

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42]


The code above is our naive character-level tokenizer: each character maps to exactly one integer id.

Now we encode the entire text dataset and store it in a torch tensor.

In [67]:
import torch

# Encode the whole corpus once and keep it as a tensor of int64 (torch.long) ids
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)
print(data[:300])  # ids of the first 300 characters

torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57

Splitting the dataset

In [68]:
# First 90% of the tokens are used for training, the remainder for validation
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

data loader: batches of chunks of data

In [69]:
block_size = 8

train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [70]:
# Every prefix of the window is a training example for the token that follows it
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t, target in enumerate(y):
    context = x[:t+1]
    print(f'{context} -> {target}')

tensor([18]) -> 47
tensor([18, 47]) -> 56
tensor([18, 47, 56]) -> 57
tensor([18, 47, 56, 57]) -> 58
tensor([18, 47, 56, 57, 58]) -> 1
tensor([18, 47, 56, 57, 58,  1]) -> 15
tensor([18, 47, 56, 57, 58,  1, 15]) -> 47
tensor([18, 47, 56, 57, 58,  1, 15, 47]) -> 58


In [71]:
torch.manual_seed(1337)

batch_size = 4 
block_size = 8

def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        (x, y): two (batch_size, block_size) tensors, where y holds the
        tokens one position after the corresponding tokens of x.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per row; the upper bound keeps the shifted
    # target window inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # Broadcast (batch_size, 1) starts against (block_size,) offsets to get
    # every window position at once.
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    x = source[positions]
    y = source[positions + 1]
    return x, y

# Draw one training batch and show the raw input / target tensors
xb, yb = get_batch('train')
print('inputs')
print(xb.shape)
print(xb)

print('targets')
print(yb.shape)
print(yb)

print('__________')

# Spell out every (context -> target) example contained in the batch:
# each row contributes block_size examples, one per prefix length.
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'{context.tolist()} -> {target}')


inputs
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
__________
[24] -> 43
[24, 43] -> 58
[24, 43, 58] -> 5
[24, 43, 58, 5] -> 57
[24, 43, 58, 5, 57] -> 1
[24, 43, 58, 5, 57, 1] -> 46
[24, 43, 58, 5, 57, 1, 46] -> 43
[24, 43, 58, 5, 57, 1, 46, 43] -> 39
[44] -> 53
[44, 53] -> 56
[44, 53, 56] -> 1
[44, 53, 56, 1] -> 58
[44, 53, 56, 1, 58] -> 46
[44, 53, 56, 1, 58, 46] -> 39
[44, 53, 56, 1, 58, 46, 39] -> 58
[44, 53, 56, 1, 58, 46, 39, 58] -> 1
[52] -> 58
[52, 58] -> 1
[52, 58, 1] -> 58
[52, 58, 1, 58] -> 46
[52, 58, 1, 58, 46] -> 39
[52, 58, 1, 58, 46, 39] -> 58
[52, 58, 1, 58, 46, 39, 58] -> 1
[52, 58, 1, 58, 46, 39, 58, 1] -> 46
[25] -> 17
[25, 17] -> 2

Simplest LLM - Bigram

In [72]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram character-level language model.

    Each token id indexes one row of a (vocab_size, vocab_size) embedding
    table, and that row is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i holds the next-token logits given that the current token is i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the scalar cross-entropy.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices,
        # so the batch and time dimensions are merged.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        # reshape (not view) so non-contiguous targets also work
        targets = targets.reshape(B*T)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Runs under `torch.no_grad()`: sampling never needs gradients, so no
        autograd graph is built during generation.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: `idx` with the samples appended.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)  # runs forward(); logits is (B, T, C)
            logits = logits[:, -1, :]  # keep only the last time step -> (B, C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    
# Instantiate the model and run one forward pass (with targets, so a loss
# is returned) on the batch sampled earlier
m = BigramLanguageModel(vocab_size=vocab_size)
logits, loss = m(xb, yb)


In [73]:
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [74]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100).tolist()[0]))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


Training the bigram model

In [75]:
# Creating the optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

Run the training cell below several times to get better results: the model and the optimizer keep their state between runs, so each run continues the training.

In [76]:
batch_size = 32

# 10,000 optimisation steps, each on a freshly sampled training batch
for steps in range(10000):
    # Drop the gradients of the previous step before computing new ones
    optimizer.zero_grad(set_to_none=True)

    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    loss.backward()
    optimizer.step()

# Loss of the last mini-batch only
print(loss.item())

2.5727508068084717


In [77]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300).tolist()[0]))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht 


The results above look much better.

However, each prediction is based only on the single previous character, so the tokens are not yet talking to each other.

___________________________________________

Starting the transformer based approach (self-attention)

*Let's make the tokens talk to each other*

In [103]:
import torch

torch.manual_seed(1337)

B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [104]:
# x bag of words: xbow[b, t] is the mean of x[b, 0..t] (inclusive)
xbow = torch.zeros(B, T, C)
for b, sequence in enumerate(x):
    for t in range(T):
        # average over the first t+1 time steps -> (C,)
        xbow[b, t] = sequence[:t+1].mean(dim=0)

In [105]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [106]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

There is a mathematical trick that makes the loop above efficient: a matrix multiply with a row-normalized lower-triangular matrix built with torch.tril.

In [107]:
# Toy example: a row-normalised lower-triangular matrix turns a matmul
# into a running average over the rows of b.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

for matrix in (a, b, c):
    print(matrix)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


Here is the trick applied below

In [108]:
# Version 2: the running mean written as a single matrix multiply
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)  # (T, T) @ (B, T, C) -> (B, T, C)
xbow2[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [110]:
torch.allclose(xbow[0], xbow2[0])

True

In [113]:
# Version 3: the same causal average, obtained from a masked softmax.
# Zero scores with -inf on future positions become uniform weights over
# the current and past positions.
tril = torch.ones(T, T).tril()
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf')).softmax(dim=-1)
xbow3 = torch.matmul(wei, x)
torch.allclose(xbow2, xbow3)

True

So far, each position's output has been the plain average of the current token and all the past tokens, so the new representation depends equally on every token up to and including the current one.

Of course, uniform averaging is a very naive way to use the previous information.

That is why self-attention comes in.

Roughly speaking, self-attention lets each token interact with all the tokens it is allowed to see and choose the ones that are most interesting to it. So the weight matrix is no longer uniform: it has different, data-dependent values for different tokens.

In [114]:
# Version 3 again, on wider inputs (C = 32): the weights start from zeros,
# so after masking + softmax they are still a uniform causal average
# (no queries or keys yet).
torch.manual_seed(1337)

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
out = wei @ x

out.shape

torch.Size([4, 8, 32])

In [115]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

Every single token at each position will emit 2 vectors: 1. Query, 2. Key.

- The query vector is "what am I looking for".
- The key vector is "what do I contain".

To get the affinities between the tokens in a sequence, we take dot products between queries and keys. My query is dotted with the keys of all the tokens, and those dot products become "wei". If a key and my query are aligned (their dot product is large), I get to learn more from that token than from any other token in the sequence.

In [116]:
# Version 4: self-attention!
torch.manual_seed(1337)

B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

# Single head self attention
head_size = 16
# bias=False makes each layer a plain matrix multiply (no additive offset)
key = nn.Linear(C, head_size, bias=False) 
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# Project every token independently into its key and its query vector
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)

# Communication happens here: every query is dotted with every key
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -> (B, T, T) 

tril = torch.tril(torch.ones(T, T))
# Unlike the averaging versions, wei is not initialised to zeros here:
# it holds the data-dependent query-key scores computed above.

# Causal mask: a token may attend to itself and to the previous tokens,
# never to future tokens (-inf becomes a weight of 0 after the softmax).
wei = wei.masked_fill(tril==0, float('-inf')) 
wei = F.softmax(wei, dim=-1)

# Aggregate the value projections of the tokens rather than the raw x
v = value(x) # (B, T, 16)
out = wei @ v # (B, T, T) @ (B, T, 16) -> (B, T, 16)

out.shape

torch.Size([4, 8, 16])

In [118]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

Important normalization on "Scaled" attention

In [141]:
# Random keys and queries drawn from a standard normal distribution
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
# Scale the raw scores by 1/sqrt(head_size)
scale = head_size**-0.5
wei = torch.matmul(q, k.transpose(-2, -1)) * scale

In [142]:
k.var()

tensor(1.0204)

In [143]:
q.var()

tensor(1.0632)

In [144]:
wei.var()

tensor(0.9144)

As can be seen above, with the `* head_size**-0.5` factor the variance of wei stays close to 1; without it, the variance would be on the order of head_size (16 here). Why is this important? Because wei is fed into softmax, and this matters especially at initialization.

In [146]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [148]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

When wei contains very large positive and negative values, the softmax output converges towards a one-hot vector (as shown above): the softmax sharpens towards the max. Each token then no longer aggregates information from the other tokens; it effectively uses a single token (the one with the highest score).

_____________________________________________________________

# Deep Residual Learning for Image Recognition

This residual (skip) connection also appears in transformers.

A skip connection adds the previous features to the transformed data.
You are free to fork off the residual pathway, perform some computation, and then project back onto the residual pathway via addition. You go from the inputs to the targets only via plus and plus and plus.

Addition distributes gradients equally to both of the branches that feed into it. So the gradients from the loss hop through every addition node all the way to the input, and also fork off into the residual blocks. Basically, you have a gradient superhighway that goes directly from the supervision all the way to the input, unimpeded.

These residual blocks are usually initialized to contribute very, very little to the residual pathway. So at initialization it is as if they were almost not there, but during the optimization they come online over time and start to contribute. At initialization, the gradient flows unimpeded straight from the supervision to the input, and the blocks kick in over time.

This dramatically helps with the optimization.


# End of the coding notes

What we implemented is a decoder-only transformer: we have self-attention and feed-forward layers, but no cross-attention. The reason is that we are just generating text that is not conditioned on anything.

The original "Attention Is All You Need" paper uses an encoder because it was a machine translation model: the encoder reads the additional information (the sentence to translate), and cross-attention in the decoder uses it to condition the word generation.


