In [1]:
# Start with a dataset to train on. Download the tiny shakespeare dataset
!curl -0 https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1089k  100 1089k    0     0  2925k      0 --:--:-- --:--:-- --:--:-- 2951k


In [2]:
# Load the whole corpus into memory as one string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
# Report how many characters the loaded corpus contains.
print("Lenght of document is {} characters".format(len(text)))

Lenght of document is 1115394 characters


In [4]:
# Peek at the first 200 characters to check the corpus looks as expected.
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [5]:
# Build the vocabularies from the corpus:
#   - bigrams: sorted list of every distinct pair of adjacent characters
#   - chars:   sorted list of every distinct character
bigrams = sorted({left + right for left, right in zip(text, text[1:])})
print("Amount of bigrams found: ", len(bigrams))
chars = sorted(set(text))
vocab_list = len(chars)  # despite its name, this holds the vocabulary *size*
print("Unique characters: ", ''.join(chars))
print("Vocab list size: ", vocab_list)

Amount of bigrams found:  1403
Unique characters:  
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab list size:  65


In [6]:
# Tokenize the raw characters into integers.
# This is a very basic tokenization: each character maps to one integer.
# In practice, sub-word tokenization is often used (e.g. BPE, WordPiece, Unigram).
# That is a more complex process which handles out-of-vocabulary words and trades off
# vocabulary size against sequence length.
string_to_index_map = {ch: i for i, ch in enumerate(chars)}  # character -> index
index_to_char_map = dict(enumerate(chars))  # index -> character


def encode(s):
    """Encode a string as a list of character indices.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [string_to_index_map[c] for c in s]


def decode(l):
    """Decode a list of character indices back into a string."""
    return ''.join(index_to_char_map[i] for i in l)


print (encode("Hello there !"))
print (decode(encode("Hello there !")))

# Bigram alternative.
# The lookup tables are now actually defined: they used to be commented out, so calling
# encode_bigrams / decode_bigrams raised NameError.
string_to_index_map_bigrams = {bg: i for i, bg in enumerate(bigrams)}  # bigram -> index
index_to_bigram_map = dict(enumerate(bigrams))  # index -> bigram


def encode_bigrams(s):
    """Encode a string as the indices of its overlapping bigrams s[0:2], s[1:3], ...

    A string shorter than two characters encodes to an empty list.
    Raises KeyError if a bigram of `s` is not in `bigrams`.
    """
    return [string_to_index_map_bigrams[s[i:i+2]] for i in range(len(s)-1)]


def decode_bigrams(l):
    """Decode a list produced by encode_bigrams back into the original string.

    Consecutive bigrams overlap by one character, so only the first bigram is emitted in
    full; every following bigram contributes just its second character. (Joining whole
    bigrams, as before, duplicated every inner character.)
    """
    if not l:
        return ''
    return index_to_bigram_map[l[0]] + ''.join(index_to_bigram_map[i][1] for i in l[1:])


# Per the original note, encode_bigrams("Hello there !") fails with KeyError because the
# bigram ' !' does not occur in the corpus, so it is not in the bigram vocabulary.

[20, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43, 1, 2]
Hello there !


In [7]:
# Encode the entire corpus and wrap it in a torch tensor; the cells below split this
# tensor into training and validation sets.
import torch
token_ids = encode(text)  # one integer per character
data = torch.tensor(token_ids, dtype=torch.int64)  # torch.int64 is the same dtype as torch.long

# Shape, dtype and tensor type of the encoded corpus
print("{} - {} - {}".format(data.shape, data.dtype, data.type()))
# The first 200 tokens: the same characters that were printed earlier, now as integers
print(data[:200])

torch.Size([1115394]) - torch.int64 - torch.LongTensor
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59])


In [8]:
# Split the encoded corpus: the first 90% is used for training, the remaining 10% is held
# out to evaluate the model on text it has not been trained on.
n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

In [9]:
# The whole dataset is never fed to the model at once; training works on small chunks
# (real datasets are far larger than this one, which makes feeding everything impossible).
block_size = 8 # number of characters to feed to the model at once (context length)
train_data[:block_size+1] # a chunk of block_size+1 tokens holds block_size (context -> next token) examples

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
# Time dimension: a chunk of block_size + 1 tokens contains block_size "next character" examples.
# Named x_example / y_example rather than `input` / `next`, which would shadow the Python
# builtins of the same name for the rest of the kernel session.
x_example = train_data[:block_size]     # the first block_size tokens
y_example = train_data[1:block_size+1]  # the same window shifted one token to the right

# Training on every prefix teaches the model to predict the next character from any
# context length between 1 and block_size, not only from a full block.
for t in range(block_size):
    context = x_example[:t+1]
    target = y_example[t]
    print(f"when input is {context}, target is {target}")

when input is tensor([18]), target is 47
when input is tensor([18, 47]), target is 56
when input is tensor([18, 47, 56]), target is 57
when input is tensor([18, 47, 56, 57]), target is 58
when input is tensor([18, 47, 56, 57, 58]), target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [11]:
# Batch dimension: each training step processes a batch of independent chunks in parallel.
# This speeds training up and makes it more stable.
torch.manual_seed(1337) # set the seed for reproducibility
batch_size = 4 # number of independent sequences to process in parallel
block_size = 8 # maximum context length for predictions

def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    `split` selects the source tensor: 'train' reads `train_data`, any other value reads
    `val_data`. Returns a tuple (x, y) of tensors shaped (batch_size, block_size), where
    y holds the same windows as x shifted one position to the right (the next-token targets).
    Reads the module-level `train_data`, `val_data`, `batch_size` and `block_size`.
    """
    source = val_data if split != 'train' else train_data
    # One random start offset per sequence; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    x = torch.stack([source[start:start + block_size] for start in starts])
    y = torch.stack([source[start + 1:start + block_size + 1] for start in starts])
    return x, y

xb, yb = get_batch('train')
print("Batch input: ", xb.shape, "\n", xb) # shape and content of the input batch
print("Batch target: ", yb.shape, "\n", yb) # shape and content of the target batch

# Spell out every (context -> target) example contained in the batch.
for b in range(batch_size): # batch dimension
    inputs_b, targets_b = xb[b], yb[b]
    for t in range(block_size): # time dimension
        context, target = inputs_b[:t+1], targets_b[t]
        print(f"when input is {(context)}, target is {(target)}")

Batch input:  torch.Size([4, 8]) 
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Batch target:  torch.Size([4, 8]) 
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
when input is tensor([24]), target is 43
when input is tensor([24, 43]), target is 58
when input is tensor([24, 43, 58]), target is 5
when input is tensor([24, 43, 58,  5]), target is 57
when input is tensor([24, 43, 58,  5, 57]), target is 1
when input is tensor([24, 43, 58,  5, 57,  1]), target is 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]), target is 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]), target is 39
when input is tensor([44]), target is 53
when input is tensor([44, 53]), target is 56
when input is tensor([44, 53, 56]), target is 1
when input is tenso

In [12]:
print(xb) # the input batch that will be fed to the model

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [13]:
# Here we use the simplest neural language model: a bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337) # set the seed for reproducibility

class BigramLanguageModel(nn.Module):
    """Bigram language model: predicts the next token from the current token alone.

    The only parameter is an embedding table of size vocab_size x vocab_size; row i holds
    the logits over the next token given that the current token is i.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size, vocab_size) lookup table of next-token logits."""
        super().__init__()
        # Each token directly reads off the logits for the next token via a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token indices.
            targets: optional (B, T) tensor of next-token indices.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and loss is None.
            With targets, logits is returned flattened to (B*T, C) and loss is a scalar tensor.
        """
        logits = self.token_embedding_table(idx) # (Batch, Time, Channels)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) targets, so flatten batch and time
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # (Batch * Time, Channels): 2-dimensional
            targets = targets.view(B*T) # (Batch * Time): 1-dimensional
            loss = F.cross_entropy(logits, targets) # how badly the logits predict the targets

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of token indices, the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: the original context followed by the samples.
        """
        # Sampling is inference only, so skip autograd bookkeeping
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # get the predictions (no targets, hence no loss to use)
                logits, _unused_loss = self(idx)
                # focus on the last time step
                logits = logits[:, -1, :] # becomes (B, C) where C is the vocab size
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx
    
m = BigramLanguageModel(vocab_list) # create the model
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Generate 100 new tokens, starting from a context made of the single index 0
# (the first character in the vocab).
start_context = torch.zeros((1, 1), dtype=torch.long) # 1 by 1 integer tensor
sampled_ids = m.generate(start_context, max_new_tokens=100)[0].tolist()
print(decode(sampled_ids))
# The result is gibberish because the model is still randomly initialized. It also ignores
# the history: only the last character of the input is used to pick the next one.

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [14]:
# Create a PyTorch optimizer that will update the model parameters
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) # AdamW with a learning rate of 1e-3. Small models tolerate higher learning rates; bigger models usually need smaller ones

In [15]:
batch_size = 32 # get_batch reads this global, so batches are larger from here on
max_steps = 10000 # the more steps, the more the model learns

# Typical training loop
for steps in range(max_steps):

    # Sample a batch of data. The targets are bound to `yb` (previously the typo `xy`) so that
    # xb and yb always refer to the same batch instead of leaving a stale yb behind.
    xb, yb = get_batch('train')

    # Evaluate the loss (it should be low, but not so low that the model just copies the training data)
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True) # clear the gradients from the previous step
    loss.backward() # backward pass to compute the gradients
    optimizer.step() # update the model parameters using the gradients

print(loss.item()) # loss on the last training batch

2.382369041442871


In [16]:
# Now we can generate new text with the trained model. It will not be perfect, because the
# tokens do not "talk" to each other: only the last character is used for each prediction.
start_context = torch.zeros((1, 1), dtype=torch.long)
generated_ids = m.generate(start_context, max_new_tokens=500)[0].tolist()
print(decode(generated_ids))


lso br. ave aviasurf my, yxMPZI ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofof win!
RIfans picspeserer hee tha,
TOFonk? me ain ckntoty ded. bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
BEY:! Indy; by s afreanoo adicererupa anse tecorro llaus a!
OLeneerithesinthengove fal amas trr
TI ar I t, mes, n IUSt my w, fredeeyove
THek' merer, dd
We ntem lud engitheso; cer ize helorowaginte the?
Thak orblyoruldvicee chot, p,
Bealivolde Th li


# The mathematical trick in self-attention

In [17]:
# Consider the following toy example: a small random batch to experiment on.

torch.manual_seed(1337) # set the seed for reproducibility
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B, T, C) # random input tensor
x.shape

torch.Size([4, 8, 2])

In [18]:
# We want each token to gather information about its past, i.e. the previous tokens (never the future ones).
# A very basic (and lossy) way to do this is to average the current token with all the previous ones:
# a running mean over the time dimension.
xbow = torch.zeros((B, T, C)) # x "bag of words": stores the running mean for every position
for b in range(B): # batch dimension
    for t in range(T): # time dimension
        xprev = x[b,:t+1] # (t+1, C) => every token up to and including time t for batch element b
        xbow[b, t] = torch.mean(xprev, 0) # average over the time dimension
# Row 0 equals the input row 0, row 1 is the average of the first two tokens, etc.
# (The bare `x[0]` expression that used to sit here was never displayed and has been removed.)
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [19]:
# The double loop above is easy to read but slow; the same running mean can be computed
# with a single matrix multiplication, so let's vectorize it.
wei = torch.tril(torch.ones(T,T)) # lower-triangular ones: row t selects tokens 0..t
wei = wei / wei.sum(1, keepdim=True) # normalize the rows so each one averages instead of summing
xbow2 = wei @ x # (T, T), broadcast over the batch, @ (B, T, C) => (B, T, C)
# Actually enforce the equivalence: the result of torch.allclose used to be discarded.
# A small absolute tolerance absorbs float32 rounding differences between the two methods.
assert torch.allclose(xbow, xbow2, atol=1e-6)
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [20]:
# Version 3: get the same averaging weights out of a softmax.
tril = torch.ones(T, T).tril() # lower-triangular matrix of ones
# Start from all-zero scores (more interesting later, when the scores become data-dependent),
# then set every "future" position to -inf so that softmax gives it a weight of exactly 0:
# a token cannot receive information from tokens that come after it.
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = torch.softmax(wei, dim=-1) # each row becomes uniform weights over the allowed positions
xbow3 = torch.matmul(wei, x) # (T, T), broadcast over the batch, @ (B, T, C) => (B, T, C)
xbow[0], xbow3[0] # display both to compare them

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [21]:
torch.tril(torch.ones(3,3)) # keep the lower-triangular part of a matrix (diagonal included), zero the rest

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [22]:
# The loop version is slow because of the Python for loops. Toy example of the trick:
# multiplying by a lower-triangular matrix of ones yields running sums over the rows of b.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
b = torch.randint(0, 10, (3, 2)).to(torch.float32) # random integers in [0, 10), 3 rows, 2 columns
c = torch.matmul(a, b) # matrix multiplication
print(f"a={a}\nb={b}\nc={c}")

a=tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
b=tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [23]:
# Same toy example, but with each row of `a` normalized to sum to 1, so the product
# yields running averages instead of running sums.
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
row_totals = a.sum(dim=1, keepdim=True) # number of ones in each row
a = a / row_totals # normalize the rows to get the average
b = torch.randint(0, 10, (3, 2)).float() # random integers in [0, 10), 3 rows, 2 columns
c = a @ b # matrix multiplication
print(f"a={a}\n--\nb={b}\n--\nc={c}")

a=tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [24]:
# version 4: self-attention !
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B, T, C)

# Let's see a single head perform self-attention.
# Each token emits three vectors:
#   query - "here is what I am interested in"
#   key   - "here is what I have"
#   value - "if you find me interesting, here is what I will communicate to you"
head_size = 16
key = nn.Linear(C, head_size, bias=False) # key projection
query = nn.Linear(C, head_size, bias=False) # query projection
value = nn.Linear(C, head_size, bias=False) # value projection
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
v = value(x) # (B, T, 16)

# No communication has happened so far; the affinities below are where tokens first interact.
# (B, T, 16) @ (B, 16, T) => (B, T, T): entry [b, i, j] is the dot product of query i with key j
wei = q @ k.transpose(-2, -1)

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # hide the future tokens from each position
wei = F.softmax(wei, dim=-1) # each row becomes weights that sum to 1

# Aggregate the values, not the raw x. The earlier `out = wei @ x` was overwritten before
# it was ever used, so it has been removed.
out = wei @ v

out.shape

torch.Size([4, 8, 16])

Notes:

- Attention is a __communication mechanism__. It can be seen as nodes in a directed graph looking at each other and aggregating information, via a weighted sum with data-dependent weights, from all the nodes that point to them.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across the batch dimension is of course processed completely independently; examples never "talk" to each other.
- In an "encoder" attention block, just delete the single line that does the masking with tril, allowing all tokens to communicate. The block here is called a "decoder" attention block because it has triangular masking and is usually used in autoregressive settings, like language modeling. (You would remove the line `wei = wei.masked_fill(tril == 0, float('-inf'))`, which hides the future tokens from the current one.)
- "Self-attention" is called such because the keys, queries and values all come from the same source (x), but in principle attention can be more general. "Cross-attention" would be used when there is a separate pool of nodes from which we want to pool information into our nodes.
- "Scaled" attention additionally multiplies wei by 1/sqrt(head_size) (i.e. divides it by sqrt(head_size)). This makes it so that when the inputs Q, K have unit variance, wei has unit variance too, so softmax stays diffuse and does not saturate too much. Since wei feeds into softmax, it is important for it to be fairly diffuse (especially at initialization); otherwise softmax converges toward a one-hot vector.

In [25]:
# Layer normalization is a technique to normalize the activations of a layer in a neural network.
class LayerNorm1d:
    """Layer normalization over dimension 1 with a learnable scale and shift.

    Every slice along dimension 1 (for a (batch, features) input: every sample's feature
    vector) is normalized to zero mean and unit variance, then scaled by `gamma` and
    shifted by `beta`. Nothing is computed across the batch dimension.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        """Create the layer.

        Args:
            dim: size of the normalized dimension (length of gamma and beta).
            eps: small constant added to the variance for numerical stability.
            momentum: accepted for signature compatibility only; it is not used.
        """
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalize `x` along dimension 1; the result is also stored in `self.out`."""
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True) # mean over dimension 1 (the features), not over the batch
        xvar = x.var(1, keepdim=True, unbiased=False) # biased variance over dimension 1
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to zero mean and unit variance
        self.out = self.gamma * xhat + self.beta # scale and shift

        return self.out

    def parameters(self):
        """Return the parameters as the list [gamma, beta]."""
        return [self.gamma, self.beta]


# Backward-compatible alias: the class used to be named BatchNorm1d even though it
# normalizes over dimension 1 (layer norm) rather than over the batch dimension.
BatchNorm1d = LayerNorm1d
    
# Try the normalization layer on a random batch.
torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100) # 32 vectors of 100 features each
out = module(x) # forward pass
x.shape

torch.Size([32, 100])