# Understanding Transformers 

We will follow along with Andrej Karpathy's tutorial to understand the transformer architecture and create an LLM on the Tiny Shakespeare dataset: "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

1. We will use the PyTorch library.
2. We only work with chunks of the data at a time (batch training). The maximum chunk length is usually called the block size or context length.


In [1]:
# Read the entire corpus into memory as one string so that we can inspect it
with open("input.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
    

In [2]:
# Show the corpus size in characters, followed by its first 1000 characters
print(len(text), text[:1000], sep="\n")

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for re

### 1. Tokenize and Encode

In [3]:
# Collect every distinct character of the input "text"; sorting gives each
# character a stable position, which later serves as its integer id
chars = sorted(set(text))

# The vocabulary size is the number of unique tokens (characters, in this case)
vocab_size = len(chars)

print("".join(chars))
print(vocab_size)  # 65


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [4]:
# Create an encoder and a decoder for a character-level tokenizer
# (OpenAI has the tiktoken library and Google has the sentencepiece library for tokenizing)

# char -> int lookup table (used to encode)
stoi = {ch: i for i, ch in enumerate(chars)}

# int -> char lookup table (used to decode)
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids of the characters of string *s*.

    Raises KeyError if *s* contains a character that is not in the vocabulary.
    """
    return [stoi[ch] for ch in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids *l*.

    Raises KeyError if an id is not in the vocabulary.
    """
    return "".join(itos[i] for i in l)


print(encode("What's up?"))

[35, 46, 39, 58, 5, 57, 1, 59, 54, 12]


In [5]:
# Import the PyTorch library, encode the dataset and save it in the form of a tensor
import torch


# Run the whole corpus through the character-level encoder and keep the
# resulting ids as integers (torch.long)
data = torch.tensor(encode(text), dtype = torch.long) #saves the data in the form of a 1D tensor

data.shape #torch.Size([1115394])



torch.Size([1115394])

### 2. Train test Split

In [6]:
# Hold out the final 10% of the data for validation; the first 90% is the training data.
# We do not want the model to memorize this data, but rather to produce Shakespeare-like text.

n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

### 3. Create Batches of Data

In [7]:
# We only work with chunks of the data at a time (batch training). The max length is usually called block size or context length
block_size = 8
# Look at block_size + 1 tokens: each of the block_size inputs needs a following token as its target
train_data[: block_size + 1] 

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [8]:
# A sampled chunk of data packs multiple training examples into one:
# a chunk of block_size + 1 (here 9) characters contains block_size (here 8) examples.
# This lets the model make predictions from contexts as short as one character
# and as long as block_size characters.

x = train_data[:block_size]
y = train_data[1:block_size + 1]  # shifted by one: y[t] is the token that follows x[t]

for t, target in enumerate(y):
    context = x[:t + 1]
    print(f"When input is {context}, the target {target}.")

When input is tensor([18]), the target 47.
When input is tensor([18, 47]), the target 56.
When input is tensor([18, 47, 56]), the target 57.
When input is tensor([18, 47, 56, 57]), the target 58.
When input is tensor([18, 47, 56, 57, 58]), the target 1.
When input is tensor([18, 47, 56, 57, 58,  1]), the target 15.
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target 47.
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target 58.


In [9]:
# Fix the random seed so that the randomly sampled batches are reproducible
torch.manual_seed(1337)
batch_size = 4 #Number of training sequences to run in parallel
block_size = 8 #Max length of the sequence

def get_batch(split):
    """Sample a random mini-batch of input and target sequences.

    Args:
        split: "train" selects train_data; any other value selects val_data.

    Returns:
        A tuple (x, y) of tensors, each stacked from batch_size slices of
        length block_size (both read from the module-level globals).
        y is x shifted by one position, so y[b, t] is the token that
        follows x[b, t] in the data.
    """
    source = val_data if split != "train" else train_data
    # Random start offsets, bounded so that the shifted target slice stays inside the data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = [source[s:s + block_size] for s in starts]
    targets = [source[s + 1:s + block_size + 1] for s in starts]
    return torch.stack(inputs), torch.stack(targets)


# batch_size * block_size = 4 * 8 = 32 examples are packed in a tensor of size batch_size X block_size
xb, yb = get_batch("train")
print("Inputs:", xb, sep="\n")  # Input to our transformer
print("Targets:", yb, sep="\n")

# Spell out every (context, target) pair contained in the batch
for i in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[i, :t + 1], yb[i, t]
        print(f"When input is {context.tolist()}, the target {target}.")


Inputs:
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
When input is [24], the target 43.
When input is [24, 43], the target 58.
When input is [24, 43, 58], the target 5.
When input is [24, 43, 58, 5], the target 57.
When input is [24, 43, 58, 5, 57], the target 1.
When input is [24, 43, 58, 5, 57, 1], the target 46.
When input is [24, 43, 58, 5, 57, 1, 46], the target 43.
When input is [24, 43, 58, 5, 57, 1, 46, 43], the target 39.
When input is [44], the target 53.
When input is [44, 53], the target 56.
When input is [44, 53, 56], the target 1.
When input is [44, 53, 56, 1], the target 58.
When input is [44, 53, 56, 1, 58], the target 46.
When input is [44, 53, 56, 1, 58, 46], the target 3

## Modelling

### i. Bigram Model
The embedding matrix in the bigram model is a vocab_size × vocab_size matrix. For a given input token, the model simply plucks out the corresponding row of the embedding table. The values in that row are the logits (unnormalized scores) of each vocabulary element coming next; applying a softmax turns them into probabilities.

The predictions are based solely on the current token.

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed so that the random steps below (such as sampling during generation) are reproducible
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next token is predicted from the current token only.

    The model is a single (vocab_size, vocab_size) embedding table; the row
    looked up for a token holds the logits of every possible next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the cross-entropy between logits and targets.
        """
        # Every token id is replaced by its row of C = vocab_size logits
        logits = self.token_embedding_table(idx)  # (B, T, C)

        # Identity check: "== None" would go through the tensor's __eq__ instead
        if targets is None:
            return logits, None

        # F.cross_entropy expects the class dimension second, so we merge the
        # batch and time dimensions: logits become (B*T, C), targets (B*T,)
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)  # -log likelihood loss

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        The whole running context is fed to the model even though the bigram
        model only needs the last token; this keeps the function unchanged
        for more complicated models.

        Args:
            idx: (B, T) tensor of token ids in the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            # Get the predictions; no targets are passed, so the loss is None
            logits, _ = self(idx)

            # Keep only the last time step, which predicts the next token
            logits = logits[:, -1, :]  # (B, C)

            # Apply softmax to get the probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)

            # Sample one token per sequence from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx


            
            
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
# Loss of the untrained model. For reference, a uniform guess over the
# 65 characters of the vocabulary would give -ln(1/65), about 4.17
print(loss)  
# Start generation from a single token with id 0 (batch of 1, sequence length 1)
start = torch.zeros((1,1), dtype = torch.long)
# generate returns a (1, 101) tensor; take the only sequence and decode it to text
print(decode(model.generate(start, max_new_tokens = 100)[0] .tolist()))


tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [None]:
#Create a PyTorch optimizer to train the model
# AdamW is handed all parameters of the model and uses a learning rate of 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3) #Good learning rate for larger models can be 3e-4 etc. 

In [None]:
import matplotlib.pyplot as plt

# Loss of every training step, collected for the plot below
train_loss_curve = []

# get_batch reads the global batch_size, so batches have 32 sequences from here on
batch_size = 32
n_steps = 10000 #Train for 10,000 steps

for _ in range(n_steps):
    # Sample a fresh random batch of training data
    xb, yb = get_batch("train")
    # Forward pass: compute the loss on this batch
    logits, loss = model(xb, yb)
    # Clear the gradients of the previous step before computing new ones
    optimizer.zero_grad(set_to_none = True)
    # Backward pass: compute the gradients of the loss
    loss.backward()
    # Update the parameters from the gradients
    optimizer.step()
    train_loss_curve.append(loss.item())


# Plot the per-step training loss
plt.plot(train_loss_curve)
plt.xlabel("steps")
plt.ylabel("loss")
plt.title("Training loss of the Bigram Model")

# Loss of the last training batch only (a single batch, not an average)
print(loss.item())  

In [None]:
#Generate from the trained model
# Start again from a single token with id 0 and sample 500 new tokens
start = torch.zeros((1,1), dtype = torch.long)
print(decode(model.generate(start, max_new_tokens = 500)[0] .tolist()))

#Great improvement from the untrained model, but we can do better

### The Mathematical Trick in Self-Attention

We would like the tokens to interact with each other, but this communication should be one-way.
Information only flows from previous time-steps to the current one. No information flows from the future, because the future is what we want to predict.
One simple way to do this is to average the channel (C) vectors of the current token and all the tokens before it in the context.
This averaging or summation is extremely lossy, because we lose all information about the relative positions of the tokens.
 
We use the term Bag of words (BOW) when we are averaging a group of words. 

In [None]:
# Take a toy example
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
# Random values standing in for the token vectors
x = torch.randn(B, T, C)
print(x.shape)

In [None]:
# Version 1: explicit loops
# xbow[b, t] is the mean of x[b, 0], ..., x[b, t],
# i.e. the average of the current token and all the tokens before it

xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # (t+1, C)
        xbow[b, t] = xprev.mean(dim=0)

print(x[0], xbow[0], sep="\n")

In [None]:
# Toy illustration: multiplying by a row-normalized lower-triangular matrix
# averages the rows of b cumulatively (row i of c is the mean of rows 0..i of b)
torch.manual_seed(43)
a = torch.tril(torch.ones(3, 3))
a /= a.sum(dim=1, keepdim=True)  # every row now sums to 1
# randint yields integers; without the conversion to float the product with a raises a runtime error
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print(f"a : {a}")
print(f"b : {b}")
print(f"c : {c}")


In [None]:
# Version 2
# The loops of version 1 rewritten as a single matrix operation
w = torch.tril(torch.ones(T, T))
w /= w.sum(dim=1, keepdim=True)  # every row sums to 1: equal weights over positions 0..t
# w has no batch dimension; PyTorch broadcasts it over the batch dimension of x
xbow2 = w @ x  # (T, T) @ (B, T, C)  ---> (B, T, C)
torch.allclose(xbow, xbow2)  # Both are equal.
# This is a weighted sum according to the weight matrix w

In [None]:
# Version 3: the same averaging weights, obtained from a masked softmax
tril = torch.tril(torch.ones(T, T))
# Start from all-zero scores and set the future positions (where tril is 0) to -inf
w = torch.zeros((T, T)).masked_fill(tril == 0, float("-inf"))
# softmax turns -inf into weight 0 and the equal scores of a row into equal weights
w = F.softmax(w, dim=-1)
xbow3 = w @ x
torch.allclose(xbow3, xbow)

In [40]:
import time
# Measure the wall-clock time of a full run of the standalone script
t1 = time.time()
# "!" is the IPython/Jupyter shell escape: this line runs only in a notebook, not in plain Python
!python bigram.py
t2 = time.time()
# Elapsed seconds; the value is produced whether or not the script finished successfully
t2-t1

Iteration number: 0, train_loss: 4.4801, val_loss: 4.4801
Iteration number: 300, train_loss: 2.8827, val_loss: 2.9059
Traceback (most recent call last):
  File "/Users/raasim/Desktop/ramsha_work/BNL/python_envs/climate_model/mini_llm/bigram.py", line 180, in <module>
    print(decode(model.generate(context, max_new_tokens = 500)[0] .tolist()))
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/raasim/Desktop/ramsha_work/BNL/python_envs/climate_model/mini_llm/bigram.py", line 121, in generate
    logits, loss = self(idx)
                   ^^^^^^^^^
  File "/Users/raasim/Desktop/ramsha_work/BNL/python_envs/climate_model/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/raasim/Desktop/ramsha_work/BNL/python_envs/climate_model/mini_llm/bigram.py", line 95, in forward
    pos_emb = self.pos_embedding_table(torch.arange(T, device = devi

2.208585023880005

In [31]:
# The notebook displays only the value of the last expression of a cell,
# so the result of the CUDA check is computed but not shown here
torch.cuda.is_available()
torch.__version__

'2.0.1'

In [36]:
# torch.arange(n) gives the integers 0, 1, ..., n-1 as a 1-D tensor
torch.arange(10)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])