In [4]:
import pandas as pd
import os

# Read the whole training corpus into memory as one string.
# NOTE(review): this is an absolute, machine-specific path and the file is decoded with the
# platform default encoding - consider a configurable data directory and an explicit encoding.
input_path = '/Users/tarikrashada/Projects/myMiniGPT/data/input.txt'
with open(input_path) as f:
    text = f.read()

In [5]:
# Size of the corpus, in characters.
print(len(text))

600901


In [6]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# Show the vocabulary as a single string, followed by its size.
print(''.join(chars), vocab_size, sep='\n')


 !"'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz¤¦©«Ã†
85


In [7]:
# These are the possible characters our model can emit.
# We need a strategy to tokenize the text, i.e. to
# convert the raw text into a sequence of integers according
# to some vocabulary.

# The simplest tokenization procedure is character-level tokenization.
# In practice, LLMs use tokenizers that are optimized far more heavily:
# our character tokenizer only has 85 integers, whereas real
# tokenizers like tiktoken have vocabularies of tens of thousands of tokens.

In [8]:
# Lookup tables between each character and its integer id (its position in the sorted `chars` list).
char_to_int_map = dict(zip(chars, range(len(chars))))
int_to_char_map = dict(enumerate(chars))
def encode(s):
    """Map a string to the list of integer ids of its characters.

    Raises KeyError if a character is missing from char_to_int_map.
    """
    return [char_to_int_map[ch] for ch in s]
def decode(int_list):
    """Turn a sequence of integer ids back into the string they spell.

    Raises KeyError if an id is missing from int_to_char_map.
    """
    pieces = (int_to_char_map[token_id] for token_id in int_list)
    return ''.join(pieces)

In [9]:
import torch
# Encode the entire corpus once; `data` is the complete tokenized text,
# stored as a tensor of int64 (torch.long) character ids.
data = torch.tensor(encode(text),dtype=torch.long)

In [10]:
# Hold out the tail of the corpus for validation: the first 90% of the tokens
# are used for training and the remaining 10% for validation.
num = int(0.9 * len(data))
train_data, val_data = data[:num], data[num:]

In [11]:
# Sampling hyperparameters: the training data is cut into windows of
# `context_length` tokens, and `batch_size` such windows are processed together.
context_length = 32
batch_size = 16

# One window plus the token that follows it (context_length + 1 tokens in total).
train_data[:context_length + 1]

tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0, 42, 44, 39, 39, 38, 42, 32, 37,
        30,  1, 72, 60, 53, 72,  1, 43, 70, 73, 72, 60,  1, 61, 71])

In [12]:
# The chunk shown above holds context_length + 1 = 33 tokens. A single chunk like this packs
# context_length training examples: every prefix of it is a context, and the token right after
# that prefix is its target.

# So the transformer should be able to predict from any context of 1 up to context_length
# tokens, not only from full-length ones. E.g.

for t in range(context_length):
    x, y = data[:t + 1], data[t + 1]

    print(f"The context {x} is used to predict target: {y}")

The context tensor([39]) is used to predict target: 41
The context tensor([39, 41]) is used to predict target: 28
The context tensor([39, 41, 28]) is used to predict target: 29
The context tensor([39, 41, 28, 29]) is used to predict target: 24
The context tensor([39, 41, 28, 29, 24]) is used to predict target: 26
The context tensor([39, 41, 28, 29, 24, 26]) is used to predict target: 28
The context tensor([39, 41, 28, 29, 24, 26, 28]) is used to predict target: 0
The context tensor([39, 41, 28, 29, 24, 26, 28,  0]) is used to predict target: 0
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0]) is used to predict target: 0
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0]) is used to predict target: 42
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0, 42]) is used to predict target: 44
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0, 42, 44]) is used to predict target: 39
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0, 42, 44, 39]) is use

In [13]:
# so context length is the maximum context length for predictions while batch_size is the number of
# independent sequences processed in parallel

def generate_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    split: 'train' selects train_data; any other value selects val_data.
    Returns (x, y): batch_size windows of context_length tokens each, stacked
    along a new leading dimension. y is x shifted by one position, so each
    entry of y is the token that follows the matching entry of x.
    """
    source = val_data if split != 'train' else train_data
    # random window starts in [0, len(source) - context_length), so the shifted
    # target window never runs past the end of the split
    starts = torch.randint(len(source) - context_length, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + context_length])
        targets.append(source[start + 1:start + 1 + context_length])
    return torch.stack(inputs), torch.stack(targets)

# y gives us the correct answer for every position in x (it is the next token that we would be predicting)

In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The whole model is a single (vocab_size x vocab_size) embedding table. Looking up
    token i returns row i of the table, which is read as the logits for the token
    that follows token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding_lookup = nn.Embedding(vocab_size, vocab_size)

    def forward(self, input, targets=None):
        """Return (logits, loss) for a batch of token ids.

        input:   integer token ids of shape (B, T).
        targets: optional integer token ids with B * T elements (normally shape (B, T)).

        Without targets: logits has shape (B, T, vocab_size) and loss is None.
        With targets:    logits is flattened to (B * T, vocab_size) - one row of
                         next-token scores per input position - and loss is the
                         cross-entropy between those rows and the flattened targets.
        """
        # thin wrapper: each integer in the input selects the integer-th row of the table
        logits = self.embedding_lookup(input)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # F.cross_entropy wants (N, C) logits and (N,) class indices, so fold the
        # batch and time dimensions together
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous targets are accepted as well
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    def generate(self, input, max_new_tokens):
        """Extend each sequence in `input` by max_new_tokens sampled tokens.

        input: integer token ids of shape (B, T).
        Returns a tensor of shape (B, T + max_new_tokens): the original ids followed
        by the sampled continuation.
        """
        # sampling never needs gradients; without no_grad every step would build
        # an autograd graph that is thrown away
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # get predictions for every position
                logits, _ = self(input)
                # keep only the last time step: that is the prediction for the next token
                logits = logits[:, -1, :]  # (B, vocab_size)
                # softmax to turn scores into probabilities
                probs = F.softmax(logits, dim=-1)
                # draw one token per sequence from that distribution
                next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append the sampled token to the running sequence
                input = torch.cat((input, next_token), dim=1)  # (B, T + 1)
        return input
            

In [15]:
# Draw one random training batch: xb holds the input windows, yb the next-token targets.
xb, yb = generate_batch('train')

In [16]:
# Build the (still untrained) bigram model and run one forward pass with targets.
Bigram = BigramLanguageModel(vocab_size)
logits, loss = Bigram(xb, yb)
# With targets supplied the logits come back flattened: one row of vocab_size scores per
# input position. For reference, a uniform guess over 85 characters would score
# ln(85), roughly 4.44, on this cross-entropy loss.
print(logits.shape, loss, sep='\n')

torch.Size([512, 85])
tensor(4.8055, grad_fn=<NllLossBackward0>)


In [17]:
# Start generation from a single token with id 0 (a batch of one sequence), sample 100 more
# tokens and decode the result. Before training the text is gibberish, because the
# embedding weights still hold their random initial values.
output = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = Bigram.generate(output, max_new_tokens=100)[0].tolist()
decode(sampled_ids)

'\ny4Zh;"MM;1?42h16e2"d\nsoZedRkt?6iPsnUFQAHRÃ9U©H]V=n¤GA??(XlKH.†xIRV4OeVUG[2I=1Nh©28G 8qc2_2]JSTLY?DGs'

In [18]:
# AdamW over all of the model's parameters, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(Bigram.parameters(), lr=1e-3)

In [19]:
# Train the bigram model: every step samples a fresh random training batch,
# computes the cross-entropy loss and takes one optimizer step.
# (`steps` is kept as the loop variable on purpose: at notebook top level, assigning
# to `_` would clobber IPython's "last output" variable.)
for steps in range(10000):
    xb, yb = generate_batch('train')
    # forward pass; only the loss is needed for the update
    logits, loss = Bigram(xb, yb)
    # drop the gradients left over from the previous step before backpropagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [20]:
# Sample again from the same start token, now that the model has been trained.
decode(Bigram.generate(output,max_new_tokens=100)[0].tolist())

'\nat n hed\nith " lewid.\natabus (fbe th k tit icoregsthaphast. id ipaniell Gathe t\nheere iopeMIANe acev'

We want to find a way to include data from every token prior to the token we are currently processing. We want to calculate something like an average of all of the embeddings of previous tokens

We can model this with matrix multiplication: multiplying by a lower-triangular matrix of ones sums each row together with all of the rows before it, and dividing every row of that matrix by its row sum turns those sums into averages.

In [21]:
# e.g. a 3x3 averaging matrix applied to a random 3x2 matrix
a = torch.ones(3, 3).tril()
# divide each row by its sum so that the non-zero entries of every row add up to 1
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

In [24]:
# row i of a puts equal weight 1/(i+1) on columns 0..i and zero weight on the later ones
a

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [23]:
# b is the random 3x2 matrix whose rows are being averaged
b

tensor([[5., 1.],
        [2., 4.],
        [2., 5.]])

In [22]:
# the result is that each row of c is a running average of the rows of b:
# the first row of c is just the first row of b
# the second row of c averages the first two rows of b
# the third row of c averages all three rows of b
c

tensor([[5.0000, 1.0000],
        [3.5000, 2.5000],
        [3.0000, 3.3333]])

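This is analogous to self-attention - we weight the vectors associated with each of the previous tokens (and the current one) and sum them. Here every weight in a row is the same, so the weighted sum is a plain average.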

In [27]:
# The same averaging weights, built the way attention builds them: start from all-zero
# scores, overwrite everything above the diagonal with -inf, then softmax each row.
tril = torch.ones(5, 5).tril()
wei = torch.zeros(5, 5).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

In [28]:
wei
# The -inf entries above the diagonal become exactly 0 after the softmax, so row i spreads
# its weight only over positions 0..i: a token never draws on tokens that come after it.
# Every unmasked score started at 0, which is why the weights within a row are all equal here.

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])

There is a very obvious problem here that we don't want these scores to be equally weighted across all previous tokens. We want to choose weights in a data dependent way

Self-attention solves this as follows: the token at each position emits a query vector and a key vector. For a given token, we take the dot product of its query vector with the key vector of that token and of every token before it. If a dot product is large, we say the token has a strong affinity to that earlier token.

After performing this dot product we normalize to get what we call attention scores.

In [31]:
# Attention weights for a single self-attention head
head_dim = 16
embed_dim = 32
# x1 stands in for the combined token + positional embeddings of a batch of sequences,
# shape (batch_size, context_length, embed_dim); random values are used for this demo
x1 = torch.randn(batch_size, context_length, embed_dim)

# bias-free linear maps from the embedding space into the head's key and query spaces
key_Matrix = nn.Linear(embed_dim, head_dim, bias=False)
query_Matrix = nn.Linear(embed_dim, head_dim, bias=False)

key_vec1 = key_Matrix(x1)      # (batch_size, context_length, head_dim)
query_vec2 = query_Matrix(x1)  # (batch_size, context_length, head_dim)

# dot product of every position's query with every position's key
# -> (batch_size, context_length, context_length)
# NOTE(review): the scores are not divided by sqrt(head_dim) as in scaled dot-product
# attention - confirm whether that scaling is added in a later cell
wei = torch.matmul(query_vec2, key_vec1.transpose(-2, -1))

# causal mask: positions above the diagonal are set to -inf so the softmax gives them weight 0
tril = torch.ones(context_length, context_length).tril()
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

# wei still has shape (batch_size, context_length, context_length); each matrix in the
# batch has zeros above the main diagonal (it is lower triangular) because of the tril
# mask, so only the current and earlier token embeddings have influence

In [34]:
# attention weights: one lower-triangular (context_length x context_length) matrix per sequence in the batch
wei

tensor([[[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [9.4236e-01, 5.7643e-02, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [4.3725e-02, 1.5580e-01, 8.0048e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [6.0430e-02, 2.8321e-02, 3.9269e-02,  ..., 1.5843e-02,
          0.0000e+00, 0.0000e+00],
         [1.2855e-02, 2.6191e-03, 4.9429e-03,  ..., 2.8759e-02,
          1.0886e-02, 0.0000e+00],
         [3.1368e-02, 1.2411e-03, 1.2391e-02,  ..., 4.8287e-02,
          1.3204e-02, 1.6809e-02]],

        [[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [9.0805e-01, 9.1946e-02, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [8.9412e-03, 9.7391e-01, 1.7153e-02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [2.8447e-03, 7.3581e-02, 2.1818e-02,  ..., 8.9623e-04,
          0.000