<a href="https://colab.research.google.com/github/Nandika-A/LLM-from-scratch/blob/main/LLM_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparing the data

In [4]:
import os
import urllib.request

In [5]:
if not os.path.exists("the-verict.txt"):
  url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt")
  urllib.request.urlretrieve(url, "the-verdict.txt")


In [6]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

In [7]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [8]:
len(raw_text)

20479

In [9]:
import re

## Break down text into tokens

In [10]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

In [11]:
preprocessed[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

## Convert tokens into token Ids

In [12]:
all_words = sorted(set(preprocessed))

In [13]:
vocab_size = len(all_words)

In [14]:
print(vocab_size)

1130


In [15]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [16]:
len(vocab)

1130

Use this vocabulary to convert each word into an integer

Now encode the text into token Ids

In [17]:
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i:s for s,i in vocab.items()}

  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    ids = [self.str_to_int[i] for i in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text

This tokenizer can only work for the words in the vocabulary. Otherwise it gives error.

In [18]:
tokenizer = SimpleTokenizerV1(vocab)

In [19]:
new_text = "This is good!"
ids = tokenizer.encode(new_text)
print(ids)

[97, 584, 500, 0]


In [20]:
tokenizer.decode(ids)

'This is good!'

## Special context tokens
Like end-of-text token

We also want if the word isn't in the vocabulary, the tokenizer shouldn't fail but extend those tokens in the vocabulary.

In [21]:
all_words.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(all_words)}

In [22]:
len(vocab)

1132

In [23]:
class SimpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i:s for s,i in vocab.items()}

  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]
    ids = [self.str_to_int[i] for i in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text

In [24]:
tokenizer = SimpleTokenizerV2(vocab)

In [25]:
text = "Hello World, This is good!"

In [26]:
ids = tokenizer.encode(text)

In [27]:
tokenizer.decode(ids)

'<|unk|> <|unk|>, This is good!'

## Byte pair encoding
Algo for handling unknown tokens
It breaks down longer words into known subwords instead of substituting unknown token. So, one word may become many tokens, but it never fails

In [28]:
import tiktoken

In [29]:
tiktoken.__version__

'0.9.0'

Tiktoken library has tokenizers of many models. Here we are using Gpt2. The actual library is in rust with a python API.

In [30]:
tokenizer = tiktoken.get_encoding("gpt2")

In [31]:
tokenizer.n_vocab

50257

In [32]:
ids = tokenizer.encode("hello world!")

In [33]:
tokenizer.decode(ids)

'hello world!'

<|endoftext|> token is added to denote the end of the document.

In [34]:
ids = tokenizer.encode("hello world! <|endoftext|> hi", allowed_special={"<|endoftext|>"})

## Data Sampling with sliding window

LLMs predict one word at a time. This helps them to scale and train efficiently, as label gets labelled itself.

In [35]:
context_size = 4

no. of tokens in the window, which is passed to the LLM as LLM can't receive all the tokens at once for training, generally in thousands

In [36]:
ids = tokenizer.encode(raw_text)

In [37]:
ids[:30]

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285]

In [38]:
x = ids[:context_size]
y = ids[1:context_size+1]
print(x)
print(y)

[40, 367, 2885, 1464]
[367, 2885, 1464, 1807]


Three tokens overlap, as that is the prediction result. If input is 40, LLM should predict 367 and so on.

## Create the dataset

In [39]:
import torch

In [40]:
from torch.utils.data import Dataset, DataLoader

In [41]:
class GPTDatasetV1(Dataset):
  def __init__(self, txt, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk)) # not optimal way if number of tokens is very large, can't use in memory storage

  def __len__(self):
        return len(self.input_ids)

  def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

We drop the last batch if the size if less than batch size. This creates the batches of same size, which help in stable training

In [42]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

Max length is the context length

In [43]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

In [44]:
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


In [45]:
inputs, targets = next(data_iter)
print(inputs)
print(targets)

tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]])
tensor([[  262,  6001,   286,   465],
        [13476,    11,   339,   550],
        [ 5710,   465, 12036,    11],
        [ 6405,   257,  5527, 27075],
        [   11,   290,  4920,  2241],
        [  287,   257,  4489,    64],
        [  319,   262, 34686, 41976],
        [   13,   357, 10915,   314]])


## Create token embeddings
token_ids --> token_embeddings

In [46]:
input = torch.tensor([[ 3, 1,  4,  5]])

In [47]:
input2 = torch.tensor([[ 300, 1,  4,  5]])

In [48]:
input

tensor([[3, 1, 4, 5]])

In [49]:
vocab_size = 6
output_dim = 3
torch.manual_seed(42) # nn layer with random weights, so putting seeds to get same weights everytime
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [50]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)


Weights of Embedding layer, which are optimised later.

In [51]:
embedding_layer(torch.tensor([1]))

tensor([[ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)

In [52]:
#embedding_layer(input2)

The error IndexError: index out of range in self in cell CnI8XtVpJEdy means that the input input2 contains values that are larger than the vocab_size of your embedding layer.

Your embedding layer was initialized with vocab_size = 6, which means it can only accept input indices ranging from 0 to 5. The input input2 must contain at least one value outside of this range.

To fix this, you need to ensure that all values in input2 are between 0 and 5 (inclusive). If you intended to use the input tensor defined earlier, you can change input2 to input.

In [53]:
embedding_layer(input)

tensor([[[-0.6866,  0.6105,  1.3347],
         [ 0.4396, -0.7581,  1.0783],
         [-0.2316,  0.0418, -0.2516],
         [ 0.8599, -0.3097, -0.3957]]], grad_fn=<EmbeddingBackward0>)

### Encoding word positions

In [54]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [55]:
max_length = 16
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=max_length, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [56]:
inputs

tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,
           257,  7026, 15632,   438,  2016,   257]])

In [57]:
token_embeddings = token_embedding_layer(inputs)

In [58]:
token_embeddings.shape

torch.Size([1, 16, 256])

For positional embeddings, Gpt2 uses same layer again.

Add each of token embeddings to positional embeddings, and it becomes input embeddings

In [59]:
context_length = max_length
pos_embeddings_layer = torch.nn.Embedding(context_length, output_dim)

In [60]:
size_of_each_batch = torch.arange(max_length)

In [61]:
pos_embeddings_layer.weight

Parameter containing:
tensor([[ 1.2192, -0.2741,  0.6823,  ..., -2.0313, -0.3160, -0.2499],
        [ 0.1600, -2.1962,  0.4126,  ..., -1.1532,  0.4579,  1.3812],
        [-0.1451,  0.5679,  0.1859,  ...,  0.2771, -1.2594,  1.3905],
        ...,
        [-1.7559,  0.0438,  1.1475,  ...,  0.4167,  1.1104,  0.4144],
        [-1.9071, -0.9007,  1.7004,  ..., -1.8818,  0.1716,  0.0144],
        [-0.1248, -1.0667,  0.4120,  ...,  0.9362,  0.9994,  0.1742]],
       requires_grad=True)

In [62]:
pos_embeddings = pos_embeddings_layer(size_of_each_batch)

In [63]:
pos_embeddings.shape

torch.Size([16, 256])

In [64]:
input_embeddings = token_embeddings + pos_embeddings

In [65]:
input_embeddings.shape

torch.Size([1, 16, 256])

# Attention

Parsing input with self attention, to get the context of the whole sentence while generating each word, as while generating a word we can just look at the previously generated token in general. It assigns attention scores to each word in the sentence for reference.

In [66]:
# input_embeddings = [torch.tensor(
#   [[0.43, 0.15, 0.89], # Your     (x^1)
#    [0.55, 0.87, 0.66], # journey  (x^2)
#    [0.57, 0.85, 0.64], # starts   (x^3)
#    [0.22, 0.58, 0.33], # with     (x^4)
#    [0.77, 0.25, 0.10], # one      (x^5)
#    [0.05, 0.80, 0.55]] # step     (x^6)
# )]

# output_dim = 3
# max_length = 6

In [67]:
input_embeddings[0]

tensor([[ 0.9929,  1.2730,  2.4816,  ..., -3.1044, -1.0006, -1.4009],
        [ 0.2024, -1.5256,  3.4631,  ...,  0.5621,  0.5827,  1.8916],
        [-0.2273,  0.8504,  0.1607,  ..., -1.0865, -0.4518,  0.8350],
        ...,
        [-2.0435,  0.4572,  1.5234,  ...,  0.7286,  2.2091,  1.1354],
        [-0.9065, -1.3995,  0.7922,  ..., -0.7733, -0.1588,  1.0131],
        [ 1.1009, -0.8636,  1.7284,  ...,  1.2134,  0.4801,  0.0901]],
       grad_fn=<SelectBackward0>)

In [68]:
input_embeddings[0].shape[0]

16

In [69]:
query = input_embeddings[0][1] # let's query the second word in the sentence, in first batch.

In [70]:
query

tensor([ 0.2024, -1.5256,  3.4631, -0.3572,  0.7412,  0.4286,  0.0370, -0.0280,
        -0.2736,  2.2205,  0.2307, -0.3646, -0.7828,  1.1415,  1.2583,  1.4813,
         0.0852,  1.4233, -3.9333,  0.9847,  0.5941,  0.5589,  1.8527,  1.1428,
         0.9194,  1.5592, -0.5299, -0.3945,  1.6160, -2.0992,  1.1277, -0.2103,
        -1.6615,  0.9536,  0.3687,  2.7760, -1.6139, -1.9557, -0.3317,  2.7021,
        -0.1604, -1.5577,  0.4850,  2.2246, -0.0147, -1.3917,  1.7022,  0.5634,
         0.3720, -0.1192,  1.5620,  0.4310, -0.1631,  1.1995,  0.2864, -0.7074,
        -2.3949,  2.2157, -0.0747, -1.4528,  1.2118, -0.3935, -3.1965,  1.2907,
        -1.2329, -1.1810,  0.0480,  0.7396, -0.6971,  0.2765, -0.9201,  2.2997,
        -0.2713,  0.9154,  3.1596,  0.7278, -0.8000,  0.7926, -0.2249,  0.5910,
         1.6051,  0.8777, -1.3312, -1.7427, -1.5775, -0.6021,  2.4569,  2.0505,
         2.7599,  0.8910,  0.1726, -1.1436, -0.5788,  2.5402, -0.1757, -1.3925,
         2.6024, -1.4237, -1.9743,  1.06

In [71]:
attention_scores_2 = torch.empty(max_length)

for i, x_i in enumerate(input_embeddings[0]):
  attention_scores_2[i] = torch.dot(x_i, query)

print(attention_scores_2)

tensor([-2.1417e+01,  5.2857e+02,  4.9593e+01,  8.3008e+00, -5.7123e+01,
        -1.4962e+01,  4.1251e+00,  1.4813e+01,  5.0935e-01, -2.5302e+01,
        -4.3514e+01, -3.6953e+01,  6.0200e+01, -2.4985e+00,  4.8889e+01,
        -1.2598e+01], grad_fn=<CopySlices>)


Attention scores need to be normalized for getting attention weights

In [72]:
attention_weights_2 = torch.softmax(attention_scores_2, dim=0)
print(attention_weights_2)

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SoftmaxBackward0>)


In [73]:
context_vec_2 = torch.zeros(query.shape)

for i, x_i in enumerate(input_embeddings[0]):
  context_vec_2 += x_i * attention_weights_2[i]

print(context_vec_2)

tensor([ 0.2024, -1.5256,  3.4631, -0.3572,  0.7412,  0.4286,  0.0370, -0.0280,
        -0.2736,  2.2205,  0.2307, -0.3646, -0.7828,  1.1415,  1.2583,  1.4813,
         0.0852,  1.4233, -3.9333,  0.9847,  0.5941,  0.5589,  1.8527,  1.1428,
         0.9194,  1.5592, -0.5299, -0.3945,  1.6160, -2.0992,  1.1277, -0.2103,
        -1.6615,  0.9536,  0.3687,  2.7760, -1.6139, -1.9557, -0.3317,  2.7021,
        -0.1604, -1.5577,  0.4850,  2.2246, -0.0147, -1.3917,  1.7022,  0.5634,
         0.3720, -0.1192,  1.5620,  0.4310, -0.1631,  1.1995,  0.2864, -0.7074,
        -2.3949,  2.2157, -0.0747, -1.4528,  1.2118, -0.3935, -3.1965,  1.2907,
        -1.2329, -1.1810,  0.0480,  0.7396, -0.6971,  0.2765, -0.9201,  2.2997,
        -0.2713,  0.9154,  3.1596,  0.7278, -0.8000,  0.7926, -0.2249,  0.5910,
         1.6051,  0.8777, -1.3312, -1.7427, -1.5775, -0.6021,  2.4569,  2.0505,
         2.7599,  0.8910,  0.1726, -1.1436, -0.5788,  2.5402, -0.1757, -1.3925,
         2.6024, -1.4237, -1.9743,  1.06

## Simple self attention for all inputs without trainable weights

This is for 1st batch only

In [74]:
attention_scores = torch.empty(max_length, max_length)
print(input_embeddings[0].shape)

for i, x_i in enumerate(input_embeddings[0]):
  for j, x_j in enumerate(input_embeddings[0]):
    attention_scores[i, j] = x_i.dot(x_j)

print(attention_scores)
print(attention_scores.shape)

torch.Size([16, 256])
tensor([[ 4.7055e+02, -2.1417e+01,  7.9083e+00,  1.3565e+00,  1.6035e+00,
         -1.4744e+01,  5.5573e+01,  1.1527e+01, -7.0634e+01, -1.9790e+01,
         -1.5545e-01,  1.2248e+01,  3.1930e+01,  3.6030e+01,  1.4739e+01,
          1.1818e+01],
        [-2.1417e+01,  5.2857e+02,  4.9593e+01,  8.3008e+00, -5.7123e+01,
         -1.4962e+01,  4.1251e+00,  1.4813e+01,  5.0935e-01, -2.5302e+01,
         -4.3514e+01, -3.6953e+01,  6.0200e+01, -2.4985e+00,  4.8889e+01,
         -1.2598e+01],
        [ 7.9083e+00,  4.9593e+01,  5.5046e+02, -3.8664e+01, -2.4345e+00,
         -2.9529e+01,  9.1606e+00,  3.2558e+01, -6.2929e+01,  6.8157e+01,
          1.2723e+00, -3.2723e+01, -2.7900e+01,  3.8270e+01, -3.8404e+01,
         -3.2785e+01],
        [ 1.3565e+00,  8.3008e+00, -3.8664e+01,  4.4035e+02, -1.0155e+01,
          5.1974e+01,  2.9931e+01, -2.4154e-01,  9.8480e+00,  2.8886e+01,
          3.6532e+01, -5.5056e+01, -3.8941e+00, -1.2686e+01, -5.1219e+01,
         -7.3850e-01]

In [75]:
attention_weights = torch.softmax(attention_scores, dim=0)

In [76]:
context_vec = torch.empty(max_length, output_dim)

for j in range(max_length):
  for i, x_i in enumerate(input_embeddings[0]):
      context_vec[j] += x_i * attention_weights[j][i]

print(context_vec.shape)

torch.Size([16, 256])


In [77]:
context_vec[1]

tensor([ 2.0243e-01, -1.5256e+00,  3.5115e+00, -3.5721e-01,  7.4116e-01,
         4.2861e-01,  8.5520e-02, -2.7969e-02, -2.7362e-01,  2.2205e+00,
         2.3067e-01, -3.6459e-01, -7.8278e-01,  1.1415e+00,  1.2583e+00,
         1.4813e+00,  8.5225e-02,  1.4233e+00, -3.9333e+00,  9.8467e-01,
         5.9412e-01,  5.5894e-01,  3.3153e+07,  1.1428e+00,  9.1944e-01,
         5.3779e+22, -5.2993e-01, -3.9450e-01,  1.6160e+00, -2.0992e+00,
         1.1277e+00, -2.1026e-01, -1.6615e+00,  9.5357e-01,  4.1717e-01,
         2.7760e+00, -1.6139e+00, -1.9557e+00, -3.3174e-01,  2.7021e+00,
        -1.6040e-01, -1.5577e+00,  5.3351e-01,  2.2246e+00, -1.4680e-02,
        -1.3917e+00,  1.7506e+00,  5.6339e-01,  3.7203e-01, -1.1917e-01,
         1.6105e+00,  4.3099e-01, -1.6315e-01,  1.1995e+00,  3.3488e-01,
        -7.0738e-01, -2.3949e+00,  2.2157e+00, -2.6261e-02, -1.4528e+00,
         1.2118e+00, -3.9348e-01, -3.1480e+00,  1.2907e+00, -1.2329e+00,
        -1.1810e+00,  9.6509e-02,  7.3961e-01, -6.9

## Implementing self-attention with trainable weights

In [78]:
input_embeddings

tensor([[[ 0.9929,  1.2730,  2.4816,  ..., -3.1044, -1.0006, -1.4009],
         [ 0.2024, -1.5256,  3.4631,  ...,  0.5621,  0.5827,  1.8916],
         [-0.2273,  0.8504,  0.1607,  ..., -1.0865, -0.4518,  0.8350],
         ...,
         [-2.0435,  0.4572,  1.5234,  ...,  0.7286,  2.2091,  1.1354],
         [-0.9065, -1.3995,  0.7922,  ..., -0.7733, -0.1588,  1.0131],
         [ 1.1009, -0.8636,  1.7284,  ...,  1.2134,  0.4801,  0.0901]]],
       grad_fn=<AddBackward0>)

In [79]:
x_2 = input_embeddings[0, 1]

In [80]:
d_in = x_2.shape[0]
d_out = 128

The contextual embeddings don't need to be the same size as input embeddings.

In [81]:
d_in

256

In [82]:
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out))
W_key = torch.nn.Parameter(torch.rand(d_in, d_out))
W_value = torch.nn.Parameter(torch.rand(d_in, d_out))

Initialised random weights without training

In [83]:
query_2 = x_2 @ W_query # Matrix multiplication - dot product
query_2

tensor([14.8723,  4.7600,  7.4069, 24.2712, 11.4760, 24.9924,  4.1695, 10.9818,
        22.8250, 12.4300, 23.7424, 18.3692, 16.0938, 21.8368, 24.0885, 17.0860,
         7.8698, 13.6812, 13.4008,  8.4839, 21.4972,  3.1853, 20.5067, 19.8391,
        16.4256, 14.9363, 16.4993,  8.3866, 21.9500, 26.2890,  5.9613, 12.0275,
        24.1700,  3.7095, 16.6415, 25.0162, 14.8485, 17.7167, 10.9422,  7.1635,
        10.6035, 13.3095, 23.1069,  6.2066,  8.5269, 22.7793, 19.8128, 17.5273,
        20.7159, 27.9357, 16.2536, 20.6742,  9.5860, 17.4723, 20.6834, 19.5044,
        18.1552, 25.9855, 20.2276, 18.0374,  4.5914, 15.1547, 18.5563, 27.7711,
        15.4671, 19.6817, 33.9798, 24.7180,  6.9878, 23.1641, 17.0375,  9.6064,
        21.9311, 21.9161, 22.5524, 24.0282, 15.7451, 18.0198, 11.6525, 28.4891,
        16.5139, 16.7774,  2.2332, 28.4280, 22.3735, 19.9648, 18.3463, 28.5180,
         1.8464,  9.1346, 15.2060, 19.1910, 14.3971, 16.0631, 22.2276,  6.7943,
        14.6872, 23.5616, 20.7796, 13.47

Same Query is used for multiplying with all input embeddings for one embedding, but key and value are different for each embedding. We get softmax(Q.K).V

In [84]:
keys  = input_embeddings[0] @ W_key
values = input_embeddings[0] @ W_value

In [85]:
keys_2 = keys[1]
attention_score_22 = torch.dot(query_2, keys_2)
attention_score_22

tensor(35648.0859, grad_fn=<DotBackward0>)

In [86]:
attention_score_2 = query_2 @ keys.T
attention_score_2

tensor([ 31912.8906,  35648.0859, -19988.6094,  -4809.1665,   8196.8770,
         44183.2266,  11291.8818, -55072.3555, -42595.7656,  -2865.8901,
         -9203.5732,  26265.0605,  -6486.0352,  38350.2969, -26805.5977,
          9062.4316], grad_fn=<SqueezeBackward4>)

In [87]:
dim_k = keys.shape[1]
attention_weights_2 = torch.softmax(attention_score_2/(dim_k**0.5), dim = 0)

In [88]:
attention_weights_2

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SoftmaxBackward0>)

In [89]:
attention_weights_2.sum()

tensor(1., grad_fn=<SumBackward0>)

In [90]:
context_vec_2 = attention_weights_2 @ values
context_vec_2

tensor([26.6820, 25.4708, 16.0051, 21.4798, 16.1806, 14.2843, 21.5384, 16.9555,
        21.8234, 15.9880, 13.8837, 30.7312, 14.6857, 18.7635, 20.1577, 29.1965,
        23.3223, 27.1102, 23.6386, 16.5848, 32.6152, 19.3407, 25.7516, 18.9710,
        13.1958, 24.8358, 22.3091, 20.8299, 25.5422, 14.6519, 27.2483, 26.4246,
        20.8403, 15.8739, 19.5029, 20.9241, 23.7715, 17.8590, 21.5936, 23.3931,
        15.0140, 18.6508, 23.0842, 12.1612, 19.5109, 14.9594, 17.8556, 14.9386,
        27.3427, 16.2646, 25.7193, 24.2027, 18.1866, 26.1411, 23.8958, 19.6239,
        22.4661, 10.6859, 15.2480, 16.1396, 18.7845, 12.7977, 31.6580, 18.1729,
        23.2778, 21.4419, 16.1039, 19.5272, 10.0385, 23.8588, 22.0585, 18.7179,
        18.3748, 15.7094, 15.5886, 22.3966, 13.0420, 12.8366, 17.7804, 23.2239,
        15.2006, 26.7365, 19.4880, 17.2401, 31.3349, 13.6985, 21.6969, 26.7954,
        11.8471, 24.2980, 16.9910, 15.0971, 17.6871, 25.8924, 17.6882, 20.6526,
        15.8734, 22.7710, 22.8204, 25.34

In [91]:
m = torch.nn.Linear(256, 128)
m.weight.shape

torch.Size([128, 256])

## Self attention class

In [92]:
class SelfAttention_v1(torch.nn.Module):
  def __init__(self, d_out, d_in, qkv_bias = False):
    super().__init__()
    self.W_query = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.W_key = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.W_value = torch.nn.Linear(d_out, d_in, qkv_bias).weight

  def forward(self, x):
    queries = x @ self.W_query
    keys = x @ self.W_key
    values = x @ self.W_value
    attention_scores = queries @ keys.T
    dim_k = keys.shape[1]
    attention_weights = torch.softmax(attention_scores/(dim_k**0.5), dim = 0)
    context_vec = attention_weights @ values
    return context_vec

In [93]:
torch.manual_seed(123)

<torch._C.Generator at 0x7bb364f4ca50>

In [94]:
d_in

256

In [95]:
d_out

128

In [96]:
sa_v1 = SelfAttention_v1(d_out, d_in)
sa_v1.forward(input_embeddings[0])

tensor([[ 0.0361,  0.0573,  0.0521,  ..., -0.3664, -0.1049,  0.1058],
        [ 0.8511, -0.3292, -0.4016,  ..., -0.1518, -0.5548, -0.1678],
        [-0.1230, -0.4307, -0.7885,  ..., -0.0098, -0.4219, -0.1241],
        ...,
        [-0.2548,  0.1013, -0.5606,  ...,  0.1385, -0.4764, -0.1956],
        [ 0.0145,  0.0543, -0.3231,  ..., -0.2782, -0.1437, -0.0169],
        [ 0.0403,  0.0519, -0.1320,  ...,  0.0261, -0.1296,  0.0226]],
       grad_fn=<MmBackward0>)

## Applying causal attention mask

For hidding future words for LLM, as we are providing the whole text for training. We will create a triangular mask which hides the future words.

In [97]:
class CausalAttention_v1(torch.nn.Module):
  def __init__(self, d_out, d_in, max_length, qkv_bias = False):
    super().__init__()
    self.W_query = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.W_key = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.W_value = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.mask_simple = torch.tril(torch.ones(max_length, max_length))

  def forward(self, x):
    queries = x @ self.W_query
    keys = x @ self.W_key
    values = x @ self.W_value
    attention_scores = queries @ keys.T
    dim_k = keys.shape[1]
    attention_weights = torch.softmax(attention_scores/(dim_k**0.5), dim = 0)
    masked_weights = attention_weights * self.mask_simple # Normalise
    masked_simple = masked_weights/masked_weights.sum()
    context_vec = masked_simple @ values
    return context_vec

In [98]:
class CausalAttention_v2(torch.nn.Module):
  def __init__(self, d_out, d_in, max_length, qkv_bias = False):
    super().__init__()
    self.W_query = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.W_key = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.W_value = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.mask_simple = torch.nn.triu(torch.ones(max_length, max_length))

  def forward(self, x):
    queries = x @ self.W_query
    keys = x @ self.W_key
    values = x @ self.W_value
    attention_scores = queries @ keys.T
    masked = attention_scores.masked_fill(self.masked_simple.bool(), -torch.inf)
    dim_k = keys.shape[1]
    attention_weights = torch.softmax(masked/(dim_k**0.5), dim = 0)
    context_vec = attention_weights @ values
    return context_vec

## Masking additional attention weights with dropout

This is to avoid overfitting, so that some positions in attention weights are dropped and model doesn't rely only on certain positions.

In [102]:
torch.manual_seed(123)
layer = torch.nn.Dropout(0.3)
layer

Dropout(p=0.3, inplace=False)

In [103]:
layer(torch.ones(6,6))

tensor([[1.4286, 1.4286, 1.4286, 1.4286, 1.4286, 1.4286],
        [1.4286, 1.4286, 1.4286, 0.0000, 1.4286, 1.4286],
        [0.0000, 1.4286, 1.4286, 1.4286, 1.4286, 0.0000],
        [1.4286, 1.4286, 0.0000, 0.0000, 0.0000, 1.4286],
        [1.4286, 0.0000, 0.0000, 1.4286, 0.0000, 1.4286],
        [1.4286, 1.4286, 0.0000, 0.0000, 0.0000, 0.0000]])

It rescales the values to maintain the same sum in each row.

## Compact self attention causal class

In [113]:
class CausalAttention(torch.nn.Module):
  def __init__(self, d_out, d_in, max_length, dropout, qkv_bias = False):
    super().__init__()
    self.W_query = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.W_key = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.W_value = torch.nn.Linear(d_out, d_in, qkv_bias).weight
    self.register_buffer("masked_simple", torch.triu(torch.ones(max_length, max_length), diagonal=1))
    self.dropout = torch.nn.Dropout(dropout)

  def forward(self, x):
    queries = x @ self.W_query
    keys = x @ self.W_key
    values = x @ self.W_value
    attention_scores = queries @ keys.T
    masked = attention_scores.masked_fill_(self.masked_simple.bool()[ : max_length, : max_length], -torch.inf) # _ ops are in place
    dim_k = keys.shape[1]
    attention_weights = torch.softmax(masked/(dim_k**0.5), dim = 0)
    attention_weights = self.dropout(attention_weights)
    context_vec = attention_weights @ values
    return context_vec

In [117]:
input_embeddings.shape

torch.Size([1, 16, 256])

In [118]:
ca = CausalAttention(d_out, d_in, max_length, 0.3)

In [119]:
ca.forward(input_embeddings[0]).shape

torch.Size([16, 128])

## Multi-head attention

No. of heads is a hyper-parameter

Concatenate the results of n causal attentions

In [124]:
class MultiHeadAttentionWrapper(torch.nn.Module):
  def __init__(self, d_out, d_in, max_length, dropout, num_heads=2, qkv_bias = False):
    super().__init__()
    self.heads = torch.nn.ModuleList([
        CausalAttention(d_out, d_in, max_length, dropout, qkv_bias) for _ in range(num_heads)
        ])

  def forward(self, x):
    return torch.cat([head(x) for head in self.heads], dim = 1)


Sequential computing on concat, not efficient, as attention heads operate independently

In [125]:
mha = MultiHeadAttentionWrapper(d_out, d_in, max_length, 0.3)
mha.forward(input_embeddings[0])

tensor([[ 0.0374, -0.1001, -0.1076,  ..., -0.0058, -0.0048, -0.0075],
        [ 0.0000,  0.0000,  0.0000,  ..., -0.0564, -0.0563, -0.0627],
        [ 0.0912, -0.1551, -0.1309,  ..., -0.0309, -0.1603,  0.1002],
        ...,
        [ 0.2548,  0.0304,  0.3332,  ...,  0.4859,  0.4020,  0.1746],
        [-0.1088, -1.3427, -0.1127,  ...,  1.1834, -0.3244,  0.0976],
        [ 1.6673, -0.5648,  0.6274,  ...,  4.2277,  0.0890,  1.9266]],
       grad_fn=<CatBackward0>)

Parallel Execution

In [127]:
import torch.nn as nn

In [129]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        # As in `CausalAttention`, for inputs where `num_tokens` exceeds `context_length`,
        # this will result in errors in the mask creation further below.
        # In practice, this is not a problem since the LLM (chapters 4-7) ensures that inputs
        # do not exceed `context_length` before reaching this forwar

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

torch.manual_seed(123)

batch_size, context_length, d_in = input_embeddings.shape
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)

context_vecs = mha(input_embeddings)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[ 0.4299,  0.0599,  0.5649,  ..., -0.0029, -0.0660,  0.2581],
         [ 0.5235, -0.1435,  0.5172,  ...,  0.1972, -0.0133,  0.0588],
         [ 0.4296, -0.1197,  0.4824,  ...,  0.3419, -0.1123, -0.0221],
         ...,
         [ 0.2463,  0.2876,  0.2918,  ...,  0.0331,  0.0526, -0.2645],
         [ 0.0261,  0.2153,  0.1593,  ...,  0.1678, -0.2239,  0.0175],
         [ 0.0787,  0.0504,  0.1422,  ...,  0.1167, -0.1775,  0.0756]]],
       grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([1, 16, 128])


# GPT