In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from pathlib import Path
import re
import tiktoken

In [3]:
def read_data(filepath: Path):
    with open(filepath, "r", encoding="utf-8") as f:
        raw_text = f.read()
    return raw_text


In [4]:
data = read_data("../resources/verdict.txt")

In [5]:
for i in range(0,10,2):
    print(i)

0
2
4
6
8


In [6]:
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokeniser, max_length, stride):
        self.tokeniser = tokeniser
        self.input_ids = []
        self.target_ids=[]

        token_ids = tokeniser.encode(text,allowed_special={"<|endoftext|>"})

        for i in range(0,len(token_ids)-max_length, stride):
            input_chunk= token_ids[i:i+max_length]
            target_chunk= token_ids[i+1: i+max_length+1] 
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
        

    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx]



In [7]:
def create_dataloader_v1(text, batch_size=4,max_length=256, stride=128, shuffle=True,drop_last=True):
    tokeniser = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(text,tokeniser,max_length,stride)
    dataloader = DataLoader(dataset, batch_size=batch_size,shuffle=shuffle, drop_last=drop_last)
    return dataloader
    

In [8]:
dataloader = create_dataloader_v1(data, batch_size=2,max_length=4,stride=1, shuffle=False)

In [9]:
data_iter = iter(dataloader)
batch1 = next(data_iter)
print(batch1) #return input_token and target_token

[tensor([[  464,  4643, 11600,    25],
        [ 4643, 11600,    25,  1717]]), tensor([[ 4643, 11600,    25,  1717],
        [11600,    25,  1717,   342]])]


In [10]:
batch2 = next(data_iter)
print(batch2) #return input_token and target_token

[tensor([[11600,    25,  1717,   342],
        [   25,  1717,   342,   854]]), tensor([[   25,  1717,   342,   854],
        [ 1717,   342,   854, 41328]])]


Here, we will understand how the embedding layer is like a lookup.

In [11]:
vocab_size=5
output_dim=2
embedding_layer = nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight.shape)
print(embedding_layer.weight)

torch.Size([5, 2])
Parameter containing:
tensor([[ 0.9577,  0.8142],
        [-0.0816, -0.8564],
        [ 0.1519,  0.7748],
        [-0.8239, -0.7357],
        [ 0.6571, -0.3910]], requires_grad=True)


In [12]:
embedding_layer(torch.tensor(3))

tensor([-0.8239, -0.7357], grad_fn=<EmbeddingBackward0>)

In [13]:
torch.tensor(3)

tensor(3)

In [14]:
embedding_layer(torch.tensor([1,2,3]))

tensor([[-0.0816, -0.8564],
        [ 0.1519,  0.7748],
        [-0.8239, -0.7357]], grad_fn=<EmbeddingBackward0>)

### Creating Positional Encodings

In [15]:
output_dim = 256
vocab_size = 50257
embedding_layer  = nn.Embedding(vocab_size, output_dim)

In [16]:
max_length = 4
dataloader = create_dataloader_v1(data, batch_size=8,max_length=max_length,stride=max_length, shuffle=False)

In [17]:
data_iter = iter(dataloader)

In [18]:
inputs, targets = next(data_iter)

In [19]:
print(inputs)

tensor([[  464,  4643, 11600,    25],
        [ 1717,   342,   854, 41328],
        [   25, 40417,   198,  3109],
        [ 9213,   422, 11145,   271],
        [ 1668,   319,  3267,  2310],
        [   11, 48609,   198,   198],
        [   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271]])


In [20]:
print(targets)

tensor([[ 4643, 11600,    25,  1717],
        [  342,   854, 41328,    25],
        [40417,   198,  3109,  9213],
        [  422, 11145,   271,  1668],
        [  319,  3267,  2310,    11],
        [48609,   198,   198,    40],
        [  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899]])


In [21]:
token_embeddings  = embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [22]:
# context length here is the maximum lenght of a given sentence.
context_length = max_length
pos_embedding_layer = nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [23]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings)

tensor([[[ 2.2313e+00,  9.5014e-01,  8.3350e-01,  ...,  4.3666e-01,
           7.0738e-01, -2.8890e+00],
         [ 4.3013e-01,  4.7948e-01,  7.0637e-01,  ..., -1.4691e+00,
          -1.2311e+00,  5.4692e-01],
         [ 2.9066e-01, -3.2754e-01, -4.7325e-03,  ...,  1.5367e-01,
           9.1061e-01,  9.0097e-01],
         [ 1.1351e+00, -1.1351e+00, -2.6348e+00,  ...,  4.2776e-01,
           2.1555e+00, -9.9331e-01]],

        [[ 1.6255e+00, -8.7252e-01,  1.3496e+00,  ..., -9.8355e-01,
           1.3422e+00, -3.6554e-01],
         [ 2.2749e+00, -2.0938e+00,  1.4435e-01,  ...,  2.9035e-01,
           1.2776e+00, -2.9447e-01],
         [ 1.0584e+00, -1.2154e+00, -4.2583e-01,  ...,  2.4404e-01,
          -1.9316e+00,  1.1576e+00],
         [-1.7942e-01, -1.1768e+00, -2.8520e-01,  ...,  9.6883e-01,
           1.0250e+00, -1.7601e+00]],

        [[ 2.4362e+00, -2.7157e-01, -1.4153e+00,  ..., -1.1771e+00,
           2.3300e+00, -5.8323e-01],
         [-1.8907e+00, -4.8473e-01,  2.8876e+00,  .

In [24]:
inputs = torch.tensor( [[0.43, 0.15, 0.89], [0.55, 0.87, 0.66], 
                        [0.57, 0.85, 0.64], [0.22, 0.58, 0.33], 
                        [0.77, 0.25, 0.10], [0.05, 0.80, 0.55]])

In [25]:
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])

In [26]:
query = inputs[1]
print(query)

tensor([0.5500, 0.8700, 0.6600])


In [27]:
attention_scores_2 = torch.empty(inputs.shape[0])
print(attention_scores_2)

tensor([0., 0., 0., 0., 0., 0.])


In [28]:
for i, x_i in enumerate(inputs):
    attention_scores_2[i] = query @ x_i

print(attention_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [29]:
normalised_attention_scores = attention_scores_2/attention_scores_2.sum()
print(normalised_attention_scores)

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])


In [30]:
naive_softmax = lambda x: torch.exp(x)/torch.exp(x).sum()
normalised_attention_scores  = naive_softmax(attention_scores_2)
print(normalised_attention_scores)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [31]:
normalised_attention_scores  = torch.softmax(attention_scores_2,dim=0)
print(normalised_attention_scores)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [32]:
query = inputs[1]
context_vec_2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vec_2 += normalised_attention_scores[i] * x_i

print(context_vec_2)


tensor([0.4419, 0.6515, 0.5683])


In [33]:
unormalised_attention_score = inputs @ inputs.T
print(unormalised_attention_score)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [34]:
attention_weights = torch.softmax(unormalised_attention_score,dim=0)
print(attention_weights)

tensor([[0.2098, 0.1385, 0.1390, 0.1435, 0.1526, 0.1385],
        [0.2006, 0.2379, 0.2369, 0.2074, 0.1958, 0.2184],
        [0.1981, 0.2333, 0.2326, 0.2046, 0.1975, 0.2128],
        [0.1242, 0.1240, 0.1242, 0.1462, 0.1367, 0.1420],
        [0.1220, 0.1082, 0.1108, 0.1263, 0.1879, 0.0988],
        [0.1452, 0.1581, 0.1565, 0.1720, 0.1295, 0.1896]])


In [35]:
attention_weights[:,0].sum()

tensor(1.0000)

In [36]:
context_vector = attention_weights @ inputs
print(context_vector)

tensor([[0.4017, 0.5023, 0.5059],
        [0.5595, 0.7824, 0.6953],
        [0.5538, 0.7686, 0.6834],
        [0.3369, 0.4647, 0.4119],
        [0.3525, 0.4059, 0.3657],
        [0.3856, 0.5761, 0.5077]])


Self attention with trainable weights

In [40]:
x_2 = inputs[1]
d_in = inputs.shape[1]
d_out = 2 # generally the sizes for input and output are the same but the author suggests different for better understanding.

In [42]:
w_q  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)
w_k  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)
w_v  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)


In [43]:
q_2 = x_2 @ w_q
k_2 = x_2 @ w_k
v_2 = x_2 @ w_v

In [46]:
print(q_2)

tensor([1.3436, 1.0967])


In [47]:
key = inputs @ w_k 
values  = inputs @ w_v

In [54]:
print("keys: ", key)
print("#"*50)
print("values: ", values)

keys:  tensor([[0.4639, 0.4834],
        [0.6597, 0.7193],
        [0.6759, 0.7253],
        [0.2944, 0.3578],
        [0.7777, 0.6296],
        [0.1645, 0.3273]])
##################################################
values:  tensor([[0.6139, 1.0462],
        [0.8336, 0.9588],
        [0.8270, 0.9441],
        [0.4448, 0.4776],
        [0.4759, 0.4149],
        [0.5395, 0.6572]])


In [59]:
key_2 = key[1]
attention_score_22 = q_2.dot(key_2)# key_2.dot(q_2)

In [60]:
print(key_2.shape)
print(q_2.shape)
print(attention_score_22.shape)

torch.Size([2])
torch.Size([2])
torch.Size([])


In [61]:
attention_score_22

tensor(1.6753)

In [81]:
attention_score_2 = q_2 @ key.T # the second element here matches the attention_score_22

Calculating attention scores for all , but I have hurried to do this, the book first suggests we stick with q_2 and first scale and then will come back to the doing all at once.

In [82]:
#query = inputs @ w_q
#print(query.shape)
#print(key.shape)

#attention_scores = query @ key.T
#print(attention_scores.shape)

Lets calculate normalise to get from attention_scores to attention_weights for q_2 first.

In [83]:
d_k = key.shape[-1]
print(d_k)

2


In [84]:
attention_weights_2 = torch.softmax(attention_score_2/d_k**0.5, dim = -1)

In [85]:
attention_weights_2

tensor([0.1456, 0.2105, 0.2148, 0.1124, 0.2197, 0.0970])

In [86]:
attention_scores_2 @ values

tensor([4.3500, 5.2349])

Create a class for extracting the context vector

In [None]:
class SelfAttentionV1(nn.Module):
    