In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from pathlib import Path
import re
import tiktoken

In [2]:
def read_data(filepath: Path):
    with open(filepath, "r", encoding="utf-8") as f:
        raw_text = f.read()
    return raw_text


In [3]:
data = read_data("../resources/verdict.txt")

In [4]:
for i in range(0,10,2):
    print(i)

0
2
4
6
8


In [5]:
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokeniser, max_length, stride):
        self.tokeniser = tokeniser
        self.input_ids = []
        self.target_ids=[]

        token_ids = tokeniser.encode(text,allowed_special={"<|endoftext|>"})

        for i in range(0,len(token_ids)-max_length, stride):
            input_chunk= token_ids[i:i+max_length]
            target_chunk= token_ids[i+1: i+max_length+1] 
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
        

    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx]



In [6]:
def create_dataloader_v1(text, batch_size=4,max_length=256, stride=128, shuffle=True,drop_last=True):
    tokeniser = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(text,tokeniser,max_length,stride)
    dataloader = DataLoader(dataset, batch_size=batch_size,shuffle=shuffle, drop_last=drop_last)
    return dataloader
    

In [7]:
dataloader = create_dataloader_v1(data, batch_size=2,max_length=4,stride=1, shuffle=False)

In [8]:
data_iter = iter(dataloader)
batch1 = next(data_iter)
print(batch1) #return input_token and target_token

[tensor([[  464,  4643, 11600,    25],
        [ 4643, 11600,    25,  1717]]), tensor([[ 4643, 11600,    25,  1717],
        [11600,    25,  1717,   342]])]


In [9]:
batch2 = next(data_iter)
print(batch2) #return input_token and target_token

[tensor([[11600,    25,  1717,   342],
        [   25,  1717,   342,   854]]), tensor([[   25,  1717,   342,   854],
        [ 1717,   342,   854, 41328]])]


Here, we will understand how the embedding layer is like a lookup.

In [10]:
vocab_size=5
output_dim=2
embedding_layer = nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight.shape)
print(embedding_layer.weight)

torch.Size([5, 2])
Parameter containing:
tensor([[-0.0322,  0.1282],
        [ 0.4242,  1.2052],
        [-0.4361, -0.1566],
        [ 0.7840, -1.3910],
        [-0.1416,  0.9451]], requires_grad=True)


In [11]:
embedding_layer(torch.tensor(3))

tensor([ 0.7840, -1.3910], grad_fn=<EmbeddingBackward0>)

In [12]:
torch.tensor(3)

tensor(3)

In [13]:
embedding_layer(torch.tensor([1,2,3]))

tensor([[ 0.4242,  1.2052],
        [-0.4361, -0.1566],
        [ 0.7840, -1.3910]], grad_fn=<EmbeddingBackward0>)

### Creating Positional Encodings

In [14]:
output_dim = 256
vocab_size = 50257
embedding_layer  = nn.Embedding(vocab_size, output_dim)

In [15]:
max_length = 4
dataloader = create_dataloader_v1(data, batch_size=8,max_length=max_length,stride=max_length, shuffle=False)

In [16]:
data_iter = iter(dataloader)

In [17]:
inputs, targets = next(data_iter)

In [18]:
print(inputs)

tensor([[  464,  4643, 11600,    25],
        [ 1717,   342,   854, 41328],
        [   25, 40417,   198,  3109],
        [ 9213,   422, 11145,   271],
        [ 1668,   319,  3267,  2310],
        [   11, 48609,   198,   198],
        [   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271]])


In [19]:
print(targets)

tensor([[ 4643, 11600,    25,  1717],
        [  342,   854, 41328,    25],
        [40417,   198,  3109,  9213],
        [  422, 11145,   271,  1668],
        [  319,  3267,  2310,    11],
        [48609,   198,   198,    40],
        [  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899]])


In [20]:
token_embeddings  = embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [21]:
# context length here is the maximum lenght of a given sentence.
context_length = max_length
pos_embedding_layer = nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [22]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings)

tensor([[[ 3.6032,  0.4536, -2.1153,  ...,  1.8928,  0.2065,  3.1340],
         [-0.6294, -0.9129,  0.1067,  ..., -0.2031, -1.6383, -1.1708],
         [-1.8529,  0.1328, -1.3398,  ...,  1.2740, -0.6213, -0.3105],
         [-1.3928, -0.5648, -0.3608,  ..., -0.8804,  0.1126,  1.4773]],

        [[ 2.3999,  1.7802, -0.0788,  ...,  2.6048, -0.7550, -0.3089],
         [-0.9203, -0.1721, -1.3905,  ...,  0.6783, -0.2578, -1.3210],
         [ 0.9350, -0.4921, -2.3809,  ..., -1.8527, -0.3522,  0.5418],
         [ 0.6017,  2.0869,  0.0525,  ..., -3.8018,  0.3548,  2.6026]],

        [[ 0.0119, -0.0930, -0.3751,  ...,  2.0448, -1.0513,  2.1434],
         [ 2.0866,  1.0002,  0.3554,  ...,  1.2946, -2.8253,  0.5628],
         [ 0.8257,  1.6432,  0.1596,  ...,  0.2713,  1.1571,  1.6925],
         [ 0.7863,  0.9505,  0.1449,  ..., -1.0515,  0.4334, -0.0519]],

        ...,

        [[ 0.7069,  0.1944, -1.0627,  ...,  0.7843, -1.2283,  0.9006],
         [ 0.3513, -0.9403,  1.3441,  ...,  1.1319, -1.44

In [23]:
inputs = torch.tensor( [[0.43, 0.15, 0.89], [0.55, 0.87, 0.66], 
                        [0.57, 0.85, 0.64], [0.22, 0.58, 0.33], 
                        [0.77, 0.25, 0.10], [0.05, 0.80, 0.55]])

In [24]:
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])

In [25]:
query = inputs[1]
print(query)

tensor([0.5500, 0.8700, 0.6600])


In [26]:
attention_scores_2 = torch.empty(inputs.shape[0])
print(attention_scores_2)

tensor([0., 0., 0., 0., 0., 0.])


In [27]:
for i, x_i in enumerate(inputs):
    attention_scores_2[i] = query @ x_i

print(attention_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [28]:
normalised_attention_scores = attention_scores_2/attention_scores_2.sum()
print(normalised_attention_scores)

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])


In [29]:
naive_softmax = lambda x: torch.exp(x)/torch.exp(x).sum()
normalised_attention_scores  = naive_softmax(attention_scores_2)
print(normalised_attention_scores)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [30]:
normalised_attention_scores  = torch.softmax(attention_scores_2,dim=0)
print(normalised_attention_scores)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [31]:
query = inputs[1]
context_vec_2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vec_2 += normalised_attention_scores[i] * x_i

print(context_vec_2)


tensor([0.4419, 0.6515, 0.5683])


In [32]:
unormalised_attention_score = inputs @ inputs.T
print(unormalised_attention_score)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [33]:
attention_weights = torch.softmax(unormalised_attention_score,dim=0)
print(attention_weights)

tensor([[0.2098, 0.1385, 0.1390, 0.1435, 0.1526, 0.1385],
        [0.2006, 0.2379, 0.2369, 0.2074, 0.1958, 0.2184],
        [0.1981, 0.2333, 0.2326, 0.2046, 0.1975, 0.2128],
        [0.1242, 0.1240, 0.1242, 0.1462, 0.1367, 0.1420],
        [0.1220, 0.1082, 0.1108, 0.1263, 0.1879, 0.0988],
        [0.1452, 0.1581, 0.1565, 0.1720, 0.1295, 0.1896]])


In [34]:
attention_weights[:,0].sum()

tensor(1.0000)

In [35]:
context_vector = attention_weights @ inputs
print(context_vector)

tensor([[0.4017, 0.5023, 0.5059],
        [0.5595, 0.7824, 0.6953],
        [0.5538, 0.7686, 0.6834],
        [0.3369, 0.4647, 0.4119],
        [0.3525, 0.4059, 0.3657],
        [0.3856, 0.5761, 0.5077]])


Self attention with trainable weights

In [57]:
x_2 = inputs[1]
d_in = inputs.shape[1]
d_out = 2 # generally the sizes for input and output are the same but the author suggests different for better understanding.

In [58]:
print(d_in)
print(d_out)

3
2


In [37]:
w_q  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)
w_k  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)
w_v  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)


In [38]:
q_2 = x_2 @ w_q
k_2 = x_2 @ w_k
v_2 = x_2 @ w_v

In [39]:
print(q_2)

tensor([0.6192, 0.7694])


In [40]:
key = inputs @ w_k 
values  = inputs @ w_v

In [41]:
print("keys: ", key)
print("#"*50)
print("values: ", values)

keys:  tensor([[1.1114, 0.7984],
        [1.4699, 1.0890],
        [1.4642, 1.0853],
        [0.7628, 0.5678],
        [0.9499, 0.7126],
        [0.8735, 0.6450]])
##################################################
values:  tensor([[1.1273, 0.3644],
        [1.4172, 0.5973],
        [1.4024, 0.6058],
        [0.7532, 0.2929],
        [0.7414, 0.5892],
        [0.9494, 0.2333]])


In [42]:
key_2 = key[1]
attention_score_22 = q_2.dot(key_2)# key_2.dot(q_2)

In [43]:
print(key_2.shape)
print(q_2.shape)
print(attention_score_22.shape)

torch.Size([2])
torch.Size([2])
torch.Size([])


In [44]:
attention_score_22

tensor(1.7480)

In [45]:
attention_score_2 = q_2 @ key.T # the second element here matches the attention_score_22

Calculating attention scores for all , but I have hurried to do this, the book first suggests we stick with q_2 and first scale and then will come back to the doing all at once.

In [46]:
#query = inputs @ w_q
#print(query.shape)
#print(key.shape)

#attention_scores = query @ key.T
#print(attention_scores.shape)

Lets calculate normalise to get from attention_scores to attention_weights for q_2 first.

In [47]:
d_k = key.shape[-1]
print(d_k)

2


In [48]:
attention_weights_2 = torch.softmax(attention_score_2/d_k**0.5, dim = -1)

In [49]:
attention_weights_2

tensor([0.1610, 0.2207, 0.2197, 0.1219, 0.1432, 0.1335])

In [50]:
attention_scores_2 @ values

tensor([7.4545, 3.0515])

Create a class for extracting the context vector

In [65]:
class SelfAttentionV1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_in = d_in
        self.d_out= d_out
        self.wq = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
        self.wk = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
        self.wv = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
        
    def forward(self, inputs):
        query = inputs @ self.wq
        key= inputs @ self.wk
        value= inputs @ self.wv
        
        attention_score = query @ key.T
        d_k = key.shape[-1]
        attention_weights = torch.softmax(attention_score/d_k ** 0.5, dim = -1)
        context_vector = attention_weights @ value
        return context_vector


In [71]:
torch.manual_seed(123)
self_attention = SelfAttentionV1(3,2)
self_attention(inputs)

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]])