In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from pathlib import Path
import re
import tiktoken

In [2]:
def read_data(filepath: Path):
    with open(filepath, "r", encoding="utf-8") as f:
        raw_text = f.read()
    return raw_text


In [3]:
data = read_data("../resources/verdict.txt")

In [4]:
for i in range(0,10,2):
    print(i)

0
2
4
6
8


In [5]:
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokeniser, max_length, stride):
        self.tokeniser = tokeniser
        self.input_ids = []
        self.target_ids=[]

        token_ids = tokeniser.encode(text,allowed_special={"<|endoftext|>"})

        for i in range(0,len(token_ids)-max_length, stride):
            input_chunk= token_ids[i:i+max_length]
            target_chunk= token_ids[i+1: i+max_length+1] 
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
        

    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx]



In [6]:
def create_dataloader_v1(text, batch_size=4,max_length=256, stride=128, shuffle=True,drop_last=True):
    tokeniser = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(text,tokeniser,max_length,stride)
    dataloader = DataLoader(dataset, batch_size=batch_size,shuffle=shuffle, drop_last=drop_last)
    return dataloader
    

In [7]:
dataloader = create_dataloader_v1(data, batch_size=2,max_length=4,stride=1, shuffle=False)

In [8]:
data_iter = iter(dataloader)
batch1 = next(data_iter)
print(batch1) #return input_token and target_token

[tensor([[  464,  4643, 11600,    25],
        [ 4643, 11600,    25,  1717]]), tensor([[ 4643, 11600,    25,  1717],
        [11600,    25,  1717,   342]])]


In [9]:
batch2 = next(data_iter)
print(batch2) #return input_token and target_token

[tensor([[11600,    25,  1717,   342],
        [   25,  1717,   342,   854]]), tensor([[   25,  1717,   342,   854],
        [ 1717,   342,   854, 41328]])]


Here, we will understand how the embedding layer is like a lookup.

In [10]:
vocab_size=5
output_dim=2
embedding_layer = nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight.shape)
print(embedding_layer.weight)

torch.Size([5, 2])
Parameter containing:
tensor([[ 0.5955,  1.8261],
        [-0.8714,  1.0945],
        [-0.2039, -1.3195],
        [ 0.5021, -0.3470],
        [ 1.9123, -1.1638]], requires_grad=True)


In [11]:
embedding_layer(torch.tensor(3))

tensor([ 0.5021, -0.3470], grad_fn=<EmbeddingBackward0>)

In [12]:
torch.tensor(3)

tensor(3)

In [13]:
embedding_layer(torch.tensor([1,2,3]))

tensor([[-0.8714,  1.0945],
        [-0.2039, -1.3195],
        [ 0.5021, -0.3470]], grad_fn=<EmbeddingBackward0>)

### Creating Positional Encodings

In [14]:
output_dim = 256
vocab_size = 50257
embedding_layer  = nn.Embedding(vocab_size, output_dim)

In [15]:
max_length = 4
dataloader = create_dataloader_v1(data, batch_size=8,max_length=max_length,stride=max_length, shuffle=False)

In [16]:
data_iter = iter(dataloader)

In [17]:
inputs, targets = next(data_iter)

In [18]:
print(inputs)

tensor([[  464,  4643, 11600,    25],
        [ 1717,   342,   854, 41328],
        [   25, 40417,   198,  3109],
        [ 9213,   422, 11145,   271],
        [ 1668,   319,  3267,  2310],
        [   11, 48609,   198,   198],
        [   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271]])


In [19]:
print(targets)

tensor([[ 4643, 11600,    25,  1717],
        [  342,   854, 41328,    25],
        [40417,   198,  3109,  9213],
        [  422, 11145,   271,  1668],
        [  319,  3267,  2310,    11],
        [48609,   198,   198,    40],
        [  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899]])


In [20]:
token_embeddings  = embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [21]:
# context length here is the maximum lenght of a given sentence.
context_length = max_length
pos_embedding_layer = nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [22]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings)

tensor([[[-1.1230, -2.7800,  1.7692,  ..., -0.4179,  0.9727,  1.7178],
         [-1.3568,  0.7412,  0.3651,  ...,  0.7839, -0.4573, -0.1322],
         [ 0.6171,  1.7705, -2.0805,  ..., -2.0243,  1.4302,  1.2133],
         [ 1.2763,  0.5105,  1.0526,  ..., -0.5032,  0.6029,  2.2067]],

        [[-0.1252, -1.0589, -1.1794,  ...,  0.2569,  1.2510,  1.7167],
         [-1.2667, -0.6551,  0.0868,  ..., -0.5351, -0.4735, -0.2435],
         [ 0.8123,  0.8743,  0.1137,  ...,  0.1016,  1.8303,  0.9296],
         [-1.4103, -0.7671, -1.3170,  ...,  1.1577, -0.4392,  1.9551]],

        [[ 1.8138, -1.7748,  0.6231,  ..., -2.4029,  1.1416,  0.9481],
         [-0.8753, -0.8782,  1.4461,  ..., -0.6215, -0.9858,  1.1643],
         [-1.2581,  0.4101,  0.9407,  ..., -2.1274,  1.2045,  0.8311],
         [-1.5351,  0.7415,  1.8753,  ...,  2.0043, -0.6440,  0.9900]],

        ...,

        [[-0.0822, -2.8682, -0.7680,  ..., -0.6822,  2.2024,  0.7272],
         [-0.0632, -2.1329, -0.3574,  ..., -0.2914, -0.67

In [23]:
inputs = torch.tensor( [[0.43, 0.15, 0.89], [0.55, 0.87, 0.66], 
                        [0.57, 0.85, 0.64], [0.22, 0.58, 0.33], 
                        [0.77, 0.25, 0.10], [0.05, 0.80, 0.55]])

In [24]:
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])

In [25]:
query = inputs[1]
print(query)

tensor([0.5500, 0.8700, 0.6600])


In [26]:
attention_scores_2 = torch.empty(inputs.shape[0])
print(attention_scores_2)

tensor([0., 0., 0., 0., 0., 0.])


In [27]:
for i, x_i in enumerate(inputs):
    attention_scores_2[i] = query @ x_i

print(attention_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [28]:
normalised_attention_scores = attention_scores_2/attention_scores_2.sum()
print(normalised_attention_scores)

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])


In [29]:
naive_softmax = lambda x: torch.exp(x)/torch.exp(x).sum()
normalised_attention_scores  = naive_softmax(attention_scores_2)
print(normalised_attention_scores)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [30]:
normalised_attention_scores  = torch.softmax(attention_scores_2,dim=0)
print(normalised_attention_scores)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [31]:
query = inputs[1]
context_vec_2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vec_2 += normalised_attention_scores[i] * x_i

print(context_vec_2)


tensor([0.4419, 0.6515, 0.5683])


In [32]:
unormalised_attention_score = inputs @ inputs.T
print(unormalised_attention_score)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [33]:
attention_weights = torch.softmax(unormalised_attention_score,dim=0)
print(attention_weights)

tensor([[0.2098, 0.1385, 0.1390, 0.1435, 0.1526, 0.1385],
        [0.2006, 0.2379, 0.2369, 0.2074, 0.1958, 0.2184],
        [0.1981, 0.2333, 0.2326, 0.2046, 0.1975, 0.2128],
        [0.1242, 0.1240, 0.1242, 0.1462, 0.1367, 0.1420],
        [0.1220, 0.1082, 0.1108, 0.1263, 0.1879, 0.0988],
        [0.1452, 0.1581, 0.1565, 0.1720, 0.1295, 0.1896]])


In [34]:
attention_weights[:,0].sum()

tensor(1.0000)

In [35]:
context_vector = attention_weights @ inputs
print(context_vector)

tensor([[0.4017, 0.5023, 0.5059],
        [0.5595, 0.7824, 0.6953],
        [0.5538, 0.7686, 0.6834],
        [0.3369, 0.4647, 0.4119],
        [0.3525, 0.4059, 0.3657],
        [0.3856, 0.5761, 0.5077]])


Self attention with trainable weights

In [36]:
x_2 = inputs[1]
d_in = inputs.shape[1]
d_out = 2 # generally the sizes for input and output are the same but the author suggests different for better understanding.

In [37]:
print(d_in)
print(d_out)

3
2


In [38]:
w_q  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)
w_k  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)
w_v  =nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)


In [39]:
q_2 = x_2 @ w_q
k_2 = x_2 @ w_k
v_2 = x_2 @ w_v

In [40]:
print(q_2)

tensor([1.3363, 0.9772])


In [41]:
key = inputs @ w_k 
values  = inputs @ w_v

In [42]:
print("keys: ", key)
print("#"*50)
print("values: ", values)

keys:  tensor([[0.2965, 0.9540],
        [0.8224, 1.6327],
        [0.8168, 1.6209],
        [0.4857, 0.9078],
        [0.4852, 0.9520],
        [0.5681, 1.0806]])
##################################################
values:  tensor([[0.9293, 0.7429],
        [0.9828, 0.7662],
        [0.9878, 0.7627],
        [0.4553, 0.3731],
        [0.8001, 0.4856],
        [0.4468, 0.4372]])


In [43]:
key_2 = key[1]
attention_score_22 = q_2.dot(key_2)# key_2.dot(q_2)

In [44]:
print(key_2.shape)
print(q_2.shape)
print(attention_score_22.shape)

torch.Size([2])
torch.Size([2])
torch.Size([])


In [45]:
attention_score_22

tensor(2.6945)

In [46]:
attention_score_2 = q_2 @ key.T # the second element here matches the attention_score_22

Calculating attention scores for all , but I have hurried to do this, the book first suggests we stick with q_2 and first scale and then will come back to the doing all at once.

In [47]:
#query = inputs @ w_q
#print(query.shape)
#print(key.shape)

#attention_scores = query @ key.T
#print(attention_scores.shape)

Lets calculate normalise to get from attention_scores to attention_weights for q_2 first.

In [48]:
d_k = key.shape[-1]
print(d_k)

2


In [49]:
attention_weights_2 = torch.softmax(attention_score_2/d_k**0.5, dim = -1)

In [50]:
attention_weights_2

tensor([0.1002, 0.2632, 0.2597, 0.1160, 0.1196, 0.1413])

In [51]:
attention_scores_2 @ values

tensor([5.2488, 4.1128])

Create a class for extracting the context vector

In [52]:
class SelfAttentionV1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_in = d_in
        self.d_out= d_out
        self.wq = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
        self.wk = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
        self.wv = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
        
    def forward(self, inputs):
        query = inputs @ self.wq
        key= inputs @ self.wk
        value= inputs @ self.wv
        
        attention_score = query @ key.T
        d_k = key.shape[-1]
        attention_weights = torch.softmax(attention_score/d_k ** 0.5, dim = -1)
        context_vector = attention_weights @ value
        return context_vector


In [53]:
torch.manual_seed(123)
self_attention_v1 = SelfAttentionV1(3,2)
self_attention_v1(inputs)

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]])

In [54]:

class SelfAttentionV2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias = False):
        super().__init__()
        self.d_in = d_in
        self.d_out= d_out
        self.wq = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.wk = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.wv = nn.Linear(d_in,d_out, bias=qkv_bias)
        
    def forward(self, inputs):
        query = self.wq(inputs)
        key= self.wk(inputs)
        value= self.wv(inputs)
        
        attention_score = query @ key.T
        d_k = key.shape[-1]
        attention_weights = torch.softmax(attention_score/d_k ** 0.5, dim = -1)
        context_vector = attention_weights @ value
        return context_vector

In [55]:
torch.manual_seed(789)
self_attention_v2 = SelfAttentionV2(3,2)
result = self_attention_v2(inputs)
print(result)

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


In [56]:
wq = self_attention_v2.state_dict()["wq.weight"]
wk = self_attention_v2.state_dict()["wk.weight"]
wv = self_attention_v2.state_dict()["wv.weight"]

In [57]:
self_attention_v1.wq = nn.Parameter(wq.T)
self_attention_v1.wk = nn.Parameter(wk.T)
self_attention_v1.wv = nn.Parameter(wv.T)


In [58]:

# here we have take the weights from v2 and applied to v2 and we can see they are essentially doing the operations, 
# we can see outputs are the same once we copy the weight matrices.
self_attention_v1(inputs)

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)

### Masked Attention

In [None]:
query = self_attention_v2.wq(inputs)
keys = self_attention_v2.wv(inputs)
attention_scores = query @ keys.T

print(attention_scores.shape)

torch.Size([6, 6])


In [73]:
print(attention_scores)
print("#"*50)
print(torch.softmax(attention_scores,dim=-1))


tensor([[-0.0634, -0.0907, -0.0862, -0.0597,  0.0188, -0.1059],
        [-0.0921, -0.1375, -0.1340, -0.0818, -0.0332, -0.1204],
        [-0.0908, -0.1357, -0.1322, -0.0806, -0.0332, -0.1185],
        [-0.0514, -0.0774, -0.0757, -0.0452, -0.0240, -0.0642],
        [-0.0420, -0.0635, -0.0623, -0.0367, -0.0229, -0.0507],
        [-0.0672, -0.1005, -0.0980, -0.0596, -0.0254, -0.0873]],
       grad_fn=<MmBackward0>)
##################################################
tensor([[0.1667, 0.1622, 0.1630, 0.1673, 0.1810, 0.1598],
        [0.1679, 0.1604, 0.1610, 0.1696, 0.1780, 0.1632],
        [0.1678, 0.1605, 0.1610, 0.1696, 0.1778, 0.1633],
        [0.1675, 0.1632, 0.1634, 0.1685, 0.1721, 0.1653],
        [0.1674, 0.1638, 0.1640, 0.1683, 0.1706, 0.1659],
        [0.1676, 0.1621, 0.1625, 0.1689, 0.1747, 0.1642]],
       grad_fn=<SoftmaxBackward0>)


In [74]:
attention_weights = torch.softmax(attention_scores/keys.shape[-1]**0.5, dim=-1)

In [None]:
context_length = torch.tensor(attention_weights.shape[0]) # size 6
print(type(context_length))

<class 'torch.Tensor'>


In [86]:
simple_mask=torch.tril(torch.ones(context_length,context_length))
print(simple_mask)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [87]:
masked_weights = attention_weights * simple_mask
print(masked_weights)

tensor([[0.1667, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1675, 0.1622, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1675, 0.1623, 0.1627, 0.0000, 0.0000, 0.0000],
        [0.1672, 0.1642, 0.1644, 0.1680, 0.0000, 0.0000],
        [0.1672, 0.1646, 0.1648, 0.1678, 0.1694, 0.0000],
        [0.1673, 0.1634, 0.1637, 0.1682, 0.1723, 0.1650]],
       grad_fn=<MulBackward0>)


In [91]:
masked_sum = masked_weights.sum(dim=1,keepdim=True)
print(masked_sum)

tensor([[0.1667],
        [0.3298],
        [0.4925],
        [0.6638],
        [0.8339],
        [1.0000]], grad_fn=<SumBackward1>)


In [93]:
mask_weights_normalised = masked_weights/masked_sum
print(mask_weights_normalised)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5080, 0.4920, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3401, 0.3295, 0.3303, 0.0000, 0.0000, 0.0000],
        [0.2519, 0.2474, 0.2477, 0.2530, 0.0000, 0.0000],
        [0.2005, 0.1975, 0.1976, 0.2012, 0.2032, 0.0000],
        [0.1673, 0.1634, 0.1637, 0.1682, 0.1723, 0.1650]],
       grad_fn=<DivBackward0>)


In [100]:

mask= torch.triu(torch.ones(context_length,context_length),diagonal=1)
masked_attention_scores = attention_scores.masked_fill(mask.bool(),-torch.inf)
print(masked_attention_scores)

tensor([[-0.0634,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.0921, -0.1375,    -inf,    -inf,    -inf,    -inf],
        [-0.0908, -0.1357, -0.1322,    -inf,    -inf,    -inf],
        [-0.0514, -0.0774, -0.0757, -0.0452,    -inf,    -inf],
        [-0.0420, -0.0635, -0.0623, -0.0367, -0.0229,    -inf],
        [-0.0672, -0.1005, -0.0980, -0.0596, -0.0254, -0.0873]],
       grad_fn=<MaskedFillBackward0>)


In [101]:
attention_weights = torch.softmax(masked_attention_scores/keys.shape[-1]**0.5,dim=1)
print(attention_weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5080, 0.4920, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3401, 0.3295, 0.3303, 0.0000, 0.0000, 0.0000],
        [0.2519, 0.2474, 0.2477, 0.2530, 0.0000, 0.0000],
        [0.2005, 0.1975, 0.1976, 0.2012, 0.2032, 0.0000],
        [0.1673, 0.1634, 0.1637, 0.1682, 0.1723, 0.1650]],
       grad_fn=<SoftmaxBackward0>)


In [102]:
attention_weights @ values

tensor([[0.9293, 0.7429],
        [0.9556, 0.7544],
        [0.9663, 0.7571],
        [0.8371, 0.6600],
        [0.8298, 0.6247],
        [0.7660, 0.5930]], grad_fn=<MmBackward0>)

In [None]:
"""
Generally our d_in and d_out is the same , representing d_model. 
However, the author has given different dimensions for illustration purposes.
We start with vocab_size and d_model in this case we started with [6,(3,2)], as mentioned above (3,2) is generally 
represented by a single number d_model. But in our case values represent 6,2 shape , the 2 here is the d_out.
"""
values.shape 

torch.Size([6, 2])