<a href="https://colab.research.google.com/github/space4VV/LLM_trailblazr/blob/main/chapter3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Show which PyTorch version this notebook is running on.
import torch
print(torch.__version__)


2.4.1+cu121


In [2]:
import torch

# Embedded input tokens: one 3-dimensional row vector per token of the
# example sentence "Your journey starts with one step".
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])
print(inputs)

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])


- We use the second input element, x^(2), as the example query for computing its context vector z^(2)
- A context vector is "context"-specific: each one belongs to one particular input element


# Compute the attention weights and context vector for input 2



## Step 1 - Compute the unnormalized attention scores as the dot product between the query and every input token (including the query itself):


In [4]:
# The second token (index 1) serves as the query.
query = inputs[1]
print(query)

# One unnormalized score per input token: its dot product with the query.
attn_scores_2 = torch.empty(len(inputs))
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = x_i.dot(query)
print(attn_scores_2)

tensor([0.5500, 0.8700, 0.6600])
tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


## Step 2 - Normalize the attention scores so that they sum up to 1


In [5]:
# Simplest normalization: divide every score by the total so the weights sum to 1.
attn_weights_2_tmp = attn_scores_2 /attn_scores_2.sum()
print(attn_weights_2_tmp)
# The label says "scores", but the value printed is the sum of the normalized weights.
print("sum of attn scores:",attn_weights_2_tmp.sum())



tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
sum of attn scores: tensor(1.0000)


However, in practice, using the softmax function for normalization, which is better at handling extreme values and has more desirable gradient properties during training, is common and recommended.

In [7]:
# naive softmax
def softmax_naive(x):
  """Softmax along dim 0, written out directly from its definition.

  The raw values are exponentiated without subtracting the maximum first,
  so very large inputs can overflow; this version is for illustration only.

  Args:
    x: Tensor whose entries are normalized along dimension 0.

  Returns:
    Tensor of the same shape as ``x`` whose entries sum to 1 along dim 0.
  """
  exp_x = torch.exp(x)
  return exp_x / exp_x.sum(dim=0)
# Apply the hand-written softmax to the attention scores computed for query 2.
attn_weights_2_naive = softmax_naive(attn_scores_2)
print(attn_weights_2_naive)
# The label says "scores", but the value printed is the sum of the normalized weights.
print("sum of attn scores:",attn_weights_2_naive.sum())


tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
sum of attn scores: tensor(1.)


In [8]:
# In practice it's better to use torch.softmax directly than a hand-written version.
# dim=0 normalizes over the only axis of the 1-D score vector.
atten_weights_2 = torch.softmax(attn_scores_2, dim=0)
print(atten_weights_2)
# The label says "scores", but the value printed is the sum of the normalized weights.
print("sum of attn scores:",atten_weights_2.sum())


tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
sum of attn scores: tensor(1.)


## Step 3 - Compute the context vector by multiplying each embedded input token by its attention weight and summing the resulting vectors

In [13]:
print("inputs:", inputs)
print("attention_weights", atten_weights_2)

# The 2nd token is the query; here only its shape is used, to size the accumulator.
query = inputs[1]
context_vec_2 = torch.zeros(query.shape)

# Accumulate the attention-weighted sum of all input vectors.
for i, x_i in enumerate(inputs):
    context_vec_2 += x_i * atten_weights_2[i]
print("context_vector -", context_vec_2)

inputs: tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])
attention_weights tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
context_vector - tensor([0.4419, 0.6515, 0.5683])


# Compute all attention weights and context vectors

In [15]:
# Pairwise scores: entry [i, j] is the dot product of token i with token j.
attn_scores = torch.empty(len(inputs), len(inputs))
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = x_i.dot(x_j)
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [17]:
# Matrix multiplication instead of the nested for-loops above, to improve performance:
# entry [i, j] of inputs @ inputs.T is the dot product of token i with token j.
attn_scores = inputs @ inputs.T
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [18]:
# Normalize with softmax along dim=1, so the attention weights in each row sum to 1.
attn_weights = torch.softmax(attn_scores, dim=1)
print(attn_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [19]:
# Compute all context vectors at once: row i of the product is the sum of the
# input rows weighted by row i of attn_weights.
all_context_vectors = attn_weights @ inputs
print(all_context_vectors)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


# Implementing self-attention with trainable weights
There are only slight differences compared to the basic attention mechanism introduced earlier:
- The most notable difference is the introduction of weight matrices that are updated during model training
- These trainable weight matrices are crucial so that the model (specifically, the attention module inside the model) can learn to produce "good" context vectors


In [21]:
# We need three weight matrices, one each for queries, keys and values (Q, K, V);
# they project the embedded input tokens into query, key and value vectors.

In [30]:
# Show the input embeddings again for reference.
print(inputs)

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])


In [24]:
# Pick the second input token and set the projection sizes:
# d_in is the width of an input row, d_out the width of the projected vectors.
x_2 = inputs[1]
print(x_2.shape)
d_in = inputs.shape[1]
print("input dim:",d_in)
d_out = 2

torch.Size([3])
input dim: 3


In [27]:
# Initialize the three weight matrices (query, key, value), each of shape (d_in, d_out).
# The seed fixes the random draws; requires_grad=False means no gradients are tracked for them.
torch.manual_seed(123)
w_query = torch.nn.Parameter(torch.randn(d_in, d_out),requires_grad=False)
w_key = torch.nn.Parameter(torch.randn(d_in, d_out),requires_grad=False)
w_value = torch.nn.Parameter(torch.randn(d_in, d_out),requires_grad=False)

In [29]:
# Project the second token into its query, key and value vectors
# (value_2 is computed here but not printed).
query_2 = x_2 @ w_query # _2 because it's with respect to the 2nd input element
key_2 = x_2 @ w_key
value_2 = x_2 @ w_value
print("query:",query_2)
print("key:",key_2)

query: tensor([-1.1729, -0.0048])
key: tensor([-0.1142, -0.7676])


In [31]:
# Project every input token at once: row i of keys / values belongs to token i.
keys = inputs @ w_key
values = inputs @ w_value
print("shape of keys:",keys.shape)
print("shape of values:",values.shape)

shape of keys: torch.Size([6, 2])
shape of values: torch.Size([6, 2])


In [35]:
# Step 2: compute an unnormalized attention score as the dot product between
# the query and a key vector -- here, query 2 with the key of the same token.

keys_2 = keys[1]
print("keys2:",keys_2)
print("query2:",query_2)
attn_score_22 = torch.dot(query_2, keys_2)
print("attention score:",attn_score_22)

keys2: tensor([-0.1142, -0.7676])
query2: tensor([-1.1729, -0.0048])
attention score: tensor(0.1376)


In [33]:
# Scores of query 2 against every key in a single matrix product.
# Note: this overwrites the attn_scores_2 computed earlier without weight matrices.
attn_scores_2 = query_2 @ keys.T
print(attn_scores_2)

tensor([ 0.2172,  0.1376,  0.1730, -0.0491,  0.7616, -0.3809])


In [37]:
# Compute attention weights: divide the scores by sqrt(d_k), where d_k is the
# key dimension, then apply softmax.
d_k = keys.shape[1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)
print(attn_weights_2)

tensor([0.1704, 0.1611, 0.1652, 0.1412, 0.2505, 0.1117])


In [38]:
# Step 4: compute the context vector for query 2 as the attention-weighted sum of the value vectors.

context_vec_2 = attn_weights_2 @ values
print(context_vec_2)



tensor([0.2854, 0.4081])


## Self attention class


In [41]:
import torch.nn as nn
class SelfAttention_V1(nn.Module):
  """Single-head scaled dot-product self-attention using raw parameter matrices.

  Args:
    d_in: Size of each input token vector.
    d_out: Size of the projected query/key/value vectors and of each output row.
  """

  def __init__(self, d_in, d_out):
    super().__init__()
    # Keep the creation order (query, key, value): under a fixed seed it
    # determines which random draws each matrix receives.
    self.w_query = nn.Parameter(torch.rand(d_in, d_out))
    self.w_key = nn.Parameter(torch.rand(d_in, d_out))
    self.w_value = nn.Parameter(torch.rand(d_in, d_out))

  def forward(self, x):
    """Compute one context vector per input token.

    Args:
      x: Tensor of shape ``(num_tokens, d_in)``, optionally with leading
        batch dimensions, i.e. ``(..., num_tokens, d_in)``.

    Returns:
      Tensor of shape ``(..., num_tokens, d_out)``.
    """
    keys = x @ self.w_key
    values = x @ self.w_value
    queries = x @ self.w_query
    # Swap only the last two dimensions. ``keys.T`` reverses *all* dimensions,
    # which is only correct for 2-D input and breaks batched input.
    attn_scores = queries @ keys.transpose(-2, -1)  # omega
    # Scale by sqrt(d_out) before the softmax over the key axis.
    attn_weights = torch.softmax(
        attn_scores / keys.shape[-1]**0.5, dim=-1
    )

    context_vec = attn_weights @ values
    return context_vec

# Seed before constructing the module so its random parameter initialization is repeatable.
torch.manual_seed(123)
self_attn_v1 = SelfAttention_V1(d_in, d_out)
print(self_attn_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [45]:
# new class using pytorch linear layer
class SelfAttention_V2(nn.Module):
  """Single-head scaled dot-product self-attention built from ``nn.Linear`` layers.

  Args:
    d_in: Size of each input token vector.
    d_out: Size of the projected query/key/value vectors and of each output row.
    qkv_bias: Whether the three projection layers include a bias term.
  """

  def __init__(self, d_in, d_out,qkv_bias=False):
    super().__init__()
    # Keep the creation order (key, value, query): under a fixed seed it
    # determines which random draws each layer receives.
    self.linear_k = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.linear_v = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.linear_q = nn.Linear(d_in, d_out, bias=qkv_bias)

  def forward(self, x):
    """Compute one context vector per input token.

    Args:
      x: Tensor of shape ``(num_tokens, d_in)``, optionally with leading
        batch dimensions, i.e. ``(..., num_tokens, d_in)``.

    Returns:
      Tensor of shape ``(..., num_tokens, d_out)``.
    """
    keys = self.linear_k(x)
    values = self.linear_v(x)
    queries = self.linear_q(x)
    # Swap only the last two dimensions. ``keys.T`` reverses *all* dimensions,
    # which is only correct for 2-D input and breaks batched input.
    attn_scores = queries @ keys.transpose(-2, -1)  # omega
    # Scale by sqrt(d_out) before the softmax over the key axis.
    attn_weights = torch.softmax(
        attn_scores / keys.shape[-1]**0.5, dim=-1
    )

    context_vec = attn_weights @ values
    return context_vec

# Seed before constructing the module so its random layer initialization is repeatable.
torch.manual_seed(789)
self_attn_v2 = SelfAttention_V2(d_in, d_out)
print(self_attn_v2(inputs))

tensor([[ 0.0266, -0.3275],
        [ 0.0271, -0.3267],
        [ 0.0272, -0.3268],
        [ 0.0264, -0.3277],
        [ 0.0275, -0.3294],
        [ 0.0261, -0.3266]], grad_fn=<MmBackward0>)
