# Self Attention in Transformers

## 1. Generate Data using numpy
## 2. Self Attention and Softmax Functions

$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$ 

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

- Here Q represents the Query, K represents the Key and V represents the Value.
- In simple terms, the Query is what is being asked for, the Keys are the candidate answers it is matched against, and the Values are the actual answers.
## 3. Masking

- This is to ensure words don't get context from words generated in the future. 
- Not required in the encoders, but required in the decoders.




In [20]:
# Generating sample data
import numpy as np
import math

# Seed the generator so that "Restart Kernel -> Run All" reproduces the same
# Q, K and V every time instead of drawing fresh random values.
SEED = 42
rng = np.random.default_rng(SEED)

# L is the number of rows (sequence positions) in each matrix;
# d_k is the width of each query/key row and d_v the width of each value row.
L, d_k, d_v = 4, 6, 6
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

In [21]:
# Print each input matrix underneath its label.
for label, matrix in (("Q\n", q), ("K\n", k), ("V\n", v)):
  print(label, matrix)

Q
 [[-0.74997175 -1.48471274  0.16157109  1.14423135  0.34306808 -1.05432311]
 [ 0.55887059  0.70326087 -0.73438231  0.23873458  0.72512726 -1.23835805]
 [-1.01178833  1.5195689  -0.00756557 -0.28695223  1.4458824   0.73084787]
 [ 0.52728646  1.19624081  1.14599103  1.22874849 -0.09537591  0.19840448]]
K
 [[-0.01048114 -0.9534428  -1.50099221 -0.3175654  -1.70667591 -0.11072636]
 [ 0.30677411 -0.51819746 -1.51520262 -2.57945209 -0.16658498  0.48554307]
 [ 0.71328879 -0.14974623 -0.06419487  0.13252368 -1.41101419  0.7821332 ]
 [ 0.45202161 -0.97065668 -1.41170921 -0.23262226  1.05515032 -0.1203795 ]]
V
 [[ 1.86884234  0.13630357 -0.26468617 -0.52582364 -1.79191896 -0.76645068]
 [ 0.10983438  1.23890184 -1.1168217  -0.48865678 -0.12252874  0.65741124]
 [-0.05866044  1.23699223 -0.77772799  0.210061   -0.84459206 -0.41297181]
 [-0.66113363 -0.37446685  0.79213643 -0.52445988  0.00974803 -0.18244311]]


In [22]:
# Unscaled dot products between every row of q and every row of k.
q @ k.T

array([[ 0.34879933, -3.2260697 , -1.48004563,  1.0967851 ],
       [-0.75032667, -0.41811886, -1.61961878,  1.46538719],
       [-3.88431215, -0.23218848, -2.45532947, -0.41725384],
       [-3.1155982 , -5.25181548,  0.57600198, -2.95095434]])

In [23]:
# Variance of the inputs next to the variance of their unscaled product.
q.var(), k.var(), (q @ k.T).var()

(0.7619167478253632, 0.8012021019549399, 3.498903972683273)

In [24]:
# Divide the dot products by sqrt(d_k), then compare the variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(0.7619167478253632, 0.8012021019549399, 0.5831506621138789)

Dividing by $\sqrt{d_k}$ reduces the variance of the product, bringing it back to roughly the same scale as Q and K.

In [25]:
# Display the scaled dot-product scores computed in the previous cell.
scaled

array([[ 0.14239673, -1.31703744, -0.6042261 ,  0.44776064],
       [-0.30631958, -0.17069631, -0.6612066 ,  0.59824181],
       [-1.5857638 , -0.09479055, -1.00238406, -0.17034317],
       [-1.27193764, -2.14404469,  0.23515183, -1.20472207]])

In [27]:
# Masking the data: start from an L x L matrix with ones on and below the
# diagonal and zeros above it.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [28]:
# Rebuild the 0/1 lower-triangular matrix here so this cell gives the same
# result however many times it is run; converting the existing `mask` in place
# would turn every entry into -inf on a second run.
mask = np.tril(np.ones((L, L)))
# Positions above the diagonal become -inf and the remaining ones become 0, so
# the mask can simply be added to the scores. np.inf is used because the
# np.infty alias was removed in NumPy 2.0.
mask[mask == 0] = -np.inf
mask[mask == 1] = 0

In [29]:
# Display the additive mask: 0 on and below the diagonal, -inf above it.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [30]:
# Adding the mask leaves scores on and below the diagonal unchanged and turns
# every score above the diagonal into -inf.
scaled + mask

array([[ 0.14239673,        -inf,        -inf,        -inf],
       [-0.30631958, -0.17069631,        -inf,        -inf],
       [-1.5857638 , -0.09479055, -1.00238406,        -inf],
       [-1.27193764, -2.14404469,  0.23515183, -1.20472207]])

In [31]:
def softmax(x):
  """Return the softmax of ``x`` computed along its last axis.

  Each slice along the last axis is exponentiated and divided by its own sum,
  so every such slice of the result sums to 1. Entries equal to ``-inf``
  (for example masked attention scores) come out as exactly 0.

  Args:
    x: Array-like of scores with at least one dimension. Any leading
      dimensions are treated as independent batches.

  Returns:
    A NumPy array with the same shape as ``x``.

  Note:
    A slice made up entirely of ``-inf`` has no defined softmax and yields
    NaN for that slice.
  """
  x = np.asarray(x)
  # Subtracting the per-row maximum does not change the result mathematically
  # but keeps np.exp from overflowing to inf (and the ratio from becoming NaN).
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims lets the row sums broadcast against inputs of any rank, which the
  # transpose-based division could not do for more than two dimensions.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [32]:
# Turn the masked scores into attention weights with the softmax defined above.
attention = softmax(scaled + mask)

In [33]:
# Display the attention weights produced by the softmax.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.46614606, 0.53385394, 0.        , 0.        ],
       [0.13824564, 0.61400641, 0.24774794, 0.        ],
       [0.14283326, 0.05971426, 0.64468859, 0.15276389]])

In [34]:
# Multiply the attention weights by v: each output row is a weighted
# combination of the rows of v.
new_v = attention @ v
new_v

array([[ 1.86884234,  0.13630357, -0.26468617, -0.52582364, -1.79191896,
        -0.76645068],
       [ 0.92978901,  0.72493   , -0.71960208, -0.50598196, -0.90070841,
        -0.00631638],
       [ 0.31126532,  1.08599933, -0.91500791, -0.32068904, -0.53220437,
         0.19538334],
       [ 0.13467646,  0.83371856, -0.4848787 , -0.04897948, -0.80627206,
        -0.36432675]])

In [35]:
# Display the original v for comparison with new_v.
v

array([[ 1.86884234,  0.13630357, -0.26468617, -0.52582364, -1.79191896,
        -0.76645068],
       [ 0.10983438,  1.23890184, -1.1168217 , -0.48865678, -0.12252874,
         0.65741124],
       [-0.05866044,  1.23699223, -0.77772799,  0.210061  , -0.84459206,
        -0.41297181],
       [-0.66113363, -0.37446685,  0.79213643, -0.52445988,  0.00974803,
        -0.18244311]])

In [36]:
def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute scaled dot-product attention for 2-D q, k and v.

  The scores ``q @ k.T`` are divided by the square root of the last dimension
  of ``q``, optionally shifted by an additive ``mask``, and passed through
  ``softmax``; the resulting weights are then multiplied with ``v``.

  Args:
    q: Query array.
    k: Key array; its transpose is multiplied with ``q``.
    v: Value array, multiplied by the attention weights.
    mask: Optional array added to the scaled scores before the softmax.
      ``None`` (the default) applies no mask.

  Returns:
    A tuple ``(out, attention)``: the attention-weighted values and the
    attention weights themselves.
  """
  key_dim = q.shape[-1]
  scores = (q @ k.T) / math.sqrt(key_dim)
  if mask is not None:
    scores = scores + mask
  weights = softmax(scores)
  return weights @ v, weights

In [37]:
# Run the complete attention function with the mask, then print the inputs
# followed by the results, each under its label.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for label, matrix in (
  ("Q\n", q),
  ("K\n", k),
  ("V\n", v),
  ("New V\n", values),
  ("Attention\n", attention),
):
  print(label, matrix)

Q
 [[-0.74997175 -1.48471274  0.16157109  1.14423135  0.34306808 -1.05432311]
 [ 0.55887059  0.70326087 -0.73438231  0.23873458  0.72512726 -1.23835805]
 [-1.01178833  1.5195689  -0.00756557 -0.28695223  1.4458824   0.73084787]
 [ 0.52728646  1.19624081  1.14599103  1.22874849 -0.09537591  0.19840448]]
K
 [[-0.01048114 -0.9534428  -1.50099221 -0.3175654  -1.70667591 -0.11072636]
 [ 0.30677411 -0.51819746 -1.51520262 -2.57945209 -0.16658498  0.48554307]
 [ 0.71328879 -0.14974623 -0.06419487  0.13252368 -1.41101419  0.7821332 ]
 [ 0.45202161 -0.97065668 -1.41170921 -0.23262226  1.05515032 -0.1203795 ]]
V
 [[ 1.86884234  0.13630357 -0.26468617 -0.52582364 -1.79191896 -0.76645068]
 [ 0.10983438  1.23890184 -1.1168217  -0.48865678 -0.12252874  0.65741124]
 [-0.05866044  1.23699223 -0.77772799  0.210061   -0.84459206 -0.41297181]
 [-0.66113363 -0.37446685  0.79213643 -0.52445988  0.00974803 -0.18244311]]
New V
 [[ 1.86884234  0.13630357 -0.26468617 -0.52582364 -1.79191896 -0.76645068]
 [ 0.9