In [12]:
import numpy as np
import math
# Fix NumPy's global RNG seed so the random matrices drawn below are reproducible.
np.random.seed(1)

In [13]:
# Toy problem size: sequence length L, query/key width d_k, value width d_v.
L, d_k, d_v = 4, 8, 8
# Draw q, k and v (in that order) from a standard normal distribution.
q, k, v = (np.random.randn(L, width) for width in (d_k, d_k, d_v))

In [14]:
# Display the randomly drawn query, key and value matrices.
q, k, v

(array([[ 1.62434536, -0.61175641, -0.52817175, -1.07296862,  0.86540763,
         -2.3015387 ,  1.74481176, -0.7612069 ],
        [ 0.3190391 , -0.24937038,  1.46210794, -2.06014071, -0.3224172 ,
         -0.38405435,  1.13376944, -1.09989127],
        [-0.17242821, -0.87785842,  0.04221375,  0.58281521, -1.10061918,
          1.14472371,  0.90159072,  0.50249434],
        [ 0.90085595, -0.68372786, -0.12289023, -0.93576943, -0.26788808,
          0.53035547, -0.69166075, -0.39675353]]),
 array([[-0.6871727 , -0.84520564, -0.67124613, -0.0126646 , -1.11731035,
          0.2344157 ,  1.65980218,  0.74204416],
        [-0.19183555, -0.88762896, -0.74715829,  1.6924546 ,  0.05080775,
         -0.63699565,  0.19091548,  2.10025514],
        [ 0.12015895,  0.61720311,  0.30017032, -0.35224985, -1.1425182 ,
         -0.34934272, -0.20889423,  0.58662319],
        [ 0.83898341,  0.93110208,  0.28558733,  0.88514116, -0.75439794,
          1.25286816,  0.51292982, -0.29809284]]),
 array([[ 0.

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$ 

In [15]:
# Raw (unscaled) attention scores: dot product of every query row with every key row.
q.dot(k.T)

array([[ 0.59372367, -0.94549483, -0.95872768, -2.72188667],
       [ 0.37206766, -6.28431385,  0.66946338, -0.69900135],
       [ 4.19215154,  2.20952829,  0.20885328,  2.14304477],
       [-0.96560313, -2.37462568,  0.01151389, -0.11414552]])

In [18]:
# Compare the variance of the inputs with the variance of their raw dot products.
np.var(q), np.var(k), np.var(q.dot(k.T))

(0.9697668554625956, 0.6929926957288495, 5.153730321939619)

In [21]:
# Divide the raw scores by sqrt(d_k), then compare the variances again.
scaled = q.dot(k.T) / np.sqrt(d_k)
np.var(q), np.var(k), np.var(scaled)

(0.9697668554625956, 0.6929926957288495, 0.6442162902424524)

In [22]:
# Display the scaled score matrix (one row per query, one column per key).
scaled

array([[ 0.20991302, -0.3342829 , -0.33896142, -0.96233226],
       [ 0.13154578, -2.22184047,  0.23669105, -0.2471343 ],
       [ 1.48214939,  0.78118622,  0.07384078,  0.75768074],
       [-0.34139226, -0.83955696,  0.00407078, -0.04035654]])

## Masking

- Masking ensures that a word only gets context from itself and from earlier words, never from words that would be generated in the future.
- It is not required in the encoder, but it is required in the decoder.

In [25]:
# Lower-triangular matrix of ones: row i has a 1 in every column j <= i.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [26]:
# Additive causal mask: 0 on and below the diagonal (allowed positions),
# -inf above it (future positions).
# It is built from scratch with np.where rather than by overwriting `mask`
# in place, so that re-running this cell always gives the same result.
# (With in-place assignment, a second run would turn every entry into -inf.)
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [27]:
# Adding the mask leaves allowed scores unchanged (+0) and sets masked ones to -inf.
scaled + mask

array([[ 0.20991302,        -inf,        -inf,        -inf],
       [ 0.13154578, -2.22184047,        -inf,        -inf],
       [ 1.48214939,  0.78118622,  0.07384078,        -inf],
       [-0.34139226, -0.83955696,  0.00407078, -0.04035654]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [35]:
def softmax(x):
    """Return the softmax of ``x`` computed along its last axis.

    Parameters
    ----------
    x : array_like
        Scores. Entries equal to ``-inf`` receive weight 0 as long as the
        same slice contains at least one finite entry; a slice that is
        entirely ``-inf`` yields NaN.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose slices along the last axis
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; also avoids np.max failing on an empty axis.
        return np.exp(x)
    # Softmax is invariant to subtracting a constant from every entry of a
    # slice, so shift by the maximum to keep np.exp from overflowing.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    # keepdims=True lets the normaliser broadcast for any number of leading
    # axes, so no transposes are needed.
    return exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)

In [36]:
# Row-wise softmax of the masked scores gives the attention weights;
# positions masked with -inf end up with weight 0.
attention = softmax(scaled + mask)
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.91320301, 0.08679699, 0.        , 0.        ],
       [0.57449347, 0.28501037, 0.14049616, 0.        ],
       [0.22875201, 0.13899998, 0.32314512, 0.30910289]])

In [38]:
# Each output row is the attention-weighted sum of the rows of v.
new_v = attention.dot(v)
new_v

array([[ 0.48851815, -0.07557171,  1.13162939,  1.51981682,  2.18557541,
        -1.39649634, -1.44411381, -0.50446586],
       [ 0.46000698,  0.00703651,  1.06080353,  1.21238031,  1.96929645,
        -1.20341895, -1.29879754, -0.39453947],
       [ 0.29502646,  0.17809604,  0.76628409,  0.35438721,  1.19618784,
        -0.54957641, -0.85828004, -0.019585  ],
       [ 0.09980571,  0.38875303,  0.73361113,  0.2563138 ,  0.40547033,
        -0.36333924, -0.38417869,  0.13643589]])

# Function

In [39]:
def softmax(x):
    """Return the softmax of ``x`` computed along its last axis.

    Entries equal to ``-inf`` receive weight 0 as long as the same slice
    contains at least one finite entry; a slice that is entirely ``-inf``
    yields NaN.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; also avoids np.max failing on an empty axis.
        return np.exp(x)
    # Shift by the maximum (softmax is shift-invariant) so np.exp cannot overflow.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)


def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute ``softmax(q @ k.T / sqrt(d_k) + mask) @ v``.

    Parameters
    ----------
    q : numpy.ndarray
        Queries, one per row; the last axis has width ``d_k``.
    k : numpy.ndarray
        Keys, one per row, with the same width as ``q``.
    v : numpy.ndarray
        Values, one row per key.
    mask : numpy.ndarray, optional
        Additive mask that is added to the scaled scores before the softmax
        (0 to keep a position, ``-inf`` to block it). No mask is applied
        when ``None``.

    Returns
    -------
    out : numpy.ndarray
        Attention-weighted combination of the rows of ``v``.
    attention : numpy.ndarray
        Attention weights; each row sums to 1.
    """
    # Take the key width from the query itself instead of relying on a
    # notebook-level ``d_k`` variable, so the function is self-contained.
    d_k = np.shape(q)[-1]
    scaled = np.dot(q, k.T) / np.sqrt(d_k)
    if mask is not None:
        scaled = scaled + mask
    attention = softmax(scaled)
    out = np.dot(attention, v)
    return out, attention

In [41]:
# Run the packaged function on the same inputs and causal mask as the step-by-step cells above.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
values, attention

(array([[ 0.48851815, -0.07557171,  1.13162939,  1.51981682,  2.18557541,
         -1.39649634, -1.44411381, -0.50446586],
        [ 0.46000698,  0.00703651,  1.06080353,  1.21238031,  1.96929645,
         -1.20341895, -1.29879754, -0.39453947],
        [ 0.29502646,  0.17809604,  0.76628409,  0.35438721,  1.19618784,
         -0.54957641, -0.85828004, -0.019585  ],
        [ 0.09980571,  0.38875303,  0.73361113,  0.2563138 ,  0.40547033,
         -0.36333924, -0.38417869,  0.13643589]]),
 array([[1.        , 0.        , 0.        , 0.        ],
        [0.91320301, 0.08679699, 0.        , 0.        ],
        [0.57449347, 0.28501037, 0.14049616, 0.        ],
        [0.22875201, 0.13899998, 0.32314512, 0.30910289]]))