In [4]:
import numpy as np
import math

# Toy problem size: L rows per matrix, query/key vectors of width d_k,
# value vectors of width d_v.
L = 4
d_k = 8
d_v = 8

# Random stand-ins for the query, key and value matrices.
# No seed is set, so the numbers differ on every fresh kernel.
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

In [7]:
# Show the three input matrices, each preceded by its label.
for label, matrix in (("Q: ", q), ("\nK: ", k), ("\nV: ", v)):
  print(label, matrix)

Q:  [[ 0.98205942 -0.50755003  0.47323046  0.10986494  0.0447011  -0.04533553
   1.28116101 -0.1757931 ]
 [ 0.14364862 -0.76884773  0.26990682  2.33779622  0.99162073 -0.20263487
  -1.25577631 -0.02678597]
 [ 0.13779342  1.84631505  0.25225605  0.01757992  0.93741477  0.16007054
   0.48772065 -0.21236113]
 [ 0.10697245 -1.84942673  0.5812828  -0.32358033  0.68510351  0.80408038
  -3.05743139  1.46230043]]

K:  [[ 0.47533767 -1.43451379 -0.28773468  0.91437225 -1.37915891  0.72313008
  -1.87916793  0.3201954 ]
 [-1.95823135  0.80544231 -0.81318788  0.18794265 -1.57861441 -1.60963002
  -1.49455126 -0.64197819]
 [-0.04249592 -0.01675876 -1.0202719   1.36645528  2.06222298  0.16856864
  -0.96584488  0.64269616]
 [ 0.09575021  0.81083233 -0.24131169 -2.11070173 -0.54358969  0.4815727
   1.32580471 -1.04449218]]

V:  [[-1.41227385 -0.10021945  0.67363302 -0.47450307 -0.26607726  0.21561015
   0.56622529  0.56259403]
 [-0.63489962 -0.50304537 -0.70390156 -0.7501038   1.2129899  -0.44650598
  

# Self Attention

$$
\text{Self Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

$$
\text{New V} = \text{attention weights} \cdot V, \qquad \text{attention weights} = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)
$$

In [12]:
# Variance taken over every entry of q.
np.var(q)

np.float64(1.372748955520124)

In [13]:
# Variance taken over every entry of k.
np.var(k)

np.float64(0.959591576712673)

In [15]:
# Variance of the raw, unscaled dot products between queries and keys.
np.var(q @ k.T)

np.float64(11.624958798722634)

In [9]:
# The raw scores Q.K^T tend to have a large variance,
# so they are divided by sqrt(d_k) to bring it back down.
scaled = (q @ k.T) / math.sqrt(d_k)
scaled

array([[-0.49463825, -1.58942635, -0.57691735,  0.41452674],
       [ 1.43834804, -0.00914805,  2.16811354, -2.78696558],
       [-1.69747078, -0.46478899,  0.38270379,  0.65344299],
       [ 2.86059521, -0.34565262,  1.56709779, -2.30259829]])

In [10]:
# Variance of the scores after dividing by sqrt(d_k).
np.var(scaled)

np.float64(2.3440784950138314)

The variance is now reduced: dividing the scores by $\sqrt{d_k}$ shrinks their variance by a factor of $d_k$.

# Masking



*   This is to ensure that words don't get context from words generated in the future.
*   Not required in the encoder, but required in the decoder.



In [11]:
# Lower-triangular matrix of ones: row i has a 1 in columns 0..i
# and a 0 in every later column.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [13]:
# Build the additive causal mask in a single expression: 0 where a position
# may be attended to (on or below the diagonal), -inf where it may not.
# It is rebuilt from L rather than rewritten in place, so re-running this
# cell always gives the same result. The in-place version turned the
# mask into all -inf on a second run, because the 0 entries it had just
# written matched `mask == 0` again.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [14]:
# Additive form of the mask: 0 keeps a position, -inf blocks it.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [15]:
# Adding the mask leaves allowed scores unchanged and sends blocked ones to -inf.
np.add(scaled, mask)

array([[-0.49463825,        -inf,        -inf,        -inf],
       [ 1.43834804, -0.00914805,        -inf,        -inf],
       [-1.69747078, -0.46478899,  0.38270379,        -inf],
       [ 2.86059521, -0.34565262,  1.56709779, -2.30259829]])

# Softmax

$$
\text{softmax}(x_i) = \frac{e^{x_i}}{\sum_{j=1}^{n} e^{x_j}}
$$


In [16]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  The maximum along the last axis is subtracted before exponentiating.
  This leaves the result mathematically unchanged but keeps ``np.exp``
  from overflowing on large scores. Entries equal to ``-inf`` (masked
  positions) come out as 0.

  Parameters
  ----------
  x : array_like
    Scores; normalisation runs along the last axis.

  Returns
  -------
  numpy.ndarray
    Same shape as ``x``; each slice along the last axis sums to 1.
    A slice that is entirely ``-inf`` has no valid entry and yields NaN.
  """
  x = np.asarray(x)
  # exp(x - c) / sum(exp(x - c)) == exp(x) / sum(exp(x)) for any constant c;
  # choosing c = max(x) makes the largest exponent exactly 0.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exp_x = np.exp(shifted)
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [17]:
# Normalise the masked scores row by row to obtain the attention weights.
attention = softmax(np.add(scaled, mask))

In [18]:
# Attention weights obtained from softmax over the masked scores.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.80961278, 0.19038722, 0.        , 0.        ],
       [0.08040988, 0.2758394 , 0.64375073, 0.        ],
       [0.75726514, 0.03067554, 0.20772519, 0.00433413]])

In [19]:
# Each output row is a weighted mix of the rows of v,
# with the weights taken from the matching row of attention.
new_v = attention @ v
new_v

array([[-1.41227385, -0.10021945,  0.67363302, -0.47450307, -0.26607726,
         0.21561015,  0.56622529,  0.56259403],
       [-1.26427173, -0.17691236,  0.41136804, -0.52697393,  0.01551823,
         0.0895517 ,  0.18482251,  0.74636439],
       [-0.38293493,  0.61392392, -0.67075979, -0.32143779,  0.01673851,
        -0.13617661, -0.9646321 , -0.45126398],
       [-1.11912456,  0.15256096,  0.3207672 , -0.4059155 , -0.25933705,
         0.14035667,  0.18745802,  0.18333973]])

# Function

In [20]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  The maximum along the last axis is subtracted before exponentiating.
  This leaves the result mathematically unchanged but keeps ``np.exp``
  from overflowing on large scores. Entries equal to ``-inf`` (masked
  positions) come out as 0.

  Parameters
  ----------
  x : array_like
    Scores; normalisation runs along the last axis.

  Returns
  -------
  numpy.ndarray
    Same shape as ``x``; each slice along the last axis sums to 1.
    A slice that is entirely ``-inf`` has no valid entry and yields NaN.
  """
  x = np.asarray(x)
  # exp(x - c) / sum(exp(x - c)) == exp(x) / sum(exp(x)) for any constant c;
  # choosing c = max(x) makes the largest exponent exactly 0.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exp_x = np.exp(shifted)
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(q,k,v,mask=None):
  """Scaled dot-product attention: ``softmax(q k^T / sqrt(d_k)) v``.

  The last two axes of each input are treated as a matrix; any leading
  axes (e.g. batch or head) are broadcast by ``np.matmul``.

  Parameters
  ----------
  q : numpy.ndarray, shape (..., L_q, d_k)
    Queries. ``d_k`` is read from ``q.shape[-1]``.
  k : numpy.ndarray, shape (..., L_k, d_k)
    Keys.
  v : numpy.ndarray, shape (..., L_k, d_v)
    Values.
  mask : numpy.ndarray, optional
    Additive mask that broadcasts against the (..., L_q, L_k) scores:
    0 keeps a position, ``-inf`` removes it. ``None`` applies no mask.

  Returns
  -------
  out : numpy.ndarray, shape (..., L_q, d_v)
    Attention-weighted combination of the rows of ``v``.
  attention : numpy.ndarray, shape (..., L_q, L_k)
    Softmax-normalised attention weights.
  """
  d_k = q.shape[-1]
  # Transpose only the last two axes so stacked inputs work; for 2-D
  # arrays this is exactly ``k.T``, whereas ``k.T`` on a 3-D+ array would
  # reverse every axis.
  k_t = np.swapaxes(k, -1, -2)
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [23]:
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)

# Print the inputs, then the attended values and the attention weights,
# each under its own heading.
for heading, array in (
  ("Q: \n", q),
  ("\nK: \n", k),
  ("\nV: \n", v),
  ("\nNew V: \n", values),
  ("\nAttention: \n", attention),
):
  print(heading, array)

Q: 
 [[ 0.98205942 -0.50755003  0.47323046  0.10986494  0.0447011  -0.04533553
   1.28116101 -0.1757931 ]
 [ 0.14364862 -0.76884773  0.26990682  2.33779622  0.99162073 -0.20263487
  -1.25577631 -0.02678597]
 [ 0.13779342  1.84631505  0.25225605  0.01757992  0.93741477  0.16007054
   0.48772065 -0.21236113]
 [ 0.10697245 -1.84942673  0.5812828  -0.32358033  0.68510351  0.80408038
  -3.05743139  1.46230043]]

K: 
 [[ 0.47533767 -1.43451379 -0.28773468  0.91437225 -1.37915891  0.72313008
  -1.87916793  0.3201954 ]
 [-1.95823135  0.80544231 -0.81318788  0.18794265 -1.57861441 -1.60963002
  -1.49455126 -0.64197819]
 [-0.04249592 -0.01675876 -1.0202719   1.36645528  2.06222298  0.16856864
  -0.96584488  0.64269616]
 [ 0.09575021  0.81083233 -0.24131169 -2.11070173 -0.54358969  0.4815727
   1.32580471 -1.04449218]]

V: 
 [[-1.41227385 -0.10021945  0.67363302 -0.47450307 -0.26607726  0.21561015
   0.56622529  0.56259403]
 [-0.63489962 -0.50304537 -0.70390156 -0.7501038   1.2129899  -0.44650598