In [None]:
 import numpy as np
import math

In [None]:
# L is the sequence length; d_k is the size of each query/key vector and
# d_v the size of each value vector (both 8 here).
L, d_k, d_v = 4, 8, 8

# Use a seeded generator so that Restart Kernel -> Run All reproduces the same
# q, k, v (and therefore the same downstream outputs) on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# One row per sequence position, entries drawn uniformly from [0, 1).
q = rng.random((L, d_k))
k = rng.random((L, d_k))
v = rng.random((L, d_v))

In [None]:
# Show the three input matrices, each under its own heading.
for heading, matrix in (("Q\n", q), ("K\n", k), ("V\n", v)):
  print(heading, matrix)

Q
 [[0.03516553 0.39443892 0.68515114 0.61116192 0.95954882 0.34170034
  0.21008309 0.61531448]
 [0.10551262 0.10063835 0.84490331 0.3600921  0.73717909 0.28412922
  0.95338587 0.15819319]
 [0.13632456 0.25890823 0.45904796 0.50874366 0.28343728 0.23984914
  0.27504889 0.53832964]
 [0.45646359 0.30812743 0.94504223 0.30611443 0.66912584 0.61999278
  0.65990707 0.95887408]]
K
 [[0.20588949 0.70705122 0.67539631 0.27279736 0.90232276 0.11855169
  0.9932036  0.89274042]
 [0.19749757 0.40147932 0.33510483 0.98696583 0.97452718 0.218806
  0.65712737 0.23873479]
 [0.8030173  0.14896225 0.98696104 0.46297994 0.61963442 0.84603292
  0.62098428 0.92172243]
 [0.22049509 0.36655229 0.4189931  0.3029475  0.82672969 0.19628122
  0.94123642 0.27513637]]
V
 [[0.53155351 0.28582039 0.86707958 0.10734356 0.66568229 0.58875906
  0.05756912 0.24355149]
 [0.01490617 0.90589651 0.38697766 0.94419101 0.06448466 0.0565052
  0.32406503 0.50999069]
 [0.04432185 0.43505744 0.60107197 0.03849098 0.46203224 0.356

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{Q \cdot K^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$

In [None]:
# Raw attention scores: dot product of every query row with every key row.
q @ k.T

array([[2.57990392, 2.29291837, 2.62743488, 1.85195   ],
       [2.54874636, 2.14460488, 2.53533389, 2.12935517],
       [1.69790701, 1.42476981, 1.88217839, 1.15982685],
       [3.22234457, 2.282969  , 3.71964758, 2.26212442]])

In [None]:
# Why divide by sqrt(d_k)? Compare the variance of q and of k with the
# variance of their unscaled dot products.
q.var(), k.var(), (q @ k.T).var()

(0.07401896682835635, 0.09221065687890084, 0.37599064646711167)

In [None]:
# Divide the raw scores by sqrt(d_k), then compare variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(0.07401896682835635, 0.09221065687890084, 0.04699883080838895)

In [None]:
# Display the scaled attention scores.
scaled

array([[0.91213378, 0.81066906, 0.92893851, 0.6547632 ],
       [0.90111792, 0.75823233, 0.89637589, 0.75284074],
       [0.60030078, 0.5037322 , 0.66545055, 0.41006072],
       [1.13927085, 0.80715143, 1.31509401, 0.79978176]])

## Masking

- This is to ensure words don't get context from words generated in the future.
- Not required in the encoders, but required in the decoders

In [None]:
# Create a lower-triangular matrix of ones: entry (i, j) is 1 when j <= i
# (the diagonal and everything below it) and 0 when j > i (above the diagonal).
mask = np.tril(np.ones( (L, L) ))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [None]:
# Build the additive causal mask: 0 on and below the diagonal, -inf above it.
# Adding it to the scores leaves the lower-triangular values unchanged, while
# every entry above the diagonal becomes -inf, so the softmax gives it weight 0
# and no context is taken from later positions.
#
# The mask is rebuilt from its shape rather than rewritten in place, so
# re-running this cell yields the same result (rewriting 0 -> -inf and then
# 1 -> 0 in place would turn the whole mask into -inf on a second run).
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones_like(mask)) == 1, 0.0, -np.inf)
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [None]:
# Adding the mask keeps scores where the mask is 0 and yields -inf where the mask is -inf.
scaled + mask

array([[0.91213378,       -inf,       -inf,       -inf],
       [0.90111792, 0.75823233,       -inf,       -inf],
       [0.60030078, 0.5037322 , 0.66545055,       -inf],
       [1.13927085, 0.80715143, 1.31509401, 0.79978176]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [None]:
def softmax(x):
  """Convert scores into a probability distribution along the last axis.

  Args:
    x: array-like of scores; may be 1-D, 2-D or have leading batch axes.
       Entries equal to -inf receive probability 0.

  Returns:
    Array with the same shape as `x` whose entries along the last axis are
    non-negative and sum to 1.
  """
  x = np.asarray(x)
  # Subtracting the per-row maximum does not change the result mathematically
  # but keeps np.exp from overflowing to inf (inf / inf = NaN) on large scores.
  # `initial` only matters for an empty last axis, where max is otherwise undefined.
  shifted = x - np.max(x, axis=-1, keepdims=True, initial=-np.inf)
  exp_x = np.exp(shifted)
  # keepdims=True makes the division broadcast for any number of leading axes.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [None]:
# Row-wise softmax of the masked scores; entries that are -inf get weight 0.
attention = softmax(scaled + mask)

In [None]:
# Display the attention weights.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.53566075, 0.46433925, 0.        , 0.        ],
       [0.33610435, 0.30516514, 0.35873051, 0.        ],
       [0.27610863, 0.19808079, 0.32918422, 0.19662636]])

In [None]:
# Each output row is the attention-weighted combination of the rows of v.
new_v = attention @ v
new_v

array([[0.53155351, 0.28582039, 0.86707958, 0.10734356, 0.66568229,
        0.58875906, 0.05756912, 0.24355149],
       [0.29165387, 0.57374608, 0.64414941, 0.49592468, 0.38652263,
        0.3416127 , 0.18131364, 0.36726967],
       [0.19910589, 0.52858189, 0.62514416, 0.33802071, 0.40916224,
        0.34297308, 0.37048229, 0.41850304],
       [0.23361327, 0.49330216, 0.59734136, 0.39926321, 0.36185579,
        0.45675344, 0.32073766, 0.33477113]])

In [None]:
# Display the original value matrix.
v

array([[0.53155351, 0.28582039, 0.86707958, 0.10734356, 0.66568229,
        0.58875906, 0.05756912, 0.24355149],
       [0.01490617, 0.90589651, 0.38697766, 0.94419101, 0.06448466,
        0.0565052 , 0.32406503, 0.50999069],
       [0.04432185, 0.43505744, 0.60107197, 0.03849098, 0.46203224,
        0.35638225, 0.7031454 , 0.50459312],
       [0.35246583, 0.46651905, 0.42424096, 0.86421782, 0.06707274,
        0.84263358, 0.04672262, 0.00203945]])

In [None]:
def softmax(x):
  """Convert scores into a probability distribution along the last axis.

  Args:
    x: array-like of scores; may be 1-D, 2-D or have leading batch axes.
       Entries equal to -inf receive probability 0.

  Returns:
    Array with the same shape as `x` whose entries along the last axis are
    non-negative and sum to 1.
  """
  x = np.asarray(x)
  # Subtract the per-row maximum so np.exp cannot overflow on large scores;
  # `initial` only matters for an empty last axis.
  shifted = x - np.max(x, axis=-1, keepdims=True, initial=-np.inf)
  exp_x = np.exp(shifted)
  # keepdims=True makes the division broadcast for any number of leading axes.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute softmax(q @ k^T / sqrt(d_k) + mask) @ v.

  Args:
    q: query array whose last axis has size d_k.
    k: key array whose last axis has size d_k.
    v: value array with one row per key.
    mask: optional additive mask broadcastable to the score matrix
          (0 to keep a score, -inf to block it). None applies no mask.

  Returns:
    (out, attention): the attention-weighted values and the attention weights.
  """
  d_k = q.shape[-1]
  k = np.asarray(k)
  # Transpose only the last two axes so any leading batch/head axes are kept
  # (k.T would reverse every axis). A 1-D k has nothing to transpose.
  k_t = np.swapaxes(k, -1, -2) if k.ndim >= 2 else k
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [None]:
# Encoder: no mask is applied to the scores.
values, attention = scaled_dot_product_attention(q, k, v)
for heading, matrix in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
  print(heading, matrix)

Q
 [[0.03516553 0.39443892 0.68515114 0.61116192 0.95954882 0.34170034
  0.21008309 0.61531448]
 [0.10551262 0.10063835 0.84490331 0.3600921  0.73717909 0.28412922
  0.95338587 0.15819319]
 [0.13632456 0.25890823 0.45904796 0.50874366 0.28343728 0.23984914
  0.27504889 0.53832964]
 [0.45646359 0.30812743 0.94504223 0.30611443 0.66912584 0.61999278
  0.65990707 0.95887408]]
K
 [[0.20588949 0.70705122 0.67539631 0.27279736 0.90232276 0.11855169
  0.9932036  0.89274042]
 [0.19749757 0.40147932 0.33510483 0.98696583 0.97452718 0.218806
  0.65712737 0.23873479]
 [0.8030173  0.14896225 0.98696104 0.46297994 0.61963442 0.84603292
  0.62098428 0.92172243]
 [0.22049509 0.36655229 0.4189931  0.3029475  0.82672969 0.19628122
  0.94123642 0.27513637]]
V
 [[0.53155351 0.28582039 0.86707958 0.10734356 0.66568229 0.58875906
  0.05756912 0.24355149]
 [0.01490617 0.90589651 0.38697766 0.94419101 0.06448466 0.0565052
  0.32406503 0.50999069]
 [0.04432185 0.43505744 0.60107197 0.03849098 0.46203224 0.356

In [None]:
# Decoder: the mask is added to the scores before the softmax.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for heading, matrix in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
  print(heading, matrix)

Q
 [[0.03516553 0.39443892 0.68515114 0.61116192 0.95954882 0.34170034
  0.21008309 0.61531448]
 [0.10551262 0.10063835 0.84490331 0.3600921  0.73717909 0.28412922
  0.95338587 0.15819319]
 [0.13632456 0.25890823 0.45904796 0.50874366 0.28343728 0.23984914
  0.27504889 0.53832964]
 [0.45646359 0.30812743 0.94504223 0.30611443 0.66912584 0.61999278
  0.65990707 0.95887408]]
K
 [[0.20588949 0.70705122 0.67539631 0.27279736 0.90232276 0.11855169
  0.9932036  0.89274042]
 [0.19749757 0.40147932 0.33510483 0.98696583 0.97452718 0.218806
  0.65712737 0.23873479]
 [0.8030173  0.14896225 0.98696104 0.46297994 0.61963442 0.84603292
  0.62098428 0.92172243]
 [0.22049509 0.36655229 0.4189931  0.3029475  0.82672969 0.19628122
  0.94123642 0.27513637]]
V
 [[0.53155351 0.28582039 0.86707958 0.10734356 0.66568229 0.58875906
  0.05756912 0.24355149]
 [0.01490617 0.90589651 0.38697766 0.94419101 0.06448466 0.0565052
  0.32406503 0.50999069]
 [0.04432185 0.43505744 0.60107197 0.03849098 0.46203224 0.356