In [1]:
from numpy import array
from numpy import random
from numpy import dot
from scipy.special import softmax

# fixed seed for the random weight matrices, so that a fresh
# Restart Kernel -> Run All produces the same Q, K and V every time
SEED = 42
rng = random.default_rng(SEED)

# encoder representations of four different words (one 3-element vector each)
word_1 = array([1, 0, 0])
word_2 = array([0, 1, 0])
word_3 = array([1, 1, 0])
word_4 = array([0, 0, 1])

# stacking the word embeddings into a single (4, 3) array, one word per row
words = array([word_1, word_2, word_3, word_4])

# generating the (3, 3) weight matrices with integer entries drawn from {0, 1, 2};
# the draws come from the seeded generator above, in the order W_Q, W_K, W_V
W_Q = rng.integers(3, size=(3, 3))
W_K = rng.integers(3, size=(3, 3))
W_V = rng.integers(3, size=(3, 3))

# generating the queries, keys and values: row i of each result is word i
# multiplied by the corresponding weight matrix
Q = words @ W_Q
K = words @ W_K
V = words @ W_V

Q, K, V

(array([[0, 2, 0],
        [0, 2, 2],
        [0, 4, 2],
        [0, 0, 2]]),
 array([[2, 1, 1],
        [0, 2, 2],
        [2, 3, 3],
        [0, 1, 1]]),
 array([[0, 1, 1],
        [0, 1, 1],
        [0, 2, 2],
        [1, 0, 0]]))

In [2]:
# dot product of every query row with every key row:
# scores[i, j] is query i scored against key j
scores = dot(Q, K.T)
scores

array([[ 2,  4,  6,  2],
       [ 4,  8, 12,  4],
       [ 6, 12, 18,  6],
       [ 2,  4,  6,  2]])

In [3]:
# dividing the scores by the square root of the key dimension (the last axis
# of K), then applying a softmax along each row so every row sums to 1
weights = softmax(scores / (K.shape[-1] ** 0.5), axis=-1)
weights

array([[6.56104878e-02, 2.08186871e-01, 6.60592153e-01, 6.56104878e-02],
       [8.81516097e-03, 8.87545020e-02, 8.93615176e-01, 8.81516097e-03],
       [9.48221252e-04, 3.02935312e-02, 9.67810026e-01, 9.48221252e-04],
       [6.56104878e-02, 2.08186871e-01, 6.60592153e-01, 6.56104878e-02]])

In [4]:
# each output row is the sum of the value vectors, weighted by the
# corresponding row of attention weights
attention = dot(weights, V)
attention

array([[6.56104878e-02, 1.59498167e+00, 1.59498167e+00],
       [8.81516097e-03, 1.88480002e+00, 1.88480002e+00],
       [9.48221252e-04, 1.96686181e+00, 1.96686181e+00],
       [6.56104878e-02, 1.59498167e+00, 1.59498167e+00]])