In [1]:
import numpy as np

In [2]:
# Input matrix X (2 rows x 4 columns) that both attention heads operate on:
#   [[1, 2, 3, 4],
#    [5, 6, 7, 8]]
X = np.arange(1, 9).reshape(2, 4)

In [3]:
# Projection weights for the two attention heads.
# Every matrix is 4x3: it maps a 4-dimensional input row to a 3-dimensional
# query (WQ), key (WK) or value (WV) vector.

# --- Head 1 ---
WQ_1 = np.array([[0.1, 0.2, 0.3],
                 [0.4, 0.5, 0.6],
                 [0.7, 0.8, 0.9],
                 [1.0, 1.1, 1.2]])
WK_1 = np.array([[0.3, 0.2, 0.1],
                 [0.6, 0.5, 0.4],
                 [0.9, 0.8, 0.7],
                 [1.2, 1.1, 1.0]])
WV_1 = np.array([[0.7, 0.8, 0.9],
                 [0.4, 0.5, 0.6],
                 [0.1, 0.2, 0.3],
                 [1.3, 1.4, 1.5]])

# --- Head 2 ---
WQ_2 = np.array([[0.2, 0.3, 0.4],
                 [0.5, 0.6, 0.7],
                 [0.8, 0.9, 1.0],
                 [1.1, 1.2, 1.3]])
WK_2 = np.array([[0.4, 0.3, 0.2],
                 [0.7, 0.6, 0.5],
                 [1.0, 0.9, 0.8],
                 [1.3, 1.2, 1.1]])
WV_2 = np.array([[0.9, 1.0, 1.1],
                 [0.6, 0.7, 0.8],
                 [0.3, 0.4, 0.5],
                 [1.4, 1.5, 1.6]])

# Echo every weight matrix, one labelled block per matrix, so the values
# feeding each head are visible in the cell output.
print("\n".join(
    f"{label} {matrix}"
    for label, matrix in (
        ("WQ_1:\n", WQ_1),
        ("WK_1:\n", WK_1),
        ("WV_1:\n", WV_1),
        ("WQ_2:\n", WQ_2),
        ("WK_2:\n", WK_2),
        ("WV_2:\n", WV_2),
    )
))


WQ_1:
 [[0.1 0.2 0.3]
 [0.4 0.5 0.6]
 [0.7 0.8 0.9]
 [1.  1.1 1.2]]
WK_1:
 [[0.3 0.2 0.1]
 [0.6 0.5 0.4]
 [0.9 0.8 0.7]
 [1.2 1.1 1. ]]
WV_1:
 [[0.7 0.8 0.9]
 [0.4 0.5 0.6]
 [0.1 0.2 0.3]
 [1.3 1.4 1.5]]
WQ_2:
 [[0.2 0.3 0.4]
 [0.5 0.6 0.7]
 [0.8 0.9 1. ]
 [1.1 1.2 1.3]]
WK_2:
 [[0.4 0.3 0.2]
 [0.7 0.6 0.5]
 [1.  0.9 0.8]
 [1.3 1.2 1.1]]
WV_2:
 [[0.9 1.  1.1]
 [0.6 0.7 0.8]
 [0.3 0.4 0.5]
 [1.4 1.5 1.6]]


# Part a

In [4]:
# Project the input X through each head's weights to obtain that head's
# query (Q), key (K) and value (V) matrices.
Q_1, K_1, V_1 = (X @ weights for weights in (WQ_1, WK_1, WV_1))  # Head 1
Q_2, K_2, V_2 = (X @ weights for weights in (WQ_2, WK_2, WV_2))  # Head 2

# Show the six projected matrices, head by head.
print("\n".join(
    f"{label} {matrix}"
    for label, matrix in (
        ("Head 1 - Q_1:\n", Q_1),
        ("Head 1 - K_1:\n", K_1),
        ("Head 1 - V_1:\n", V_1),
        ("Head 2 - Q_2:\n", Q_2),
        ("Head 2 - K_2:\n", K_2),
        ("Head 2 - V_2:\n", V_2),
    )
))


Head 1 - Q_1:
 [[ 7.   8.   9. ]
 [15.8 18.4 21. ]]
Head 1 - K_1:
 [[ 9.   8.   7. ]
 [21.  18.4 15.8]]
Head 1 - V_1:
 [[ 7.   8.   9. ]
 [17.  19.6 22.2]]
Head 2 - Q_2:
 [[ 8.   9.  10. ]
 [18.4 21.  23.6]]
Head 2 - K_2:
 [[10.   9.   8. ]
 [23.6 21.  18.4]]
Head 2 - V_2:
 [[ 8.6  9.6 10.6]
 [21.4 24.  26.6]]


# Part b

In [5]:
# Function to compute attention scores
def compute_attention_scores(Q, K):
    """Return the scaled dot-product attention scores ``Q @ K^T / sqrt(dk)``.

    Parameters
    ----------
    Q : array_like, shape (..., n_queries, dk)
        Query matrix, or a stack of query matrices.
    K : array_like, shape (..., n_keys, dk)
        Key matrix, or a stack of key matrices.

    Returns
    -------
    numpy.ndarray, shape (..., n_queries, n_keys)
        Raw (pre-softmax) scores; entry ``[i, j]`` is the dot product of
        query ``i`` with key ``j`` divided by ``sqrt(dk)``.
    """
    Q = np.asarray(Q)
    K = np.asarray(K)
    dk = K.shape[-1]  # Size of the last axis of K, i.e. the key dimension

    # Transpose only the last two axes. ``K.T`` would reverse *every* axis,
    # which is wrong once a leading batch/head axis is present. A 1-D key
    # has nothing to transpose and is used as-is.
    K_t = np.swapaxes(K, -1, -2) if K.ndim >= 2 else K

    # matmul broadcasts over any leading axes; for 2-D inputs it is the
    # ordinary matrix product.
    return np.matmul(Q, K_t) / np.sqrt(dk)

# Raw (pre-softmax) scaled dot-product scores, computed separately per head
# from that head's own queries and keys.
attention_scores_1 = compute_attention_scores(Q=Q_1, K=K_1)  # Head 1
attention_scores_2 = compute_attention_scores(Q=Q_2, K=K_2)  # Head 2

print("\n".join(
    f"{label} {scores}"
    for label, scores in (
        ("Head 1 - Attention Scores:\n", attention_scores_1),
        ("Head 2 - Attention Scores:\n", attention_scores_2),
    )
))


Head 1 - Attention Scores:
 [[109.69655115 251.95565747]
 [251.95565747 578.59734577]]
Head 2 - Attention Scores:
 [[139.14141487 324.35538123]
 [324.35538123 756.0286305 ]]


In [8]:
# Function to apply softmax along rows
def apply_softmax(scores, axis=1):
    """Apply a numerically stable softmax along ``axis``.

    Parameters
    ----------
    scores : array_like
        Raw attention scores.
    axis : int, optional
        Axis along which the outputs sum to 1. The default ``1`` normalises
        each row of a 2-D score matrix; pass ``-1`` for a 1-D vector or to
        normalise the last axis of stacked inputs.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``scores`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    scores = np.asarray(scores)

    # Subtracting the maximum along the axis is purely for numerical
    # stability: it keeps np.exp from overflowing on large scores and does
    # not change the softmax result. You won't be expected to do this step
    # in the exam.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=axis, keepdims=True)

# Turn each head's raw scores into attention weights (every row sums to 1).
softmax_attention_1 = apply_softmax(attention_scores_1)  # Head 1
softmax_attention_2 = apply_softmax(attention_scores_2)  # Head 2

# Weights are rounded to 3 decimals for display only; the stored arrays keep
# full precision.
print("\n".join(
    f"{label} {weights}"
    for label, weights in (
        ("Head 1 - Softmax Attention Weights:\n", np.round(softmax_attention_1, 3)),
        ("Head 2 - Softmax Attention Weights:\n", np.round(softmax_attention_2, 3)),
    )
))


Head 1 - Softmax Attention Weights:
 [[0. 1.]
 [0. 1.]]
Head 2 - Softmax Attention Weights:
 [[0. 1.]
 [0. 1.]]


In [10]:
# Each head's output Z is its attention weights applied to its value matrix,
# i.e. every output row is a weighted sum of that head's value rows.
Z_1 = softmax_attention_1 @ V_1  # Head 1
Z_2 = softmax_attention_2 @ V_2  # Head 2

print("\n".join(
    f"{label} {weighted}"
    for label, weighted in (
        ("Head 1 - Weighted Sum (Z_1):\n", Z_1),
        ("Head 2 - Weighted Sum (Z_2):\n", Z_2),
    )
))


Head 1 - Weighted Sum (Z_1):
 [[17.  19.6 22.2]
 [17.  19.6 22.2]]
Head 2 - Weighted Sum (Z_2):
 [[21.4 24.  26.6]
 [21.4 24.  26.6]]


# Part c

In [11]:
# Join the two head outputs side by side (column-wise), so each row of Z
# holds head 1's features followed by head 2's features.
Z = np.concatenate([Z_1, Z_2], axis=1)

print(f"{'Multi-Headed Attention Output (Z):' + chr(10)} {Z}")


Multi-Headed Attention Output (Z):
 [[17.  19.6 22.2 21.4 24.  26.6]
 [17.  19.6 22.2 21.4 24.  26.6]]


In [12]:
# Shape check: rows are unchanged, columns are the two head widths combined.
np.shape(Z)

(2, 6)

# Part d

In [14]:
# Sample output-projection matrix W_0 (6x4): the values 0.1, 0.2, ..., 2.4
# laid out row by row, i.e. first row [0.1, 0.2, 0.3, 0.4] and last row
# [2.1, 2.2, 2.3, 2.4].
W_0 = np.arange(1, 25).reshape(6, 4) / 10


In [15]:
# Final step: project the concatenated multi-head output Z with W_0.
# W_0 (6x4) is the sample matrix defined in the previous cell. No transpose
# is needed: Z is 2x6, so Z @ W_0 is already a valid product and gives a
# 2x4 result.
result = Z @ W_0

print("Result after multiplication with W_0:\n", result)


Result after multiplication with W_0:
 [[155.96 169.04 182.12 195.2 ]
 [155.96 169.04 182.12 195.2 ]]
