**Imports & Input**

In [1]:
import tensorflow as tf
import numpy as np

# Input matrix: 3 rows (one per word), each a 4-dimensional vector.
x = tf.constant(
    value=[
        [0.1, 0.2, 0.3, 0.4],  # word 1
        [0.5, 0.6, 0.7, 0.8],  # word 2
        [0.9, 1.0, 1.1, 1.2],  # word 3
    ],
    dtype=tf.float32,
    shape=(3, 4),
)

**Projection matrices**

In [2]:
# Projection matrices -- values taken exactly from the manual.

# Query projection: maps each 4-dimensional input row to 3 dimensions.
W_Q = tf.constant(
    value=[
        [0.1, 0.2, 0.3],
        [0.4, 0.5, 0.6],
        [0.7, 0.8, 0.9],
        [1.0, 1.1, 1.2],
    ],
    dtype=tf.float32,
    shape=(4, 3),
)

# Key projection: deliberately the very same tensor as W_Q,
# so the keys computed from it will equal the queries.
W_K = W_Q

# Value projection: maps each 4-dimensional input row to 2 dimensions.
W_V = tf.constant(
    value=[
        [0.1, 0.2],
        [0.3, 0.4],
        [0.5, 0.6],
        [0.7, 0.8],
    ],
    dtype=tf.float32,
    shape=(4, 2),
)

**Manual projection to Queries, Keys & Values**

In [3]:
# Project the input into query, key and value spaces: x·W_Q, x·W_K, x·W_V.
# `@` on TensorFlow tensors is matrix multiplication.
queries = x @ W_Q
keys = x @ W_K
values = x @ W_V

# Report the three projections, one labelled tensor each.
print("Queries Matrix: ",queries)
print("Keys Matrix:",keys)
print("Values Matrix:", values)

Queries Matrix:  tf.Tensor(
[[0.70000005 0.8000001  0.90000004]
 [1.5799999  1.8400002  2.1       ]
 [2.46       2.88       3.3000002 ]], shape=(3, 3), dtype=float32)
Keys Matrix: tf.Tensor(
[[0.70000005 0.8000001  0.90000004]
 [1.5799999  1.8400002  2.1       ]
 [2.46       2.88       3.3000002 ]], shape=(3, 3), dtype=float32)
Values Matrix: tf.Tensor(
[[0.5       0.6      ]
 [1.14      1.4000001]
 [1.7800001 2.2      ]], shape=(3, 2), dtype=float32)


**Scaled dot-product attention: scores, scaling & softmax weights**

In [4]:
# Raw attention scores: dot product of every query row with every key row
# (queries · keysᵀ), giving one score per (query word, key word) pair.
scores = tf.matmul(queries, keys, transpose_b=True)
print("Attention Scores Matrix: ",scores)

# Scale by sqrt(d_k), where d_k is the key dimension. It is read from `keys`
# instead of being hard-coded, so the scaling stays correct if the projection
# matrices are changed to a different width.
d_k = tf.cast(tf.shape(keys)[-1], scores.dtype)
scaled = scores / tf.sqrt(d_k)
print("Attention Score Matrix Scaled: ",scaled)

# Softmax along the last axis turns each row of scaled scores into weights
# that sum to 1.
weights = tf.nn.softmax(scaled, axis=-1)
print("Attention Weights Matrix:", weights)

Attention Scores Matrix:  tf.Tensor(
[[ 1.9400002  4.4680004  6.996001 ]
 [ 4.4680004 10.292     16.116001 ]
 [ 6.996001  16.116001  25.236002 ]], shape=(3, 3), dtype=float32)
Attention Score Matrix Scaled:  tf.Tensor(
[[ 1.1200596  2.5796013  4.039143 ]
 [ 2.5796013  5.942089   9.304578 ]
 [ 4.039143   9.304578  14.570013 ]], shape=(3, 3), dtype=float32)
Attention Weights Matrix: tf.Tensor(
[[4.1966919e-02 1.8062508e-01 7.7740794e-01]
 [1.1589993e-03 3.3449765e-02 9.6539128e-01]
 [2.6561418e-05 5.1404452e-03 9.9483299e-01]], shape=(3, 3), dtype=float32)


**Contextualized Value Vectors for each word**

In [5]:

# Each context vector is the attention-weighted sum of the value rows:
# row i of `weights` mixes the rows of `values` into row i of `context`.
context = weights @ values

print("Context vectors for each word:")
context_array = context.numpy()  # plain NumPy array for a compact printout
print(context_array)

Context vectors for each word:
[[1.6106822 1.9883528]
 [1.7571088 2.171386 ]
 [1.7766763 2.1958454]]
