In [None]:
import numpy as np

def softmax(x):
    """Compute the softmax of x along its last axis.

    Each slice along the last axis is mapped to non-negative values that
    sum to 1. The slice-wise maximum is subtracted before exponentiating;
    this leaves the result mathematically unchanged but keeps ``np.exp``
    from overflowing to ``inf`` (and the output from becoming ``nan``)
    when the scores are large.

    Args:
        x: Array-like of scores; softmax is taken over the last axis.

    Returns:
        A NumPy array with the same shape as ``x``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; a max-reduction over a zero-length axis
        # would raise, so return the (empty) float array directly.
        return np.exp(x)
    # exp(x - m) / sum(exp(x - m)) == exp(x) / sum(exp(x)) for any constant m;
    # choosing m = max makes the largest exponent exactly 0.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

# Toy input: three words, each represented by a 3-dimensional embedding.
input_vectors = np.array(
    [
        [1.0, 0.5, 0.2],  # Word 1
        [0.8, 0.1, 0.4],  # Word 2
        [0.9, 0.3, 0.7],  # Word 3
    ]
)

# Seed the global RNG so the projection matrices are the same on every run.
np.random.seed(42)
embedding_dim = input_vectors.shape[-1]  # width of each embedding

# Draw the query, key and value projection matrices, in that order.
W_q, W_k, W_v = (
    np.random.rand(embedding_dim, embedding_dim) for _ in range(3)
)

# Project the inputs into query, key and value space.
queries, keys, values = (
    np.matmul(input_vectors, weights) for weights in (W_q, W_k, W_v)
)

# Raw attention scores: dot product of every query with every key.
attention_scores = np.matmul(queries, keys.T)

# Divide by sqrt(embedding_dim) before the softmax.
scale_factor = np.sqrt(embedding_dim)
scaled_attention_scores = attention_scores / scale_factor

# Row-wise softmax turns the scaled scores into attention weights.
attention_weights = softmax(scaled_attention_scores)

# Each output row is the attention-weighted combination of the value rows.
output_vectors = np.matmul(attention_weights, values)

# Report the inputs, the attention weights and the resulting outputs.
print("Input Vectors:", input_vectors, sep="\n")
print("\nAttention Weights:", attention_weights, sep="\n")
print("\nOutput Vectors:", output_vectors, sep="\n")

Input Vectors:
[[1.  0.5 0.2]
 [0.8 0.1 0.4]
 [0.9 0.3 0.7]]

Attention Weights:
[[0.35135292 0.26561127 0.38303581]
 [0.34136602 0.28126756 0.37736642]
 [0.34452088 0.26282275 0.39265637]]

Output Vectors:
[[0.63948136 0.70637722 0.76088776]
 [0.63779355 0.70436497 0.75749905]
 [0.64113336 0.70936223 0.76119321]]
