 **BUILDING MULTI-HEAD ATTENTION FROM SCRATCH**

In [1]:
import numpy as np
def multihead_attention(q, k, v, num_heads=2):
    """Compute multi-head scaled dot-product attention (no learned projections).

    The last (feature) axis of each input is cut into ``num_heads`` equal
    slices of width ``depth = d_model // num_heads``. For every slice,
    ``softmax(Q @ K^T / sqrt(depth)) @ V`` is evaluated independently, and
    the per-head results are concatenated back along the feature axis.

    Args:
        q: Query array, laid out as ``(batch, q_len, d_model)``.
        k: Key array, laid out as ``(batch, kv_len, d_model)``.
        v: Value array, laid out as ``(batch, kv_len, d_model)``.
        num_heads: Number of heads; must be positive and divide ``d_model``.

    Returns:
        Array of shape ``(batch, q_len, d_model)`` holding the attended
        values with all heads merged.

    Raises:
        ValueError: If ``num_heads`` is not positive, if ``d_model`` is not
            divisible by ``num_heads``, or if ``k``/``v`` do not share the
            feature size of ``q``.
    """
    # Explicit raises instead of ``assert``: asserts vanish under ``python -O``
    # and a bad shape could then be silently absorbed by the ``-1`` in reshape.
    if num_heads <= 0:
        raise ValueError("num_heads must be a positive integer")

    d_model = q.shape[-1]
    if d_model % num_heads != 0:
        raise ValueError("d_model must be divisible by num_heads")
    if k.shape[-1] != d_model or v.shape[-1] != d_model:
        raise ValueError(
            f"k and v must have the same feature size as q ({d_model}); "
            f"got {k.shape[-1]} and {v.shape[-1]}"
        )
    depth = d_model // num_heads

    def split_heads(x):
        # (batch, seq, d_model) -> (batch, num_heads, seq, depth)
        return x.reshape(x.shape[0], -1, num_heads, depth).transpose(0, 2, 1, 3)

    q_heads = split_heads(q)
    k_heads = split_heads(k)
    v_heads = split_heads(v)

    # Scaled dot-product scores: (batch, num_heads, q_len, kv_len).
    scores = np.matmul(q_heads, k_heads.transpose(0, 1, 3, 2)) / np.sqrt(depth)

    # Softmax over the key axis; subtracting the row max keeps exp() from
    # overflowing without changing the result.
    weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    weights /= np.sum(weights, axis=-1, keepdims=True)

    context = np.matmul(weights, v_heads)

    # Merge heads: (batch, num_heads, q_len, depth) -> (batch, q_len, d_model)
    return context.transpose(0, 2, 1, 3).reshape(context.shape[0], -1, d_model)