Q1. Compute Scaled Dot-Product Attention (Python)




In [1]:
import numpy as np

def softmax(x):
    """Return the softmax of *x* taken along its last axis.

    The per-row maximum is subtracted before exponentiating, which leaves
    the result mathematically unchanged but keeps ``np.exp`` from
    overflowing on large inputs.
    """
    row_peak = np.max(x, axis=-1, keepdims=True)
    unnormalised = np.exp(x - row_peak)
    partition = np.sum(unnormalised, axis=-1, keepdims=True)
    return unnormalised / partition


def scaled_dot_product_attention(Q, K, V):
    """Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q: Queries, shape (..., n_queries, d_k).
        K: Keys as a NumPy array, shape (..., n_keys, d_k).
        V: Values, shape (..., n_keys, d_v).

        Any leading batch/head dimensions are handled by ``np.matmul``
        broadcasting; plain 2-D inputs behave exactly as before.

    Returns:
        A tuple ``(attention_weights, context_vector)`` where
        ``attention_weights`` has shape (..., n_queries, n_keys) and sums to
        one over the last axis, and ``context_vector`` has shape
        (..., n_queries, d_v).
    """
    # Transpose only the last two axes. ``K.T`` reverses *every* axis, which
    # scrambles the layout as soon as K carries batch/head dimensions.
    # (1-D K keeps the old ``K.T`` no-op so legacy calls behave the same.)
    K_t = np.swapaxes(K, -1, -2) if K.ndim >= 2 else K.T

    # Step 1: Compute raw attention scores (Q * K^T).
    # ``np.matmul`` treats leading axes as a stack of matrices, unlike
    # ``np.dot`` which would contract across them.
    scores = np.matmul(Q, K_t)

    # Step 2: Scale by sqrt(d_k) so score magnitude does not grow with the
    # key dimension.
    d_k = K.shape[-1]
    scores = scores / np.sqrt(d_k)

    # Step 3: Softmax normalization over the key axis
    attention_weights = softmax(scores)

    # Step 4: Compute context vector (A * V)
    context_vector = np.matmul(attention_weights, V)

    return attention_weights, context_vector


if __name__ == "__main__":
    # Toy example: one query row attending over two key/value rows.
    Q = np.array([[1, 0, 1]])
    K = np.array([[1, 0, 1], [0, 1, 0]])
    V = np.array([[1, 2], [3, 4]])

    attn, ctx = scaled_dot_product_attention(Q, K, V)

    # Print each result under its heading.
    for heading, result in (
        ("Attention Weights:\n", attn),
        ("Context Vector:\n", ctx),
    ):
        print(heading, result)


Attention Weights:
 [[0.76036844 0.23963156]]
Context Vector:
 [[1.47926312 2.47926312]]


Q2. Implement Simple Transformer Encoder Block (PyTorch)


In [2]:
import torch
import torch.nn as nn

class SimpleTransformerEncoder(nn.Module):
    """Single Transformer encoder block.

    The block is multi-head self-attention followed by a position-wise
    feed-forward network. Each sub-layer is wrapped in a residual connection,
    and LayerNorm is applied after each residual addition.

    Args:
        d_model: Width of the token embeddings (input and output feature size).
        num_heads: Number of attention heads passed to ``nn.MultiheadAttention``.
        dim_ff: Hidden width of the feed-forward network.
    """

    def __init__(self, d_model=128, num_heads=8, dim_ff=512):
        super().__init__()

        # Multi-head self-attention; batch_first=True means inputs are
        # laid out as (batch, seq_len, d_model).
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model, num_heads=num_heads, batch_first=True
        )

        # Position-wise feed-forward network: expand to dim_ff, then project
        # back to d_model.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Linear(dim_ff, d_model),
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Run the encoder block.

        Args:
            x: Input tensor of shape (batch, seq_len, d_model).
            attn_mask: Optional mask forwarded unchanged to
                ``nn.MultiheadAttention`` (e.g. to block attention between
                specific query/key positions). ``None`` disables it.
            key_padding_mask: Optional (batch, seq_len) mask forwarded
                unchanged to ``nn.MultiheadAttention``; for a boolean mask,
                ``True`` marks key positions to ignore. ``None`` disables it.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Q = K = V = x (self-attention). The attention weights are never
        # used here, so skip computing/averaging them with need_weights=False.
        attn_output, _ = self.self_attn(
            x,
            x,
            x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )

        # Residual connection + LayerNorm around the attention sub-layer.
        x = self.norm1(x + attn_output)

        ff_output = self.ffn(x)

        # Residual connection + LayerNorm around the feed-forward sub-layer.
        out = self.norm2(x + ff_output)

        return out


if __name__ == "__main__":
    # Smoke test: push a random batch through the encoder and print the
    # input and output shapes.
    batch_size = 32
    seq_len = 10
    d_model = 128

    x = torch.randn(batch_size, seq_len, d_model)

    # Reuse d_model (instead of repeating the literal 128) so the encoder
    # width cannot drift out of sync with the input's feature size.
    encoder = SimpleTransformerEncoder(d_model=d_model, num_heads=8)

    output = encoder(x)

    print("Input shape :", x.shape)
    print("Output shape:", output.shape)


Input shape : torch.Size([32, 10, 128])
Output shape: torch.Size([32, 10, 128])
