In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# ==========================================
# Q1. Compute Scaled Dot-Product Attention (Python/NumPy)
# ==========================================

def scaled_dot_product_attention(Q, K, V):
    """
    Computes the scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    The last two axes of every input are treated as (sequence, feature). Any
    leading axes (e.g. batch or head dimensions) are broadcast together, so
    plain 2-D matrices and batched arrays are both supported.

    Args:
        Q (np.array): Query matrix of shape (..., seq_len_q, d_k)
        K (np.array): Key matrix of shape (..., seq_len_k, d_k)
        V (np.array): Value matrix of shape (..., seq_len_k, d_v)

    Returns:
        tuple: (context_vector, attention_weights) with shapes
            (..., seq_len_q, d_v) and (..., seq_len_q, seq_len_k)

    Raises:
        ValueError: If any input has fewer than two dimensions, or if the
            shapes are incompatible for the matrix products.
    """
    Q, K, V = np.asarray(Q), np.asarray(K), np.asarray(V)
    if min(Q.ndim, K.ndim, V.ndim) < 2:
        raise ValueError("Q, K and V must each have at least 2 dimensions")

    # 1. Determine dimension d_k for scaling
    d_k = Q.shape[-1]

    # 2. Compute scores: (Q . K^T) / sqrt(d_k)
    # Only the last two axes of K are transposed; `K.T` would reverse every
    # axis and give wrong results for batched input.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)

    # 3. Normalize scores using softmax over the key axis
    # We subtract the row-wise max for numerical stability before exp
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # 4. Compute context vector: weights . V
    context_vector = np.matmul(attention_weights, V)

    return context_vector, attention_weights

# --- Testing Q1 ---
print("--- Q1: Scaled Dot-Product Attention Test ---")
np.random.seed(42)
seq_len, d_model = 3, 4

# Draw Q, K and V (in that order) from the seeded global generator
Q_np, K_np, V_np = (np.random.rand(seq_len, d_model) for _ in range(3))

context, weights = scaled_dot_product_attention(Q_np, K_np, V_np)
print("Attention Weights shape:", weights.shape)
print("Context Vector shape:", context.shape)
print("\n")


# ==========================================
# Q2. Implement Simple Transformer Encoder Block (PyTorch)
# ==========================================

class SimpleTransformerEncoderBlock(nn.Module):
    """
    A single Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network. Each sub-layer is wrapped as LayerNorm(x + Dropout(sublayer(x))),
    i.e. the normalization is applied after the residual addition.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        n_head: Number of attention heads.
        ff_hidden: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, n_head, ff_hidden=256, dropout=0.1):
        super().__init__()

        # a) Attention over d_model features split across n_head heads;
        # batch_first=True makes the layer take (batch, seq_len, d_model).
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model, num_heads=n_head, batch_first=True
        )

        # b) One LayerNorm per residual connection
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Feed-forward network (2 linear layers with ReLU)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, ff_hidden),
            nn.ReLU(),
            nn.Linear(ff_hidden, d_model)
        )

        # Shared by both sub-layers; dropout holds no learnable state
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """
        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model)
            attn_mask: Optional attention mask forwarded unchanged to
                nn.MultiheadAttention.
            key_padding_mask: Optional (batch_size, seq_len) mask marking
                padded key positions, forwarded unchanged to
                nn.MultiheadAttention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # 1. Multi-Head Attention Sub-layer
        # Self-attention: Q=K=V=x. The attention weights are never used, so
        # need_weights=False skips computing them (the second return is None).
        attn_output, _ = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )

        # Add & Norm (Residual connection + LayerNorm)
        x = self.norm1(x + self.dropout(attn_output))

        # 2. Feed-Forward Sub-layer
        ff_output = self.feed_forward(x)

        # Add & Norm (Residual connection + LayerNorm)
        x = self.norm2(x + self.dropout(ff_output))

        return x

# --- Testing Q2 ---
print("--- Q2: Transformer Encoder Block Test ---")

# c) Shape check on a batch of 32 sentences with 10 tokens each
BATCH_SIZE, SEQ_LEN, D_MODEL, N_HEAD = 32, 10, 128, 8

# Build the block first, then a random (Batch, Seq_Len, D_Model) input
encoder_block = SimpleTransformerEncoderBlock(d_model=D_MODEL, n_head=N_HEAD)
dummy_input = torch.randn(BATCH_SIZE, SEQ_LEN, D_MODEL)

# Forward pass
output = encoder_block(dummy_input)

print(f"Input shape: {dummy_input.shape}")
print(f"Output shape: {output.shape}")

# Verification: the block must return a tensor shaped like its input
print(
    "SUCCESS: Output shape matches requirements."
    if output.shape == (BATCH_SIZE, SEQ_LEN, D_MODEL)
    else "FAILURE: Output shape incorrect."
)

--- Q1: Scaled Dot-Product Attention Test ---
Attention Weights shape: (3, 3)
Context Vector shape: (3, 4)


--- Q2: Transformer Encoder Block Test ---
Input shape: torch.Size([32, 10, 128])
Output shape: torch.Size([32, 10, 128])
SUCCESS: Output shape matches requirements.
