<a href="https://colab.research.google.com/github/shivavarma2001/HOMEWORK-ML5/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

def softmax(x):
    """Return the softmax of ``x`` taken over its last axis.

    The maximum along the last axis is subtracted before exponentiating.
    This leaves the result mathematically unchanged but keeps ``np.exp``
    from overflowing on large inputs.
    """
    peak = np.max(x, axis=-1, keepdims=True)
    unnormalized = np.exp(x - peak)
    normalizer = np.sum(unnormalized, axis=-1, keepdims=True)
    return unnormalized / normalizer

def scaled_dot_product_attention(Q, K, V):
    """
    Compute Scaled Dot-Product Attention: softmax(Q K^T / sqrt(d_k)) V.

    Inputs may be NumPy arrays or nested sequences (they are converted with
    ``np.asarray``). Optional leading batch dimensions are supported: only
    the last two axes take part in the matrix products, and any leading
    axes follow ``np.matmul`` broadcasting.

    Q: Query matrix (shape: [..., num_queries, d_k])
    K: Key matrix   (shape: [..., num_keys, d_k])
    V: Value matrix (shape: [..., num_keys, d_v])

    Returns:
        attention_weights: softmax-normalized weights
                           (shape: [..., num_queries, num_keys])
        context: weighted sum of values
                 (shape: [..., num_queries, d_v])
    """
    Q, K, V = np.asarray(Q), np.asarray(K), np.asarray(V)

    # Step 1: Compute raw scores (QK^T)
    # Only the last two axes of K are swapped. A bare `.T` reverses *every*
    # axis, which scrambles batch dimensions on inputs with more than 2 axes.
    keys_t = np.swapaxes(K, -1, -2) if K.ndim > 2 else K.T
    scores = np.matmul(Q, keys_t)   # shape -> (..., num_queries, num_keys)

    # Step 2: Scale by sqrt(d_k)
    d_k = K.shape[-1]
    scaled_scores = scores / np.sqrt(d_k)

    # Step 3: Apply softmax (normalizes over the keys axis, i.e. the last one)
    attention_weights = softmax(scaled_scores)

    # Step 4: Multiply attention weights with V
    context = np.matmul(attention_weights, V)

    return attention_weights, context


# ---------------------------
# Worked example: one query attending over three key/value pairs
# ---------------------------

# A single query with three features.
Q = np.array([[1, 0, 1]])

# Three keys, one per row, with the same feature count as the query.
K = np.array([
    [1, 0, 1],
    [0, 1, 0],
    [1, 1, 0],
])

# One two-feature value per key.
V = np.array([
    [1, 2],
    [0, 3],
    [4, 5],
])

attn_weights, context_vector = scaled_dot_product_attention(Q, K, V)

print("Attention Weights:\n", attn_weights)
print("\nContext Vector:\n", context_vector)


Attention Weights:
 [[0.53289684 0.16794345 0.29915971]]

Context Vector:
 [[1.72953569 3.06542259]]


In [2]:
import torch
import torch.nn as nn

class SimpleTransformerEncoderBlock(nn.Module):
    """A single Transformer encoder block with post-layer normalization.

    The block applies multi-head self-attention followed by a two-layer
    ReLU feed-forward network. Each sub-layer is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability, used inside the attention module and
            on both residual branches.
    """

    def __init__(self, d_model=128, num_heads=8, d_ff=512, dropout=0.1):
        super().__init__()

        # batch_first=True -> inputs/outputs are (batch, seq, d_model)
        self.self_attn = nn.MultiheadAttention(embed_dim=d_model, num_heads=num_heads,
                                  dropout=dropout, batch_first=True)

        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, key_padding_mask=None, attn_mask=None):
        """Run the block on ``x``.

        Args:
            x: Input tensor of shape (batch, seq, d_model).
            key_padding_mask: Optional mask forwarded to
                ``nn.MultiheadAttention``; marks key positions (typically
                padding) that must be ignored by attention. Defaults to
                ``None`` (attend to every position).
            attn_mask: Optional attention mask forwarded to
                ``nn.MultiheadAttention`` (e.g. to block specific
                query/key pairs). Defaults to ``None``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.self_attn(
            x, x, x,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            need_weights=False,
        )
        # Residual connection + post-norm around the attention sub-layer.
        x = self.norm1(x + self.dropout1(attn_output))
        # Residual connection + post-norm around the feed-forward sub-layer.
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout2(ffn_output))
        return x

if __name__ == "__main__":
    # Smoke test: feed one random batch through the block and print the
    # input and output shapes, which should be identical.
    batch_size, seq_len, d_model = 32, 10, 128

    x = torch.randn(batch_size, seq_len, d_model)
    encoder = SimpleTransformerEncoderBlock(d_model=d_model, num_heads=8)
    out = encoder(x)

    # Both lines are expected to show torch.Size([32, 10, 128]).
    print("Input shape :", x.shape)
    print("Output shape:", out.shape)


Input shape : torch.Size([32, 10, 128])
Output shape: torch.Size([32, 10, 128])
