In [None]:
import torch
import torch.nn as nn
import math

# === HISTORY: Why Transformers? ===
# Before transformers, most models for sequences (like text) used RNNs (Recurrent Neural Networks).
# RNNs process data one token at a time, which is slow (the steps cannot run in parallel) and makes long-range dependencies hard to learn.
# Transformers changed this by processing all tokens at once and using "attention" to decide what matters.

# === This class builds a very basic Transformer Encoder model ===
class SimpleTransformer(nn.Module):
    """A minimal Transformer encoder that maps token IDs to per-token vocab scores."""

    def __init__(self, vocab_size, d_model=32, nhead=4, dim_feedforward=64, max_len=100,
                 num_layers=1):
        """
        vocab_size: How many unique words/tokens we have.
        d_model: Size of each word/token embedding vector.
        nhead: Number of attention heads (part of multi-head attention).
        dim_feedforward: Size of internal FFN layer.
        max_len: Max number of tokens in a sequence.
        num_layers: How many encoder layers to stack (default 1, the original depth).
        """
        super().__init__()

        # Step 1: Turn each token into a vector using an embedding
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Step 2: Add positional info (so model knows the order of words)
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        # Step 3: Define the template for one Transformer Encoder Layer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,        # input & output vector size
            nhead=nhead,            # how many attention "heads" to use
            dim_feedforward=dim_feedforward  # internal hidden size
        )

        # Step 4: Stack `num_layers` copies of that layer into a full encoder
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Step 5: Project output to vocab size (like predicting next word)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """
        x: Tensor of shape (batch_size, seq_len), containing token IDs

        Returns a tensor of shape (batch_size, seq_len, vocab_size) with one
        score per vocabulary entry for every input token.
        """
        # Embed the tokens (from IDs to vectors)
        embedded = self.embedding(x)  # shape: (batch, seq_len, d_model)

        # Add position information
        embedded = self.positional_encoding(embedded)

        # The encoder layer was built without batch_first, so it wants
        # (seq_len, batch, d_model)
        embedded = embedded.permute(1, 0, 2)

        # Run through the Transformer Encoder
        encoded = self.transformer_encoder(embedded)

        # Convert back to (batch, seq_len, d_model)
        encoded = encoded.permute(1, 0, 2)

        # Final projection: per-token scores over the vocabulary
        return self.fc_out(encoded)

In [None]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sine/cosine position table to a batch of embeddings."""

    def __init__(self, d_model, max_len=5000):
        """
        PositionalEncoding gives the model info about the position of each word.
        This is critical because the Transformer has no idea of order by default.
        We use sine & cosine functions (from original 2017 paper) to generate these.

        d_model: Size of each embedding vector (may be even or odd).
        max_len: Longest sequence length the precomputed table covers.
        """
        super().__init__()

        # Create matrix of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)

        # Positions from 0 to max_len-1, reshaped to (max_len, 1)
        position = torch.arange(0, max_len).unsqueeze(1)

        # Exponential decay terms for frequency: one per sine/cosine pair
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        # Angle for every (position, frequency) pair: (max_len, ceil(d_model / 2))
        angles = position * div_term

        # Even embedding dimensions = sine, odd embedding dimensions = cosine
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the last (unpaired) frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add a batch dimension so we can add it to embeddings later
        pe = pe.unsqueeze(0)  # shape becomes (1, max_len, d_model)

        # Register buffer so it's saved with the model but not trainable
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        x: Embeddings whose dim 1 is the sequence axis and whose last dim is d_model.

        Returns x with the first x.size(1) rows of the position table added.
        Raises RuntimeError if the sequence is longer than the table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)

        # Fail with a clear message instead of an opaque broadcasting error.
        # RuntimeError is the type the broadcast failure used to raise.
        if seq_len > max_len:
            raise RuntimeError(
                f"sequence length {seq_len} exceeds positional encoding max_len {max_len}"
            )

        # Add positional encoding to input embeddings
        return x + self.pe[:, :seq_len]


In [None]:
# Vocabulary size for the demo (100 distinct token IDs)
vocab_size = 100

# Build the model with its default hyperparameters
model = SimpleTransformer(vocab_size)

# Dummy batch of random token IDs: 2 sequences x 10 tokens each
batch_size, seq_len = 2, 10
x = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))

# Forward pass: one score per vocabulary entry for every token
output = model(x)  # shape = (batch_size, seq_len, vocab_size)

print(output.size())  # Expect: torch.Size([2, 10, 100])


## 🧠 Summary (Why Each Part Exists)

| Component              | Purpose                                         |
|------------------------|-------------------------------------------------|
| `Embedding`            | Turns token IDs into dense vectors              |
| `PositionalEncoding`   | Injects info about word order                   |
| `TransformerEncoderLayer` | Applies attention + feedforward logic       |
| `TransformerEncoder`   | Stack of encoder layers                         |
| `Linear`               | Projects back to token space (e.g., to predict next word) |

---

### ✅ You can train this model on:
- Next-token prediction tasks
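- Simple classification problems (e.g., sentiment) — pool the per-token outputs and replace the final `Linear` with one sized to the number of classes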

---

### 🚀 Let me know if you want:
- A training loop  
- A dataset example (like text)  
- Visualization of attention heads  
- Extension to encoder-decoder (like T5) or decoder-only (like GPT)  
