# Understanding the Transformer Architecture
In this tutorial, we'll delve into the Transformer architecture, a groundbreaking model in the NLP domain. We'll use PyTorch to implement the model and understand its components.

##Prerequisites:
Basic understanding of PyTorch.
Familiarity with deep learning concepts.
##Setting up the Environment
First, let's set up our Colab environment:

In [90]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math

import re

## 1. Self-Attention Mechanism
The self-attention mechanism allows the model to weigh the importance of different words in a sequence relative to a particular word.

In [91]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Scaled dot-product attention
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        attention = torch.nn.functional.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        # Store attention weights
        self.attention_weights = attention

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out


## 2. Transformer Block
The Transformer block consists of the self-attention mechanism and a feed-forward neural network.

In [92]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Add & norm
        x = self.norm1(attention + query)
        x = self.dropout(x)
        forward = self.feed_forward(x)
        out = self.norm2(forward + x)
        out = self.dropout(out)
        return out


## 3. Encoder
The encoder consists of multiple Transformer blocks.

In [93]:
class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )

        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out


## 4. Decoder
The decoder also consists of multiple Transformer blocks but also has an additional feed-forward neural network at the end to produce predicted tokens.

In [94]:
class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, trg_mask)

        out = self.fc_out(x)
        return out


## 5. Transformer
Finally, the Transformer model combines the encoder and decoder.

In [95]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cuda",
        max_length=100,
    ):
        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )
        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out


In [96]:
# Function to clean text
def clean_text(text):
    # Remove chapter titles and other unwanted formatting
    # This regex captures "CHAPTER X" and "THE TITLE"
    text = re.sub(r'CHAPTER\s+\d+.*\n', '', text)  # Remove chapter titles
    # text = re.sub(r'THE\s+[A-Z\s]+', '', text)  # Remove section titles (in all caps)
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces and line breaks
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters, keeping only letters and spaces
    text = text.lower()  # Convert to lowercase for uniformity
    return text.strip()

In [97]:
with open('wizard_of_oz.txt', 'r') as handle:
    text = handle.read()

text = clean_text(text)

In [98]:
token = text.split()
vocab = set(token)
vocab_size = len(vocab)

word2idx = {word:i + 1 for i, word in enumerate(vocab)}
idx2word = {i + 1:word for i, word in enumerate(vocab)}

token_int = [word2idx[e] for e in token]


In [99]:
seq_len = 5
data = []
for i in range(0, len(token) - seq_len):
    data.append((torch.tensor(token_int[i:i + seq_len]), torch.tensor(token_int[i + 1: i + seq_len + 1])))

data[0]

(tensor([2170, 1648, 2170, 1586, 2565]),
 tensor([1648, 2170, 1586, 2565, 3289]))

In [100]:
data[0], max(token_int), min(token_int)

((tensor([2170, 1648, 2170, 1586, 2565]),
  tensor([1648, 2170, 1586, 2565, 3289])),
 4212,
 1)

In [103]:
# Assuming all the previous classes (SelfAttention, TransformerBlock, Encoder, Decoder, Transformer) are already defined

import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns

# Check for device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
SRC_VOCAB_SIZE = 4213  # Update to match max token index
TRG_VOCAB_SIZE = 4213  # Update to match max token index
EMBED_SIZE = 512
NUM_HEADS = 8
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
DROPOUT = 0.10
MAX_LENGTH = 10
FORWARD_EXPANSION = 4
SRC_PAD_IDX = 4212  # Adjust if necessary
TRG_PAD_IDX = 4212  # Adjust if necessary


device = torch.device("cpu")  # Use CPU

print(data[0])

# Model, Optimizer, and Loss
model = Transformer(
    SRC_VOCAB_SIZE,
    TRG_VOCAB_SIZE,
    SRC_PAD_IDX,
    TRG_PAD_IDX,
    EMBED_SIZE,
    NUM_ENCODER_LAYERS,
    FORWARD_EXPANSION,
    NUM_HEADS,
    DROPOUT,
    device,
    MAX_LENGTH,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)

# Training Loop with tqdm progress bar
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    progress_bar = tqdm(data, desc=f"Epoch {epoch + 1}/{NUM_EPOCHS}", leave=False)
    for src, trg in progress_bar:
        print('src, trg: ', src.shape, trg.shape)
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src.unsqueeze(0), trg.unsqueeze(0))
        print(output.shape)
        output = output.squeeze(0)

        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': total_loss / (len(data) / (BATCH_SIZE if BATCH_SIZE < len(data) else len(data)))})
        break
    break



(tensor([2170, 1648, 2170, 1586, 2565]), tensor([1648, 2170, 1586, 2565, 3289]))


                                                     

src, trg:  torch.Size([5]) torch.Size([5])
torch.Size([1, 5, 4213])




RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
