### **Task 1 | GPT-2 Model & Checkpoints**

### This code defines a GPT-2-like transformer model in PyTorch. Here's a breakdown of the code:

- **Multi-Head Attention:** The `MultiHeadAttention` class implements the multi-head attention mechanism: the projected queries, keys, and values are split into several heads, and each head computes scaled dot-product attention scores between sequence positions.

- **Feed-Forward Network:** The `FeedForward` class defines a simple feed-forward neural network used within the transformer block.

- **Positional Encoding:** The `PositionalEncoding` class generates positional encodings to provide information about the order of tokens in the input sequence.

- **Transformer Block:** The `TransformerBlock` class encapsulates a single block of the transformer architecture, consisting of multi-head attention, layer normalization, and feed-forward layers.

- **GPT-2 Model:** The `GPT2` class brings together the transformer blocks and other necessary components (embedding, positional encoding, linear layers) to create a GPT-2 model.

- **Sample Usage:** It includes a sample usage section where an instance of the `GPT2` model is created, and a sample input tensor is passed through the model to obtain the output.

This code provides the structure for a basic GPT-2-like model. To train this model or use it for specific tasks, you'd need to define a training loop, prepare your dataset, specify a loss function, and optimize the model parameters using gradient descent.

The `output` tensor represents the model's predictions based on the sample input tensor. Adjusting the `vocab_size` and `seq_length` variables will allow you to tailor the model to your specific use case or dataset.

Ref: a) The [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) paper's architecture.
b) [Andrej Karpathy's nanoGPT](https://github.com/karpathy/nanoGPT) repository.

In [1]:
import torch
import torch.nn as nn

# Define Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are linearly projected, split into ``num_heads`` heads of
    ``d_model // num_heads`` features each, attended per head, merged back
    together and passed through a final linear layer.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Reject indivisible sizes up front: otherwise head_dim * num_heads is
        # smaller than d_model and the `view` in forward() either fails or, for
        # some sequence lengths, succeeds while mixing positions together.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        self.fc_out = nn.Linear(d_model, d_model)

    def _split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, head_dim)."""
        return x.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: Tensor of shape (batch, query_len, d_model).
            key: Tensor of shape (batch, key_len, d_model).
            value: Tensor of shape (batch, key_len, d_model).
            mask: Optional tensor broadcastable to
                (batch, num_heads, query_len, key_len). Entries equal to 0 mark
                positions that must not be attended to.

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size = query.shape[0]

        # Linear transformation for query, key, and value, then split into heads
        Q = self._split_heads(self.query(query), batch_size)
        K = self._split_heads(self.key(key), batch_size)
        V = self._split_heads(self.value(value), batch_size)

        # Attention scores, scaled by sqrt(head_dim)
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / (self.head_dim ** 0.5)
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-1e20'))  # Masked positions get large negative energy

        attention = torch.nn.functional.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)

        # Merge the heads back into a single d_model-sized feature axis
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)

        # Linear transformation for output
        x = self.fc_out(x)

        return x

# Define Feed-Forward Network
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension of its input.

    Args:
        d_model: Size of the input and output feature dimension.
        hidden_dim: Size of the intermediate (expanded) feature dimension.
    """

    def __init__(self, d_model, hidden_dim):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, d_model)

    def forward(self, x):
        """Expand to ``hidden_dim``, apply ReLU, then project back to ``d_model``."""
        hidden = torch.relu(self.linear1(x))
        return self.linear2(hidden)

# Define Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``. The table is stored as the non-trainable buffer
    ``pe`` with shape (1, max_len, d_model).

    Args:
        d_model: Feature size of the embeddings (even or odd).
        max_len: Number of positions in the table; inputs longer than this
            are not supported.
    """

    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).
        """
        x = x + self.pe[:, :x.size(1)]
        return x

# Define Transformer Block
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention followed by a feed-forward network.

    Each sub-layer's output is added to its input and the sum is passed
    through its own ``LayerNorm`` (normalisation happens after the residual add).

    Args:
        d_model: Feature size of the hidden states.
        num_heads: Number of attention heads.
        hidden_dim: Inner size of the feed-forward network.
    """

    def __init__(self, d_model, num_heads, hidden_dim):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run the layer on ``x``; ``mask`` is handed to the attention module unchanged."""
        # Self-attention: the same tensor is used as query, key and value.
        after_attention = self.layer_norm1(x + self.attention(x, x, x, mask))
        return self.layer_norm2(after_attention + self.feed_forward(after_attention))

# Define GPT-2 Model
class GPT2(nn.Module):
    """Stack of transformer blocks between a token embedding and a vocabulary projection.

    Args:
        vocab_size: Number of rows in the embedding table and size of the output logits.
        d_model: Hidden feature size.
        num_heads: Attention heads per block.
        num_layers: Number of transformer blocks.
        hidden_dim: Inner size of each block's feed-forward network.
        max_len: Number of positions in the positional-encoding table.

    NOTE(review): ``forward`` does not build a causal mask itself; with
    ``mask=None`` the blocks receive no mask. Callers that need
    autoregressive behaviour presumably have to pass one -- confirm.
    """

    def __init__(self, vocab_size, d_model=768, num_heads=12, num_layers=12, hidden_dim=3072, max_len=512):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(d_model, num_heads, hidden_dim))
        self.transformer_blocks = nn.ModuleList(blocks)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Map token ids ``x`` to per-position logits over the vocabulary."""
        hidden = self.positional_encoding(self.token_embeddings(x))
        for block in self.transformer_blocks:
            hidden = block(hidden, mask)
        return self.fc(hidden)

# Smoke test: build the default GPT2 model and push one random batch through it
vocab_size = 10000  # Replace with your actual vocabulary size
seq_length = 50  # Replace with the desired sequence length

model = GPT2(vocab_size)

# One random sequence of token ids (batch size of 1 for simplicity)
sample_input = torch.randint(low=0, high=vocab_size, size=(1, seq_length))

# Forward pass; no mask is supplied
output = model(sample_input)

# Report the shape of the logits, then the logits themselves
print(f"Output shape: {output.shape}")

print(output)


Output shape: torch.Size([1, 50, 10000])
tensor([[[ 0.9271, -0.3426,  0.5934,  ..., -0.1170, -1.0391, -0.2518],
         [ 0.4209, -1.0054,  1.1158,  ..., -0.3033, -1.0238, -0.5546],
         [ 0.2445, -0.9566,  1.4994,  ..., -0.3522, -0.3304, -0.4663],
         ...,
         [ 0.4212, -0.2445,  0.4920,  ..., -0.3525, -0.1931, -0.8938],
         [-0.0057,  0.2506,  0.8035,  ..., -0.8767, -0.3137, -1.0402],
         [ 0.1285, -1.0617, -0.0357,  ..., -0.4016, -0.7944, -0.5287]]],
       grad_fn=<ViewBackward0>)


### **Task 2 | Transformer Architectural Changes**

### Rotary Positional Embedding:

The `RotaryPositionalEmbedding` class is an implementation of Rotary Positional Embedding, which is an alternative to the standard positional encoding in transformer-based models. To explore more about Rotary Positional Embeddings, you can refer to the following paper: [Rotary Position Embeddings](https://arxiv.org/pdf/2104.09864.pdf).


In [2]:
class RotaryPositionalEmbedding(nn.Module):
    """Rotate feature pairs of a sequence by position-dependent angles (RoPE).

    Feature ``i`` in the first half of the last dimension is paired with
    feature ``i + d_model // 2`` in the second half, and each pair at sequence
    position ``p`` is rotated by the angle ``p * 10000 ** (-2i / d_model)``.
    Because this is a pure rotation, vector norms are preserved, position 0 is
    left unchanged, and the dot product of two rotated vectors depends only on
    the difference of their positions.

    The sine/cosine tables are fixed, so they are registered as buffers (they
    follow ``.to(device)`` and appear in ``state_dict`` but are not returned by
    ``parameters()`` and are never updated by an optimizer).

    Args:
        d_model: Size of the last dimension of the inputs; must be even.
        max_len: Number of positions for which angles are precomputed.

    Raises:
        ValueError: If ``d_model`` is odd.
    """

    def __init__(self, d_model, max_len=512):
        super(RotaryPositionalEmbedding, self).__init__()
        if d_model % 2 != 0:
            raise ValueError(f"d_model ({d_model}) must be even for rotary embeddings")
        self.d_model = d_model
        self.max_len = max_len

        self.init_rotary_embeddings()

    def init_rotary_embeddings(self):
        """Precompute the (max_len, d_model // 2) sine and cosine tables."""
        inv_freq = 1.0 / (10000 ** (torch.arange(0, self.d_model, 2.0) / self.d_model))
        position = torch.arange(0, self.max_len, dtype=torch.float).unsqueeze(1)
        angles = position * inv_freq

        self.register_buffer('pos_embedding_sin', torch.sin(angles))
        self.register_buffer('pos_embedding_cos', torch.cos(angles))

    def forward(self, x):
        """Apply the rotation to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        sin = self.pos_embedding_sin[:seq_len, :]
        cos = self.pos_embedding_cos[:seq_len, :]

        half = self.d_model // 2
        first, second = x[..., :half], x[..., half:]
        # 2-D rotation of every (first_i, second_i) pair by its position's angle.
        return torch.cat([first * cos - second * sin, first * sin + second * cos], dim=-1)


### Group Query Attention:

This code defines a GroupQueryAttention module, which is an altered version of the Multi-Head Attention mechanism. This attention mechanism divides the input into multiple groups and performs attention separately within each group.

Ref: The Group Query Attention mechanism follows the insights from [Ainslie et al., GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints](https://arxiv.org/pdf/2305.13245v2.pdf).

In [3]:
class GroupQueryAttention(nn.Module):
    """Multi-head attention whose Q/K/V projections act on feature groups.

    The feature dimension is split into ``num_groups`` contiguous slices of
    ``d_model // num_groups`` features. The same three linear layers project
    every slice into ``num_heads // num_groups`` heads of ``head_dim``
    features, giving ``num_heads`` heads in total. Scaled dot-product
    attention is then computed per head across sequence positions, and the
    heads are merged and passed through ``fc_out``.

    NOTE(review): in Ainslie et al.'s GQA, groups of query heads share a
    single key/value head. This module keeps the existing per-group
    projection layout instead -- confirm which design is intended.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Total number of attention heads.
        num_groups: Number of feature groups.
    """

    def __init__(self, d_model, num_heads, num_groups=4):
        super(GroupQueryAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.num_groups = num_groups
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        assert d_model % num_groups == 0, "d_model must be divisible by num_groups"
        assert num_heads % num_groups == 0, "num_heads must be divisible by num_groups"

        self.group_dim = d_model // num_groups
        self.group_heads = num_heads // num_groups

        # Linear transformations for queries, keys, and values for each group
        self.query = nn.Linear(self.group_dim, self.group_heads * self.head_dim)
        self.key = nn.Linear(self.group_dim, self.group_heads * self.head_dim)
        self.value = nn.Linear(self.group_dim, self.group_heads * self.head_dim)

        # Output projection
        self.fc_out = nn.Linear(d_model, d_model)

    def _project(self, layer, x, batch_size):
        """Project ``x`` group-wise and return (batch, num_heads, seq, head_dim)."""
        # (batch, seq, d_model) -> (batch, seq, groups, group_dim)
        grouped = x.reshape(batch_size, -1, self.num_groups, self.group_dim)
        # The linear layer acts on the last axis, i.e. on each group separately:
        # (batch, seq, groups, group_heads * head_dim)
        projected = layer(grouped)
        # Flatten (groups, group_heads) into one head axis and move it ahead of
        # the sequence axis so attention runs over positions, not features.
        heads = projected.reshape(batch_size, -1, self.num_heads, self.head_dim)
        return heads.permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: Tensor of shape (batch, query_len, d_model).
            key: Tensor of shape (batch, key_len, d_model).
            value: Tensor of shape (batch, key_len, d_model).
            mask: Optional tensor broadcastable to
                (batch, num_heads, query_len, key_len). Entries equal to 0 mark
                positions that must not be attended to.

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size = query.shape[0]

        Q = self._project(self.query, query, batch_size)
        K = self._project(self.key, key, batch_size)
        V = self._project(self.value, value, batch_size)

        # Attention scores between positions: (batch, heads, query_len, key_len)
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / (self.head_dim ** 0.5)
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-1e20'))  # Masked positions get large negative energy

        attention = torch.nn.functional.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)

        # Bring the sequence axis back in front of the heads before merging, so
        # every output row contains the features of exactly one position.
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)

        # Linear transformation for output
        x = self.fc_out(x)

        return x

In [4]:
# Define Transformer Block with Group Query Attention and Rotary Positional Embedding
class TransformerBlock_Groupquery_RoPE(nn.Module):
    """Transformer layer built from ``GroupQueryAttention`` and ``RotaryPositionalEmbedding``.

    The layer applies self-attention with a residual connection and layer
    norm, then the positional-embedding module, then a Linear-ReLU-Linear
    feed-forward network with its own residual connection and layer norm.

    Args:
        d_model: Feature size of the hidden states.
        num_heads: Number of attention heads.
        hidden_dim: Inner size of the feed-forward network.
    """

    def __init__(self, d_model, num_heads, hidden_dim):
        super().__init__()
        self.attention = GroupQueryAttention(d_model, num_heads)  # Group Query Attention
        expand = nn.Linear(d_model, hidden_dim)
        contract = nn.Linear(hidden_dim, d_model)
        self.feed_forward = nn.Sequential(expand, nn.ReLU(), contract)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.rotary_pos_embedding = RotaryPositionalEmbedding(d_model)  # Rotary Positional Embedding

    def forward(self, x, mask=None):
        """Run the layer on ``x``; ``mask`` is handed to the attention module unchanged."""
        # Self-attention sub-layer (query, key and value are all x)
        x = self.layer_norm1(x + self.attention(x, x, x, mask))
        # Positional information is injected between the two sub-layers
        x = self.rotary_pos_embedding(x)
        # Feed-forward sub-layer
        return self.layer_norm2(x + self.feed_forward(x))

# Define Modified GPT-2 model with Rotary Positional Embedding and Group Query Attention
class ModifiedGPT2(nn.Module):
    """Token embedding, a stack of ``TransformerBlock_Groupquery_RoPE`` layers, and a vocabulary projection.

    No positional encoding is added to the embeddings here; position handling
    is left to the module inside each block.

    Args:
        vocab_size: Number of rows in the embedding table and size of the output logits.
        d_model: Hidden feature size.
        num_heads: Attention heads per block.
        num_layers: Number of transformer blocks.
        hidden_dim: Inner size of each block's feed-forward network.
    """

    def __init__(self, vocab_size, d_model=768, num_heads=12, num_layers=12, hidden_dim=3072):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, d_model)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock_Groupquery_RoPE(d_model, num_heads, hidden_dim))
        self.transformer_blocks = nn.ModuleList(blocks)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Map token ids ``x`` to per-position logits over the vocabulary."""
        hidden = self.token_embeddings(x)
        for block in self.transformer_blocks:
            hidden = block(hidden, mask)
        return self.fc(hidden)

# Smoke test: build the default ModifiedGPT2 model and push one random batch through it
vocab_size = 10000  # Replace with your actual vocabulary size
seq_length = 50  # Replace with the desired sequence length

model = ModifiedGPT2(vocab_size)

# One random sequence of token ids (batch size of 1 for simplicity)
sample_input = torch.randint(low=0, high=vocab_size, size=(1, seq_length))

# Forward pass; no mask is supplied
output = model(sample_input)

# Report the shape of the logits, then the logits themselves
print(f"Output shape: {output.shape}")

print(output)

Output shape: torch.Size([1, 50, 10000])
tensor([[[-0.4557, -0.2189,  0.4474,  ...,  1.4816, -0.6794,  1.3982],
         [-0.3831,  0.0115,  0.5123,  ...,  1.3253, -0.6766,  1.3296],
         [-0.3300,  0.1373,  0.7185,  ...,  1.2620, -0.7739,  1.3185],
         ...,
         [-0.4795, -0.1452,  1.1642,  ...,  0.8815,  0.3852, -0.1648],
         [-0.2475, -0.3346,  1.1498,  ...,  1.0655,  0.1698, -0.2334],
         [ 0.0063, -0.3363,  1.0891,  ...,  1.3330, -0.1069, -0.2190]]],
       grad_fn=<ViewBackward0>)


### Sliding Window Attention:


The SlidingWindowAttention module implements a variation of the self-attention mechanism where attention is applied within a sliding window across the input sequence. This is useful when dealing with long sequences as it reduces computational complexity compared to standard self-attention while maintaining some degree of local attention.

This mechanism divides the input sequence into overlapping windows and computes attention separately for each window, reducing the computational complexity while retaining some local context information.

Ref: The work by [Beltagy et al., Longformer](https://arxiv.org/pdf/2004.05150v2.pdf) for better comprehension of its implementation and advantages.

In [7]:
class SlidingWindowAttention(nn.Module):
    """Multi-head attention restricted to a local window around each position.

    Query position ``i`` may attend to key position ``j`` only when
    ``abs(i - j) < window_size``. When the sequence is no longer than
    ``window_size`` this is ordinary full attention. Combined with a causal
    ``mask`` each position sees itself and the ``window_size - 1`` positions
    before it.

    The window is enforced by masking a full (query_len, key_len) score
    matrix, so the output always has one row per query position. This does
    not reduce memory use compared with full attention.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
        window_size: Positive reach of the attention window, in positions.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads`` or
            ``window_size`` is not positive.
    """

    def __init__(self, d_model, num_heads, window_size):
        super(SlidingWindowAttention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        if window_size <= 0:
            raise ValueError(f"window_size must be positive, got {window_size}")
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.window_size = window_size

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        self.fc_out = nn.Linear(d_model, d_model)

    def _split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, head_dim)."""
        return x.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to nearby ``key``/``value`` positions.

        Args:
            query: Tensor of shape (batch, query_len, d_model).
            key: Tensor of shape (batch, key_len, d_model).
            value: Tensor of shape (batch, key_len, d_model).
            mask: Optional tensor broadcastable to
                (batch, num_heads, query_len, key_len). Entries equal to 0 mark
                positions that must not be attended to, in addition to the
                positions outside the window.

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size = query.shape[0]

        Q = self._split_heads(self.query(query), batch_size)
        K = self._split_heads(self.key(key), batch_size)
        V = self._split_heads(self.value(value), batch_size)

        # Scale by sqrt(head_dim): each dot product sums over head_dim features.
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / (self.head_dim ** 0.5)

        # Band mask: True where the key lies outside the query's window.
        query_pos = torch.arange(energy.size(-2), device=energy.device).unsqueeze(1)
        key_pos = torch.arange(energy.size(-1), device=energy.device).unsqueeze(0)
        outside_window = (query_pos - key_pos).abs() >= self.window_size
        energy = energy.masked_fill(outside_window, float('-1e20'))

        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-1e20'))

        attention = torch.nn.functional.softmax(energy, dim=-1)
        output = torch.matmul(attention, V)

        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        output = self.fc_out(output)
        return output


In [8]:
# Define Transformer Block with Sliding Window Attention and Rotary Positional Embedding (despite the class name, Group Query Attention is not used here)
class TransformerBlock_Groupquery_RoPE_SWA(nn.Module):
    """Transformer layer built from ``SlidingWindowAttention`` and ``RotaryPositionalEmbedding``.

    Despite the name, the attention module is ``SlidingWindowAttention`` only;
    ``GroupQueryAttention`` is not used. The layer applies self-attention with
    a residual connection and layer norm, then the positional-embedding
    module, then a Linear-ReLU-Linear feed-forward network with its own
    residual connection and layer norm.

    Args:
        d_model: Feature size of the hidden states.
        num_heads: Number of attention heads.
        hidden_dim: Inner size of the feed-forward network.
        window_size: Window size passed to the attention module.
    """

    def __init__(self, d_model, num_heads, hidden_dim, window_size):
        super().__init__()
        self.attention = SlidingWindowAttention(d_model, num_heads, window_size)  # Sliding Window Attention
        expand = nn.Linear(d_model, hidden_dim)
        contract = nn.Linear(hidden_dim, d_model)
        self.feed_forward = nn.Sequential(expand, nn.ReLU(), contract)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.rotary_pos_embedding = RotaryPositionalEmbedding(d_model)  # Rotary Positional Embedding

    def forward(self, x, mask=None):
        """Run the layer on ``x``; ``mask`` is handed to the attention module unchanged."""
        # Self-attention sub-layer (query, key and value are all x)
        x = self.layer_norm1(x + self.attention(x, x, x, mask))
        # Positional information is injected between the two sub-layers
        x = self.rotary_pos_embedding(x)
        # Feed-forward sub-layer
        return self.layer_norm2(x + self.feed_forward(x))

# Define Modified GPT-2 model with Rotary Positional Embedding and Sliding Window Attention (its blocks do not use Group Query Attention)
class ModifiedGPT2_SWA(nn.Module):
    """Token embedding, a stack of ``TransformerBlock_Groupquery_RoPE_SWA`` layers, and a vocabulary projection.

    Args:
        vocab_size: Number of rows in the embedding table and size of the output logits.
        d_model: Hidden feature size.
        num_heads: Attention heads per block.
        num_layers: Number of transformer blocks.
        hidden_dim: Inner size of each block's feed-forward network.
        window_size: Window size handed to every block.
    """

    def __init__(self, vocab_size, d_model=768, num_heads=12, num_layers=12, hidden_dim=3072, window_size=128):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, d_model)
        blocks = []
        for _ in range(num_layers):
            blocks.append(
                TransformerBlock_Groupquery_RoPE_SWA(d_model, num_heads, hidden_dim, window_size)
            )
        self.transformer_blocks = nn.ModuleList(blocks)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Map token ids ``x`` to per-position logits over the vocabulary."""
        hidden = self.token_embeddings(x)
        for block in self.transformer_blocks:
            hidden = block(hidden, mask)
        return self.fc(hidden)

# Smoke test: build the default ModifiedGPT2_SWA model and push one random batch through it
vocab_size = 10000  # Replace with your actual vocabulary size
seq_length = 50  # Replace with the desired sequence length

model3 = ModifiedGPT2_SWA(vocab_size)

# One random sequence of token ids (batch size of 1 for simplicity)
sample_input = torch.randint(low=0, high=vocab_size, size=(1, seq_length))

# Forward pass; no mask is supplied
output = model3(sample_input)

# Report the shape of the logits, then the logits themselves
print(f"Output shape: {output.shape}")
print(output)

Output shape: torch.Size([1, 50, 10000])
tensor([[[ 1.9934,  0.2343,  0.5987,  ...,  1.4065,  1.1938, -0.1305],
         [ 2.1030,  0.2031,  0.5219,  ...,  1.4025,  1.3405, -0.3348],
         [ 1.9928,  0.2200,  0.6767,  ...,  1.3596,  1.4793, -0.7478],
         ...,
         [ 0.3951, -0.2136,  0.5191,  ..., -0.1982,  1.0205,  0.0118],
         [ 0.5461, -0.0633,  0.3845,  ...,  0.0713,  1.0498,  0.0813],
         [ 0.7763,  0.0673,  0.3135,  ...,  0.3663,  1.0490, -0.0674]]],
       grad_fn=<ViewBackward0>)


### **Task 3: Training Loop Implementation**

### 1. Single GPU Training Loop:

In [None]:
# Create an instance of the GPT-2 model.
# `GPT2` is the model class defined in this notebook; no `GPT2SingleGPU`
# class exists in it.
vocab_size = 10000  # Replace with the actual vocabulary size
model = GPT2(vocab_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


# Placeholder: must be replaced by a dataset yielding (inputs, targets) pairs
# of token-id tensors before this cell can run.
train_dataset = ...  # Your training dataset
# Fully qualified names are used so the cell needs no imports beyond `torch`.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


# Training loop
num_epochs = 10  # Set your desired number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0  # Loss summed over the current 100-batch reporting window
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets so that
        # CrossEntropyLoss sees one row per token.
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (batch_idx + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}] Batch [{batch_idx+1}/{len(train_loader)}] Loss: {total_loss / 100:.4f}")
            total_loss = 0.0




### 2. Distributed Data Parallel (DDP):

DistributedDataParallel (DDP) implements data parallelism at the module level which can run across multiple machines. Applications using DDP should spawn multiple processes and create a single DDP instance per process. DDP uses collective communications in the torch.distributed package to synchronize gradients and buffers. More specifically, DDP registers an autograd hook for each parameter given by model.parameters() and the hook will fire when the corresponding gradient is computed in the backward pass.

Ref: [PyTorch's DDP tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html)

In [None]:
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler


# Function to train the model
def train_DDP(rank, world_size):
    """Train the GPT-2 model with DistributedDataParallel in one worker process.

    Intended to be run once per process (for example via
    ``torch.multiprocessing.spawn``), with ``rank`` also used as the CUDA
    device index of that process.

    NOTE(review): ``init_process_group`` is called without an ``init_method``,
    so the rendezvous address is presumably supplied through the
    ``MASTER_ADDR`` / ``MASTER_PORT`` environment variables by the launcher --
    confirm.

    Args:
        rank: Index of this process within the group; also its GPU index.
        world_size: Total number of processes taking part in training.
    """
    torch.manual_seed(1234)
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    try:
        torch.cuda.set_device(rank)

        # Define your dataset and DataLoader.
        # Placeholder: must be replaced by a dataset yielding (inputs, targets)
        # pairs of token-id tensors before this function can run.
        train_dataset = ...  # Your training dataset
        train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, sampler=train_sampler)

        # Create an instance of the GPT-2 model
        vocab_size = 10000  # Replace with the actual vocabulary size
        model = GPT2(vocab_size)
        model.to(rank)
        model = DDP(model, device_ids=[rank])

        # Define loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

        # Training loop
        num_epochs = 10  # Set your desired number of epochs
        for epoch in range(num_epochs):
            # Without set_epoch the sampler reuses the same shuffling order in
            # every epoch.
            train_sampler.set_epoch(epoch)
            model.train()
            total_loss = 0.0  # Loss summed over the current 100-batch reporting window
            for batch_idx, (inputs, targets) in enumerate(train_loader):
                inputs, targets = inputs.to(rank), targets.to(rank)
                optimizer.zero_grad()
                outputs = model(inputs)
                # Flatten (batch, seq, vocab) logits and (batch, seq) targets so
                # that CrossEntropyLoss sees one row per token.
                loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
                # DDP averages gradients across processes during backward().
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                # Report from a single process to avoid duplicated log lines.
                if rank == 0 and (batch_idx + 1) % 100 == 0:
                    print(f"Epoch [{epoch+1}/{num_epochs}] Batch [{batch_idx+1}/{len(train_loader)}] Loss: {total_loss / 100:.4f}")
                    total_loss = 0.0
    finally:
        # Always release the process group, even if training raised.
        dist.destroy_process_group()



### 3. Fully Sharded Data Parallel (FSDP)

Ref: [Ren et al., 2021, ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/pdf/2101.06840.pdf)

In [None]:
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import ShardingStrategy


# Define the FSDP model and optimizer.
# The class is imported above under the alias `FSDP`. The optimizer is created
# after wrapping so that it receives the wrapped module's parameters.
# NOTE(review): FSDP needs an initialised process group; `model`, `num_epochs`,
# `dataloader` and `loss_function` are expected to be defined by earlier
# cells -- `dataloader` and `loss_function` are not defined in this notebook.
fsdp_model = FSDP(model)  # Create an FSDP-wrapped model
optimizer = torch.optim.Adam(fsdp_model.parameters())


# Training loop
for epoch in range(num_epochs):
    for data in dataloader:
        optimizer.zero_grad()
        input_data, target = data

        # Forward pass
        output = fsdp_model(input_data)
        loss = loss_function(output, target)

        # Backward pass. FSDP reduces gradients across ranks as part of
        # backward(); the wrapper exposes no `reduce()` or
        # `sync_optimizer_state()` method to call afterwards.
        loss.backward()

        # Optimizer step
        optimizer.step()
