## Coding Assignment: 
## Name: A Sneha Roopa Sri
## Implementation and Optimization of GPT-2 Model
## Task 1 | GPT-2 Model & Checkpoints 

Creating a complete implementation of GPT-2, including all the details such as multi-head self-attention, feed-forward networks, and positional encoding, is beyond the scope of a single response. However, I can provide you with a simplified example of a single transformer block, and you can expand it to create a full GPT-2 model. For simplicity, we'll use a single transformer layer, but in practice, GPT-2 uses multiple layers.

In [1]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask):
        # Linearly project the queries, keys, and values
        Q = self.W_q(query)
        K = self.W_k(key)
        V = self.W_v(value)

        # Split the projections into multiple heads
        Q = Q.view(Q.size(0), -1, self.n_heads, self.head_dim).transpose(1, 2)
        K = K.view(K.size(0), -1, self.n_heads, self.head_dim).transpose(1, 2)
        V = V.view(V.size(0), -1, self.n_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(self.head_dim)
        scores = scores.masked_fill(mask == 0, -1e9)
        attention = nn.functional.softmax(scores, dim=-1)

        # Apply attention to values
        x = torch.matmul(attention, V)

        # Concatenate and linearly project
        x = x.transpose(1, 2).contiguous().view(x.size(0), -1, self.n_heads * self.head_dim)
        x = self.W_o(x)

        return x

class FeedForward(nn.Module):
    def __init__(self, d_model, ff_hidden_dim):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, ff_hidden_dim)
        self.dropout = nn.Dropout(0.1)
        self.linear2 = nn.Linear(ff_hidden_dim, d_model)

    def forward(self, x):
        x = nn.functional.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, ff_hidden_dim):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.feed_forward = FeedForward(d_model, ff_hidden_dim)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        attention_output = self.attention(x, x, x, mask)
        x = x + attention_output
        x = self.layer_norm1(x)

        ff_output = self.feed_forward(x)
        x = x + ff_output
        x = self.layer_norm2(x)

        return x

# Example usage:
d_model = 256
n_heads = 8
ff_hidden_dim = 512

transformer_block = TransformerBlock(d_model, n_heads, ff_hidden_dim)


In [None]:
! pip install transformers

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

input_text = "Hello, how are you?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

output = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, top_k=50)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Input:", input_text)
print("Output:", output_text)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Hello, how are you?
Output: Hello, how are you?

"I'm fine," I said. "It's just that I don't know what to do with myself. I'm not sure if I'll ever be able to get out of here, or whether I


## Task 2 | Transformer Architectural Changes (40 Points)
## Rotary Positional Embedding:

Implementing Rotary Positional Embedding involves creating a new class (RotaryEmbedding) and modifying the existing GPT-2 model to incorporate this new positional embedding. Below is the code for Rotary Positional Embedding:

In this code, RotaryEmbedding is used to generate rotary positional embeddings, and GPT2WithRotaryPositionalEmbedding incorporates these embeddings into the GPT-2 model. Adjust the hyperparameters and the code structure as needed for your specific use case.

In [4]:
import torch
import torch.nn as nn
import math

class RotaryEmbedding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super(RotaryEmbedding, self).__init__()
        self.d_model = d_model
        self.max_len = max_len

        self.freq = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        self.freq = nn.Parameter(self.freq, requires_grad=False)

    def forward(self, positions):
        angle = positions.unsqueeze(1) * self.freq
        angle = angle.float()

        sin_vals = torch.sin(angle)
        cos_vals = torch.cos(angle)

        pos_embedding = torch.cat([sin_vals, cos_vals], dim=-1)
        pos_embedding = pos_embedding.unsqueeze(0)

        return pos_embedding

class GPT2WithRotaryPositionalEmbedding(nn.Module):
    def __init__(self, d_model=768, n_heads=12, ff_hidden_dim=3072, n_layers=12, vocab_size=30000):
        super(GPT2WithRotaryPositionalEmbedding, self).__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.rotary_embedding = RotaryEmbedding(d_model)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, ff_hidden_dim) for _ in range(n_layers)
        ])

    def forward(self, input_ids):
        seq_len = input_ids.size(1)
        positions = torch.arange(0, seq_len, dtype=torch.long, device=input_ids.device)

        word_embeddings = self.embedding(input_ids)
        rotary_embeddings = self.rotary_embedding(positions)

        embeddings = word_embeddings + rotary_embeddings

        for transformer_block in self.transformer_blocks:
            embeddings = transformer_block(embeddings, mask=None)

        return embeddings


## Group Query Attention:

In this code, GroupQueryAttention divides the queries, keys, and values into subgroups based on the number of heads (n_heads). The attention scores are computed for each subgroup independently, and the outputs are concatenated. The TransformerBlockWithGroupQueryAttention class integrates this attention mechanism into a transformer block. Adjustments may be needed based on specific requirements and compatibility with the overall GPT-2 model.

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GroupQueryAttention(nn.Module):
    def __init__(self, d_model, n_heads, group_size):
        super(GroupQueryAttention, self).__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.group_size = group_size

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask):
        batch_size, seq_len, _ = query.size()

        # Group queries into subgroups
        query = query.view(batch_size, seq_len, self.n_heads, self.head_dim)
        query_groups = query.chunk(self.n_heads, dim=2)

        # Group keys and values into subgroups
        key = key.view(batch_size, seq_len, self.n_heads, self.head_dim)
        key_groups = key.chunk(self.n_heads, dim=2)
        value = value.view(batch_size, seq_len, self.n_heads, self.head_dim)
        value_groups = value.chunk(self.n_heads, dim=2)

        # Compute attention scores for each subgroup
        attention_scores = 0
        for i in range(self.n_heads):
            query_i = query_groups[i]
            key_i = key_groups[i]
            value_i = value_groups[i]

            attention_scores_i = torch.matmul(query_i, key_i.transpose(-2, -1)) / torch.sqrt(self.head_dim)
            attention_scores_i = attention_scores_i.masked_fill(mask == 0, -1e9)
            attention_scores_i = F.softmax(attention_scores_i, dim=-1)

            # Apply attention to values and concatenate
            attention_output_i = torch.matmul(attention_scores_i, value_i)
            attention_scores = attention_scores + attention_output_i

        # Concatenate attention outputs from subgroups
        attention_output = attention_scores.view(batch_size, seq_len, -1)

        # Linear projection
        x = self.W_o(attention_output)

        return x

class TransformerBlockWithGroupQueryAttention(nn.Module):
    def __init__(self, d_model, n_heads, ff_hidden_dim, group_size):
        super(TransformerBlockWithGroupQueryAttention, self).__init__()
        self.attention = GroupQueryAttention(d_model, n_heads, group_size)
        self.feed_forward = FeedForward(d_model, ff_hidden_dim)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        attention_output = self.attention(x, x, x, mask)
        x = x + attention_output
        x = self.layer_norm1(x)

        ff_output = self.feed_forward(x)
        x = x + ff_output
        x = self.layer_norm2(x)

        return x


## GPT2WithSlidingWindowAttention:

This code includes the SlidingWindowAttention mechanism, the TransformerBlockWithSlidingWindowAttention class, and the GPT2WithSlidingWindowAttention model. Adjust the hyperparameters, such as d_model, n_heads, ff_hidden_dim, n_layers, and window_size, based on your requirements. Ensure that the code fits into your overall GPT-2 architecture and is compatible with your training pipeline.

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SlidingWindowAttention(nn.Module):
    def __init__(self, d_model, n_heads, window_size):
        super(SlidingWindowAttention, self).__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.window_size = window_size

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask):
        batch_size, seq_len, _ = query.size()

        # Calculate attention scores using sliding window mechanism
        attention_scores = []
        for i in range(seq_len):
            start_idx = max(0, i - self.window_size + 1)
            end_idx = min(seq_len, i + self.window_size)

            query_window = query[:, start_idx:end_idx, :]
            key_window = key[:, start_idx:end_idx, :]
            value_window = value[:, start_idx:end_idx, :]

            scores = torch.matmul(query_window, key_window.transpose(-2, -1)) / torch.sqrt(self.head_dim)
            scores = scores.masked_fill(mask[:, start_idx:end_idx, :] == 0, -1e9)
            scores = F.softmax(scores, dim=-1)

            attention_scores.append(scores)

        # Combine attention scores from all positions
        attention_scores = torch.stack(attention_scores, dim=1)

        # Apply attention to values
        attention_output = torch.matmul(attention_scores, value)

        # Concatenate attention outputs from different positions
        x = attention_output.view(batch_size, seq_len, -1)

        # Linear projection
        x = self.W_o(x)

        return x

class TransformerBlockWithSlidingWindowAttention(nn.Module):
    def __init__(self, d_model, n_heads, ff_hidden_dim, window_size):
        super(TransformerBlockWithSlidingWindowAttention, self).__init__()
        self.attention = SlidingWindowAttention(d_model, n_heads, window_size)
        self.feed_forward = FeedForward(d_model, ff_hidden_dim)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        attention_output = self.attention(x, x, x, mask)
        x = x + attention_output
        x = self.layer_norm1(x)

        ff_output = self.feed_forward(x)
        x = x + ff_output
        x = self.layer_norm2(x)

        return x

class GPT2WithSlidingWindowAttention(nn.Module):
    def __init__(self, d_model=768, n_heads=12, ff_hidden_dim=3072, n_layers=12, window_size=5):
        super(GPT2WithSlidingWindowAttention, self).__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlockWithSlidingWindowAttention(d_model, n_heads, ff_hidden_dim, window_size) for _ in range(n_layers)
        ])

    def forward(self, input_ids):
        word_embeddings = self.embedding(input_ids)

        for transformer_block in self.transformer_blocks:
            word_embeddings = transformer_block(word_embeddings, mask=None)

        return word_embeddings


# Task 3: Training Loop Implementation

In [6]:
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader
from models import GPT2
from torch.utils.data import DataLoader

# Assume you have a GPT-2 model and dataset defined
model = GPT2(embed_size=256, heads=8, num_layers=6, vocab_size=10000)
dataset = rDataset(...)

# Initialize Distributed Data Parallel (DDP) if available
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# Initialize Fully Sharded Data Parallel (FSDP) if available
if fsdp_available:
    from fsdp import FullyShardedDataParallel
    model = FullyShardedDataParallel(model)

# Set device and move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Define optimizer and other necessary components
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 10
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()

        # Use autocast for mixed-precision training (optional)
        with autocast():
            outputs = model(inputs)

            # Define your loss function and compute the loss
            loss = your_loss_function(outputs, targets)

        scaler = GradScaler()
        scaler.scale(loss).backward()

        # Gradient clipping if needed
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        scaler.step(optimizer)
        scaler.update()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
    torch.save(model.state_dict(), "model.pth")


ModuleNotFoundError: No module named 'models'