# Task 1 || GPT-2 Model & Checkpoints

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Configuration for a small GPT-2 model
class GPT2Config:
    """Hyperparameters for the GPT-2 style model defined in this file.

    Every setting is a plain class attribute, so it can be read from the class
    itself or from an instance.
    """

    # Vocabulary size and number of learned position slots.
    vocab_size: int = 100_000
    max_position_embeddings: int = 768

    # Depth, heads per layer, and embedding width.
    n_layers: int = 12
    n_heads: int = 12
    n_embd: int = 512

    # Numerical settings.
    layer_norm_epsilon: float = 1e-5
    initializer_range: float = 0.02


config = GPT2Config()

# Define the scaled dot product attention function
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute ``softmax(Q K^T / sqrt(d)) V``.

    Args:
        query: Tensor of shape ``(..., q_len, d)``.
        key: Tensor of shape ``(..., k_len, d)``.
        value: Tensor of shape ``(..., k_len, d_v)``.
        mask: Optional boolean tensor broadcastable to
            ``(..., q_len, k_len)``. Positions where it is ``False`` are
            excluded from attention (e.g. pass a lower-triangular mask for
            causal attention). A query row with every position masked out
            yields NaN, so each row should keep at least one ``True``.

    Returns:
        Tensor of shape ``(..., q_len, d_v)``.
    """
    # Transposing the last two axes (instead of fixed axes 1 and 2) keeps the
    # 3-D behaviour and also accepts extra leading batch/head dimensions.
    scores = (query @ key.transpose(-2, -1)) / math.sqrt(query.size(-1))
    if mask is not None:
        scores = scores.masked_fill(~mask, float("-inf"))
    # The functional softmax avoids building a throwaway nn.Softmax module on
    # every call.
    return F.softmax(scores, dim=-1) @ value

# Define a single head for the Multi-Head Attention
class AttentionHead(nn.Module):
    """One self-attention head with its own query/key/value projections."""

    def __init__(self, embd_dim):
        super().__init__()
        # Three independent embd_dim -> embd_dim projections, registered in
        # the order query, key, value.
        for projection_name in ("query", "key", "value"):
            setattr(self, projection_name, nn.Linear(embd_dim, embd_dim))

    def forward(self, hidden_state):
        """Project ``hidden_state`` to Q, K and V and attend over it."""
        q = self.query(hidden_state)
        k = self.key(hidden_state)
        v = self.value(hidden_state)
        return scaled_dot_product_attention(q, k, v)

# Define the Multi-Head Attention layer
class MultiHeadAttention(nn.Module):
    """Run several attention heads side by side and merge their outputs."""

    def __init__(self, embd_dim, n_heads):
        super().__init__()
        self.heads = nn.ModuleList(AttentionHead(embd_dim) for _ in range(n_heads))
        # Every head emits embd_dim features, so the concatenation is
        # n_heads * embd_dim wide before being projected back to embd_dim.
        concat_width = n_heads * embd_dim
        self.linear = nn.Linear(concat_width, embd_dim)

    def forward(self, hidden_state):
        """Apply every head to ``hidden_state`` and project the concatenation."""
        per_head_outputs = tuple(head(hidden_state) for head in self.heads)
        merged = torch.cat(per_head_outputs, dim=-1)
        return self.linear(merged)

# Define the Pointwise Feed Forward layer
class PointwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear."""

    def __init__(self, embd_dim, ff_dim):
        super().__init__()
        # Expand to the inner width, then project back to the model width.
        self.linear1 = nn.Linear(in_features=embd_dim, out_features=ff_dim)
        self.linear2 = nn.Linear(in_features=ff_dim, out_features=embd_dim)

    def forward(self, hidden_state):
        """Return the feed-forward transform of ``hidden_state``."""
        expanded = self.linear1(hidden_state)
        activated = F.relu(expanded)
        return self.linear2(activated)

# Define a single Transformer block
class TransformerBlock(nn.Module):
    """Transformer layer with two residual sub-layers.

    Each sub-layer (self-attention, then feed-forward) is added to its input
    and the sum is passed through a LayerNorm.
    """

    def __init__(self, embd_dim, n_heads, ff_dim, layer_norm_epsilon):
        super().__init__()
        self.attention = MultiHeadAttention(embd_dim, n_heads)
        self.feed_forward = PointwiseFeedForward(embd_dim, ff_dim)
        self.layer_norm1 = nn.LayerNorm(embd_dim, eps=layer_norm_epsilon)
        self.layer_norm2 = nn.LayerNorm(embd_dim, eps=layer_norm_epsilon)

    def forward(self, hidden_state):
        """Return the block output for ``hidden_state`` (same shape)."""
        # Sub-layer 1: self-attention + residual, then normalise.
        attended = self.layer_norm1(hidden_state + self.attention(hidden_state))
        # Sub-layer 2: feed-forward + residual, then normalise.
        return self.layer_norm2(attended + self.feed_forward(attended))

# Define the full GPT-2 model
class GPT2(nn.Module):
    """Stack of transformer blocks over token + learned position embeddings.

    ``forward`` returns the final hidden states; no language-model head is
    applied in this class.

    Args:
        config: Object exposing ``vocab_size``, ``max_position_embeddings``,
            ``n_layers``, ``n_heads``, ``n_embd`` and ``layer_norm_epsilon``.
    """

    def __init__(self, config):
        super().__init__()
        self.embd_dim = config.n_embd
        self.token_embedding = nn.Embedding(config.vocab_size, self.embd_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, self.embd_dim)
        # The feed-forward inner width is fixed at 4x the embedding width.
        self.blocks = nn.ModuleList(
            [
                TransformerBlock(
                    self.embd_dim,
                    config.n_heads,
                    4 * self.embd_dim,
                    config.layer_norm_epsilon,
                )
                for _ in range(config.n_layers)
            ]
        )
        self.layer_norm = nn.LayerNorm(self.embd_dim, eps=config.layer_norm_epsilon)

    def forward(self, input_ids, positions_ids=None):
        """Encode a batch of token ids.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.
            positions_ids: Optional integer tensor of position indices that
                broadcasts against ``input_ids``. Defaults to
                ``0 .. seq_len - 1`` for every row.

        Returns:
            Tensor of shape ``(batch, seq_len, n_embd)``.

        Raises:
            IndexError: If default positions are requested for a sequence
                longer than the position-embedding table.
        """
        if positions_ids is None:
            seq_length = input_ids.size(1)
            max_positions = self.position_embedding.num_embeddings
            # Fail early with a readable message: an out-of-range embedding
            # lookup is an opaque IndexError on CPU and a device-side assert
            # on CUDA.
            if seq_length > max_positions:
                raise IndexError(
                    f"sequence length {seq_length} exceeds the maximum of "
                    f"{max_positions} positions supported by the model"
                )
            # Build the indices directly on the input's device rather than
            # allocating on the CPU and copying afterwards.
            positions_ids = torch.arange(seq_length, device=input_ids.device).unsqueeze(0)

        # Add positional encoding
        x = self.token_embedding(input_ids) + self.position_embedding(positions_ids)

        for block in self.blocks:
            x = block(x)

        return self.layer_norm(x)

# Example usage: build the model, run one forward pass on a batch of random
# token ids (1 sequence of 768 tokens) and print the resulting hidden states.
model = GPT2(config)
input_ids = torch.randint(low=0, high=config.vocab_size, size=(1, 768))
output = model(input_ids)
print(output)


# Task 2 || Transformer Architectural Changes

## Rotary Positional Embedding

In [None]:
import torch

def Rotary_Positional_Embedding(x, sincos):
    """Apply rotary positional embedding to ``x``.

    The last dimension of ``x`` is read as consecutive pairs
    ``(x[2i], x[2i + 1])``. Pair ``i`` is rotated by the angle whose sine and
    cosine are ``sin[..., i]`` and ``cos[..., i]``:

        out[2i]     = x[2i] * cos_i - x[2i + 1] * sin_i
        out[2i + 1] = x[2i + 1] * cos_i + x[2i] * sin_i

    Args:
        x: Tensor whose last dimension has even size ``d``.
        sincos: Pair ``(sin, cos)`` of tensors whose last dimension is
            ``d // 2`` and which broadcast against ``x`` once every entry has
            been repeated twice along that dimension.

    Returns:
        Tensor with the same shape as ``x``.

    Raises:
        ValueError: If the last dimension of ``x`` is odd.
    """
    if x.size(-1) % 2 != 0:
        raise ValueError(
            f"last dimension of x must be even, got {x.size(-1)}"
        )

    sin, cos = (t.repeat_interleave(2, dim=-1) for t in sincos)

    # Map every pair (a, b) to (-b, a). A plain torch.roll is not a rotation:
    # it has no sign flip and wraps the last feature around to the first,
    # mixing features that belong to different pairs.
    even, odd = x[..., 0::2], x[..., 1::2]
    rotated = torch.stack((-odd, even), dim=-1).flatten(start_dim=-2)

    return (x * cos) + (rotated * sin)

## Grouped-Query Attention

In [None]:
def Group_Query_Attention(query, key, value, num_groups):
    """Attend with several query groups that share one key/value pair.

    The feature dimension of ``query`` is split into ``num_groups`` equal
    slices. Each slice is passed, together with the same ``key`` and
    ``value``, to ``scaled_dot_product_attention``, and the per-group results
    are concatenated along the last dimension.

    Args:
        query: 3-D tensor ``(batch, seq_len, num_groups * group_size)``.
        key: Tensor shared by all groups. Each query slice is multiplied
            with it, so its last dimension must equal ``group_size``.
        value: Tensor shared by all groups.
        num_groups: Number of slices to split the query features into; must
            divide ``query.size(2)`` evenly.

    Returns:
        Tensor holding the per-group attention outputs concatenated on the
        last dimension.
    """
    # Split queries into groups. reshape (unlike view) also works when the
    # query is non-contiguous, e.g. the result of a transpose or permute.
    group_size = query.size(2) // num_groups
    query_groups = query.reshape(*query.shape[:2], num_groups, group_size)

    # Perform attention within each group.
    attention_output = [
        scaled_dot_product_attention(group_query, key, value)
        for group_query in query_groups.unbind(dim=2)
    ]

    # Concatenate the outputs of each group.
    return torch.cat(attention_output, dim=-1)

## Sliding Window Attention

In [None]:
def Sliding_Window_Attention(query, key, value, window_size):
    """Scaled dot-product attention restricted to a local window.

    Position ``i`` attends only to positions ``j`` with
    ``|i - j| <= window_size // 2``; the window is clipped at the sequence
    boundaries. Scores are divided by ``sqrt(dim)`` before the softmax.

    Args:
        query: Tensor of shape ``(batch, seq_len, dim)``.
        key: Tensor of shape ``(batch, seq_len, dim)``.
        value: Tensor of shape ``(batch, seq_len, value_dim)``.
        window_size: Nominal window width (>= 1). ``window_size // 2``
            neighbours are visible on each side of a position.

    Returns:
        Tensor of shape ``(batch, seq_len, value_dim)``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.

    Note:
        The full ``(seq_len, seq_len)`` score matrix is materialised and then
        masked, so memory use grows quadratically with ``seq_len``.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    batch_size, seq_length, dim = query.size()
    half_window = window_size // 2

    # Band mask: True where the key position lies inside the query's window.
    positions = torch.arange(seq_length, device=query.device)
    offsets = positions.unsqueeze(0) - positions.unsqueeze(1)
    in_window = offsets.abs() <= half_window

    attention_scores = torch.bmm(query, key.transpose(1, 2)) / (dim ** 0.5)
    # Out-of-window scores become -inf so they receive zero probability. The
    # diagonal is always inside the window, so no row is fully masked.
    attention_scores = attention_scores.masked_fill(~in_window, float("-inf"))

    # Apply softmax to get attention probabilities
    attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1)

    # Compute weighted sum to get the attention output
    return torch.bmm(attention_probs, value)

# Task 3 || Training Loop Implementation

## Single-GPU training loop

In [None]:
import torch

# Prerequisites: the names below are expected to exist before this cell runs.
#   model         - e.g. model = Model()
#   dataset       - e.g. dataset = Dataset()
#   optimizer     - e.g. optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
#   loss_function - e.g. loss_function = torch.nn.CrossEntropyLoss()
#   dataloader    - e.g. dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# The model has to live on the same device as the batches it is fed.
model.to(device)

num_epochs = 10

for epoch in range(num_epochs):
    for batch in dataloader:
        # Every batch is an (inputs, targets) pair; move both to the device.
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass: predictions, then the loss against the targets.
        outputs = model(inputs)
        loss = loss_function(outputs, targets)

        # Clear old gradients, backpropagate, and update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Optional progress report (currently disabled):
# print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')