## Task 1: Model Implementation and Checkpoints


### 1. Importing Libraries and Preliminaries

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Define constants (module-level hyperparameters for the cells below)
n_layers = 4  # presumably the number of decoder blocks to stack -- confirm at the call site
embed_dim = 768  # width of each token/position embedding vector
hidden_dim = embed_dim  # feed-forward width; deliberately tied to embed_dim here
vocab_size = 50257  # number of rows in the token embedding table
pad_token_id = 0  # NOTE(review): not referenced anywhere in the visible code
max_seq_len = 1024  # number of rows in the position embedding table

# Initialize embedding matrices
# These are module-level nn.Embedding instances; constructing them draws from
# the global torch RNG, so their order affects seeded reproducibility.
token_embedding = nn.Embedding(vocab_size, embed_dim)
position_embedding = nn.Embedding(max_seq_len, embed_dim)

# Positional encoding function
def positional_encoding(position, d_model):
    """Build a fixed sinusoidal positional-encoding table.

    Entry ``(pos, 2i)`` is ``sin(pos / 10000 ** (2i / d_model))`` and entry
    ``(pos, 2i + 1)`` is ``cos(pos / 10000 ** (2i / d_model))``.

    Args:
        position: Number of positions to encode (rows ``0 .. position - 1``).
        d_model: Size of each encoding vector. Odd sizes are supported; the
            last column is then a sine column.

    Returns:
        A ``float32`` tensor of shape ``(1, position, d_model)``; the leading
        axis of size 1 lets the table broadcast over a batch dimension.
    """
    # Work in float64 and cast once at the end so large positions do not
    # accumulate float32 rounding error inside sin/cos.
    positions = torch.arange(position, dtype=torch.float64).unsqueeze(1)
    even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)
    # One angle per (position, sin/cos pair): shape (position, ceil(d_model / 2)).
    angles = positions / torch.pow(10000.0, even_dims / d_model)

    position_enc = torch.zeros(position, d_model, dtype=torch.float64)
    position_enc[:, 0::2] = torch.sin(angles)  # dim 2i
    # For odd d_model there is one fewer cosine column than sine columns.
    position_enc[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # dim 2i+1
    return position_enc.float().unsqueeze(0)


### 2. Transformer Decoder Block

In [None]:
class TransformerDecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention followed by a feed-forward net.

    Each sub-layer is wrapped as ``x + sublayer(norm(x))`` so the block's
    input is carried forward by the residual connections.

    Args:
        embed_dim: Size of the input and output feature vectors.
        hidden_dim: Width of the feed-forward hidden layer.
        nhead: Number of attention heads; must divide ``embed_dim``.
        dropout: Dropout probability used inside attention and after each
            sub-layer.
    """

    def __init__(self, embed_dim, hidden_dim, nhead, dropout=0.1):
        super().__init__()
        self.nhead = nhead
        # batch_first=True: inputs are (batch, seq, embed_dim), which is the
        # layout produced by an nn.Embedding lookup on (batch, seq) token ids.
        self.self_attn = nn.MultiheadAttention(
            embed_dim, nhead, dropout=dropout, batch_first=True
        )
        self.linear1 = nn.Linear(embed_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, embed_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq, embed_dim)``.
            mask: Attention mask of shape ``(seq, seq)``, or ``None`` for no
                masking. A boolean mask uses ``True`` for "query row may
                attend to key column" (e.g. a ``torch.tril`` causal mask).
                Non-boolean masks are forwarded unchanged and are treated by
                ``nn.MultiheadAttention`` as additive scores.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attn_mask = mask
        if mask is not None and mask.dtype == torch.bool:
            # nn.MultiheadAttention uses the opposite polarity: True = blocked.
            attn_mask = ~mask

        # Self-attention sub-layer with residual connection.
        residual = x
        attn_in = self.norm1(x)
        attn_out, _ = self.self_attn(
            attn_in, attn_in, attn_in, attn_mask=attn_mask, need_weights=False
        )
        x = residual + self.dropout1(attn_out)

        # Feed-forward sub-layer with residual connection.
        residual = x
        ff = F.relu(self.linear1(self.norm2(x)))
        ff = self.linear2(self.dropout2(ff))
        return residual + ff


### 3. GPT2 Model:

In [None]:
class GPT2Model(nn.Module):
    """Decoder-only language model: embeddings, stacked decoder blocks, logits head.

    Args:
        vocab_size: Number of token ids; sets the token-embedding rows and the
            width of the output logits.
        embed_dim: Size of token and position embedding vectors.
        hidden_dim: Feed-forward width passed to every decoder block.
        n_layers: Number of ``TransformerDecoderBlock`` instances to stack.
        nhead: Number of attention heads passed to every decoder block.
        dropout: Dropout probability for the embeddings, the blocks and the
            final output.
        max_seq_len: Number of learnable positions, i.e. the longest sequence
            the model can embed. Defaults to 1024.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, n_layers, nhead, dropout=0.1,
                 max_seq_len=1024):
        super().__init__()
        # Each model owns its embedding tables, sized from the constructor
        # arguments, so instances neither share weights nor depend on globals.
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.decoder_blocks = nn.ModuleList(
            [TransformerDecoderBlock(embed_dim, hidden_dim, nhead, dropout) for _ in range(n_layers)]
        )
        self.to_logits = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, sequence_length):
        """Compute next-token logits.

        Args:
            input_ids: Integer tensor of token ids, shape ``(batch, seq)``.
            sequence_length: Number of positions to embed; expected to equal
                the last dimension of ``input_ids`` so the position embeddings
                broadcast over the batch.

        Returns:
            Tensor of shape ``(batch, seq, vocab_size)``.
        """
        # Prepare embeddings
        positions = torch.arange(sequence_length, device=input_ids.device)
        token_embeddings = self.token_embedding(input_ids)
        position_encodings = self.position_embedding(positions)
        embeddings = self.dropout(token_embeddings + position_encodings)

        # Causal mask shared by all layers: lower-triangular boolean matrix.
        mask = torch.tril(
            torch.ones((sequence_length, sequence_length), dtype=torch.bool, device=input_ids.device)
        )

        # Stack decoder blocks
        for block in self.decoder_blocks:
            embeddings = block(embeddings, mask)

        # Output logits
        final_output = self.dropout(embeddings)
        return self.to_logits(final_output)



In [None]:
# This code performs the following tasks:
'''
# 1. Looping through decoder blocks:
- Within the loop, we call the `forward` method of each `TransformerDecoderBlock` instance, passing the current embeddings and layer-wise mask.
- This allows each block to perform self-attention, feed-forward network, and residual connections with layer normalization based on the current state.

# 2. Implementing layer-wise masking:
- Before entering the loop, we create a lower-triangular boolean mask using `torch.tril`, shared by all layers. It is meant to let each position attend only to itself and earlier positions in the sequence, preventing information leakage from the future.

# 3. Outputting logits:
- After processing through all decoder blocks, we apply dropout to the final output and project it to vocabulary prediction logits using the `to_logits` linear layer.

Remember to replace `self.token_embedding` and `self.position_embedding` with your actual implementations based on the provided snippets. This code provides a comprehensive `forward` method for your GPT2 model, incorporating the desired functionalities and adhering to best practices for syntax and structure.

I hope this helps! Feel free to ask if you have any further questions or need further clarification on any specific parts of the code.

'''

'\n# 1. Looping through decoder blocks:\n- Within the loop, we call the `forward` method of each `TransformerDecoderBlock` instance, passing the current embeddings and layer-wise mask.\n- This allows each block to perform self-attention, feed-forward network, and residual connections with layer normalization based on the current state.\n\n# 2. Implementing layer-wise masking:\n- Before entering the loop, we create a triangular mask tensor using `torch.tril`. This ensures that each layer can only attend to past positions in the sequence, preventing information leakage from the future.\n\n# 3. Outputting logits:\n- After processing through all decoder blocks, we apply dropout to the final output and project it to vocabulary prediction logits using the `to_logits` linear layer.\n\nRemember to replace `self.token_embedding` and `self.position_embedding` with your actual implementations based on the provided snippets. This code provides a comprehensive `forward` method for your GPT2 model, 