# Importing Libraries

In [2]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


from dataclasses import dataclass

# Beginning the Construction of GPT

In [5]:
# Lower-triangular matrix of ones: row i keeps columns 0..i and zeroes the rest.
torch.ones(3, 3).tril()

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [None]:
@dataclass
class GPTConfig:
	"""Hyperparameters consumed by the ``GPT`` module in this notebook."""

	# NOTE(review): presumably the number of stacked transformer blocks — confirm against the model code
	no_of_layers: int = 12
	# width of each vector in the token (`wte`) and position (`wpe`) embedding tables
	embedding_dim: int = 768
	# number of rows in the token embedding table
	vocab_size: int = 50_257
	# number of rows in the position embedding table
	block_size: int = 1_000


class GPT(nn.Module):
	"""Skeleton of a GPT-style model: embedding tables plus a stack of blocks.

	Args:
		config: object exposing ``vocab_size``, ``block_size``, ``embedding_dim``
			and ``no_of_layers`` (e.g. a ``GPTConfig`` instance).

	Attributes:
		config: the configuration object passed to the constructor.
		transformer: ``nn.ModuleDict`` holding
			``wte`` (token embedding), ``wpe`` (position embedding) and
			``h`` (``nn.ModuleList`` with ``config.no_of_layers`` blocks).
	"""

	def __init__(self, config):
		super().__init__()
		self.config = config

		# NOTE(review): `Block` is not defined in the visible part of the notebook;
		# it must be defined before this class is instantiated.
		self.transformer = nn.ModuleDict({
			# token embedding table: one row per vocabulary entry
			'wte' : nn.Embedding(config.vocab_size, config.embedding_dim),
			# position embedding table: one row per position index below block_size
			'wpe' : nn.Embedding(config.block_size, config.embedding_dim),
			# build one independent Block per configured layer; wrapping a single
			# Block in a list would ignore `no_of_layers` and yield a 1-layer model
			'h' : nn.ModuleList([Block(config) for _ in range(config.no_of_layers)]),
		})

In [6]:
import torch

# Toy dimensions for the masking walkthrough.
B, n_head = 1, 1  # batch size, number of heads
T, d_k = 6, 4     # (max) sequence length, key/query dim (small for demo)

# Random queries and keys (Q is drawn first, then K); only the shapes matter here.
Q, K = (torch.rand(B, n_head, T, d_k) for _ in range(2))

# Scaled dot-product scores: Q·Kᵀ / sqrt(d_k)
attn_scores = (Q @ K.transpose(-2, -1)) / d_k ** 0.5
print("=== Raw Attention Scores ===", attn_scores[0, 0], sep="\n")

# Causal mask, shape [1, 1, T, T]: position i may only look at positions <= i.
causal_mask = torch.ones(T, T).tril()[None, None]
attn_scores_causal = attn_scores.masked_fill(causal_mask.logical_not(), float('-inf'))
print("\n=== After Applying Causal Mask ===", attn_scores_causal[0, 0], sep="\n")

# Padding mask, shape [B, 1, 1, T]: simulate a sequence whose last token is <pad>.
# A 0 entry hides that key column from every query row via broadcasting.
padding_mask = torch.tensor([[1, 1, 1, 1, 1, 0]], dtype=torch.float32).reshape(B, 1, 1, T)
attn_scores_causal_padding = attn_scores_causal.masked_fill(padding_mask.logical_not(), float('-inf'))
print("\n=== After Applying Padding Mask ===", attn_scores_causal_padding[0, 0], sep="\n")


=== Raw Attention Scores ===
tensor([[0.4928, 0.5976, 0.5355, 0.5677, 0.3879, 0.3832],
        [0.6862, 0.5043, 0.5703, 0.6302, 0.3045, 0.3339],
        [0.7786, 0.5263, 0.5879, 0.6733, 0.3143, 0.3676],
        [0.7321, 0.6448, 0.6475, 0.7295, 0.4387, 0.4442],
        [0.6297, 0.6150, 0.4805, 0.6117, 0.4627, 0.4856],
        [0.4735, 0.2725, 0.3169, 0.3923, 0.1927, 0.2157]])

=== After Applying Causal Mask ===
tensor([[0.4928,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.6862, 0.5043,   -inf,   -inf,   -inf,   -inf],
        [0.7786, 0.5263, 0.5879,   -inf,   -inf,   -inf],
        [0.7321, 0.6448, 0.6475, 0.7295,   -inf,   -inf],
        [0.6297, 0.6150, 0.4805, 0.6117, 0.4627,   -inf],
        [0.4735, 0.2725, 0.3169, 0.3923, 0.1927, 0.2157]])

=== After Applying Padding Mask ===
tensor([[0.4928,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.6862, 0.5043,   -inf,   -inf,   -inf,   -inf],
        [0.7786, 0.5263, 0.5879,   -inf,   -inf,   -inf],
        [0.7321, 0.6448, 0