In [31]:
# Hyper-parameters for the 124M-parameter GPT configuration.
GPT_CONFIG_124M = {
    # Vocabulary size
    "vocab_size": 50257,
    # Context length (number of positions that get a positional embedding)
    "context_length": 1024,
    # Embedding dimension
    "embed_dim": 768,
    # Number of attention heads
    "n_heads": 12,
    # Number of layers (transformer blocks)
    "n_layers": 12,
    # Dropout rate
    "drop_rate": 0.1,
    # Query-Key-Value bias
    "qkv_bias": False,
}

In [2]:
import torch

  cpu = _conversion_method_template(device=torch.device("cpu"))


## DummyGPT code

In [3]:
import torch.nn as nn

In [42]:
class DummyGPTModel(nn.Module):
    """Skeleton GPT model: real embeddings and output head around placeholder layers.

    The transformer blocks (``DummyTransformerBlock``) and the final
    normalisation (``DummyLayerNorm``) are placeholders that return their
    input unchanged, so only the embeddings, dropout and output projection
    affect the result.

    Args:
        cfg: Mapping providing ``vocab_size``, ``context_length``,
            ``embed_dim``, ``drop_rate`` and ``n_layers``.
    """

    def __init__(self, cfg):
        super().__init__()
        # Token lookup table: vocab_size x embed_dim
        # (50257 x 768 with GPT_CONFIG_124M).
        self.token_emb = nn.Embedding(cfg['vocab_size'], cfg['embed_dim'])
        # One learned vector per position: context_length x embed_dim
        # (1024 x 768 with GPT_CONFIG_124M).
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['embed_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )
        self.final_norm = DummyLayerNorm(cfg['embed_dim'])
        # Projects every position back to vocabulary-sized logits (no bias).
        self.out_head = nn.Linear(
            cfg['embed_dim'], cfg['vocab_size'], bias=False
        )

    def forward(self, in_idx):
        """Map token ids to per-position vocabulary logits.

        Args:
            in_idx: 2-D integer tensor of token ids, ``(batch, seq_len)``.
                ``seq_len`` must not exceed ``cfg['context_length']`` because
                positions ``0..seq_len-1`` are looked up in ``pos_emb``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.
        """
        # The unpacking also enforces a 2-D input; the batch size is not needed.
        _, seq_len = in_idx.shape
        tok_embeds = self.token_emb(in_idx)
        # Position ids are created on the same device as the input.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        # (seq_len, embed_dim) broadcasts across the batch dimension.
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
 
class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that passes its input straight through."""

    def __init__(self, cfg):
        # ``cfg`` is accepted but not read; the block holds no layers.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x
 
class DummyLayerNorm(nn.Module):
    """Placeholder layer norm that passes its input straight through."""

    def __init__(self, normalized_shape, eps=1e-5):
        # ``normalized_shape`` and ``eps`` are accepted but not used.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged (no normalisation is applied)."""
        return x

## Individual Components

### Tokenizer

In [6]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.7.24-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.26.0 (from tiktoken)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests>=2.26.0->tiktoken)
  Downloading charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests>=2.26.0->tiktoken)
  Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.26.0->tiktoken)
  Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests>=2.26.0->tiktoken)
  Downloading certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)
Downlo

In [7]:
import tiktoken

In [38]:
# GPT-2 encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Hello my name is Pranav."
txt2 = "Hello my name is Sam."

# One 1-D tensor of token ids per sentence; the two have different lengths.
batch = [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)]

In [39]:

# Right-pad every sequence with token id 0 up to the longest one so the list
# can be stacked into a single tensor. The pad width is computed instead of
# hard-coded, so the cell works for other sentences and re-running it does
# not pad a second time.
max_len = max(seq.shape[0] for seq in batch)
batch = [nn.functional.pad(seq, (0, max_len - seq.shape[0])) for seq in batch]
batch

[tensor([15496,   616,  1438,   318,  1736,   272,   615,    13]),
 tensor([15496,   616,  1438,   318,  3409,    13,     0,     0])]

In [40]:
# Combine the equal-length sequences into one (2, 8) tensor of token ids.
batch = torch.stack(batch, dim=0)

In [23]:
# Decode the four leading token ids that both sentences share.
tokenizer.decode([15496, 616, 1438, 318])

'Hello my name is'

In [26]:
# Decode the remaining token ids of the first sentence.
tokenizer.decode([1736, 272, 615, 13])

' Pranav.'

In [43]:
# Seed the global RNG before the model is built so the randomly initialised
# weights (and therefore the printed logits) are reproducible.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
# batch holds token ids with shape (2, 8); the model returns one vector of
# vocab_size logits per token position.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 8, 50257])
tensor([[[ 0.1863, -0.5487,  0.4876,  ..., -0.0398,  0.4186, -0.1547],
         [-0.9641,  0.3431, -1.6978,  ...,  0.6259, -0.3982,  1.6753],
         [-0.1668,  1.0426,  0.6369,  ...,  0.4858,  0.1500, -0.4176],
         ...,
         [ 0.5609, -0.1211,  0.3259,  ...,  1.7252,  0.1595, -0.4307],
         [ 0.8629, -0.4972,  0.3844,  ...,  0.1534,  0.5506, -0.0335],
         [ 0.2213,  1.7510,  0.2642,  ...,  0.1696, -0.3050,  0.6064]],

        [[ 0.3145, -0.4676,  0.1417,  ...,  0.7270,  0.6297, -0.6830],
         [-1.0624,  0.7202, -1.1463,  ...,  1.0762, -0.3444,  1.9972],
         [ 0.0883,  1.3915,  0.2580,  ...,  0.0255, -0.1977, -0.6394],
         ...,
         [-0.1838,  1.4263,  0.3642,  ...,  1.4198,  1.1293,  1.1510],
         [ 1.0107, -0.0079,  1.4220,  ...,  0.5760, -0.4488, -0.4930],
         [ 0.0253,  0.6240,  0.3732,  ...,  0.4539, -1.0875, -0.9268]]],
       grad_fn=<UnsafeViewBackward0>)


### Layer Normalization