In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import tiktoken
from Layers import *

In [2]:
GPT_CONFIG_124M = {
    "vocab_size" : 50527,  # total no of unique tokens
    "context_length" : 1024,  # seq length or context length
    "emb_dim" : 768, # embedding dim, hidden dim, d_model
    "n_heads" : 12, # number of attention heads in MHA
    "n_layers" : 12, # number of transformer layers
    "drop_rate" : 0.1, # dropout rate
    "qkv_bias" : False # Query Key Value 
}

In [3]:
class GPTBlock(nn.Module):

    def __init__(self,cfg, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.token_embeddings = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_embeddings = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.emb_dropout = nn.Dropout(cfg["drop_rate"])
        self.transformer_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = nn.LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, X):

        '''
        X is a minibatch of size (batch_size, seq_len)
        '''
        batch_size, seq_len = X.shape
        token_embed = self.token_embeddings(X)
        pos_embed = self.pos_embeddings(torch.arange(seq_len, device=X.device))
        out = token_embed + pos_embed
        out = self.emb_dropout(out)
        out = self.transformer_blocks(out)
        out = self.final_norm(out)
        logits = self.out_head(out)

        return logits



_____________________

In [4]:
# Get the input data
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataloader = CreateGPTDatasetV1.create_dataloader_v1(txt=raw_text, batch_size=2, max_length=4)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs are :\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Inputs are :
 tensor([[  13,  632,  373,  465],
        [ 832,  616, 7363,    0]])

Inputs shape:
 torch.Size([2, 4])


In [None]:
## NOTE
'''
The context length is 1024 but we only setting sequence lengths to be 4. These can be set to max value of context length.
'''

In [5]:
model = GPTBlock(GPT_CONFIG_124M)
out = model(inputs)

In [6]:
out.shape

torch.Size([2, 4, 50527])

In [7]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,405,824


In [8]:
total_size_bytes = total_params * 4 #Assume each parameter is a 32bit floating point taking 4bytes
total_size_mb = total_size_bytes / (1024 * 1024) #B
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 623.34 MB
