In [25]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [26]:
import torch
import torch.nn as nn
class DummyGPTModel(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.tok_emb= nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
    self.pos_emb= nn.Embedding(cfg["context_length"],cfg["emb_dim"])
    self.drop_emb= nn.Dropout(cfg["drop_rate"])

    self.trf_blocks= nn.Sequential(
        *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    )
    self.final_norm = DummyLayerNorm(cfg["emb_dim"])
    self.out_head= nn.Linear(cfg["emb_dim"],cfg["vocab_size"],bias=False)

  def forward(self,in_idx):
    batch_size,seq_len= in_idx.shape
    tok_embeds= self.tok_emb(in_idx)
    print(tok_embeds.shape)
    pos_embeds= self.pos_emb(torch.arange(seq_len,device=in_idx.device))
    print(pos_embeds.shape)
    x= tok_embeds+pos_embeds
    x= self.drop_emb(x)
    x=self.trf_blocks(x)
    x=self.final_norm(x)
    logits=self.out_head(x)
    return logits

class DummyTransformerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()

  def forward(self,x):
    return x

class DummyLayerNorm(nn.Module):
  def __init__(self,normalized_shape,eps=1e-5):
    super().__init__()

  def forward(self,x):
    return x

In [27]:
!pip install tiktoken



In [28]:
import tiktoken
tokenizer= tiktoken.get_encoding("gpt2")
batch=[]
txt1="Every effort moves you"
txt2="Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch,dim=0)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [29]:
model = DummyGPTModel(GPT_CONFIG_124M)
logits= model(batch)
print(logits)

torch.Size([2, 4, 768])
torch.Size([4, 768])
tensor([[[-0.2535, -0.3889,  1.5002,  ...,  0.0708, -0.4895, -0.8134],
         [ 0.8437,  1.1698,  0.3753,  ..., -0.3563,  0.0862,  1.0870],
         [ 0.3511, -0.1863,  1.6779,  ...,  0.9093,  0.1901, -0.3551],
         [ 0.2928,  1.6927,  0.3337,  ..., -1.5287,  1.0519,  0.0889]],

        [[-0.2481, -0.6733,  1.2488,  ...,  0.1388, -0.6451, -0.1518],
         [-0.6809, -0.0164, -0.1427,  ..., -0.6387, -0.4914,  0.1511],
         [ 0.8166,  1.2937,  0.6436,  ..., -0.8961,  1.9247,  0.3397],
         [-0.2927,  0.0989, -1.0180,  ..., -1.1201,  0.9358, -0.2313]]],
       grad_fn=<UnsafeViewBackward0>)


In [30]:
class LayerNorm(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.eps=1e-5
    self.scale= nn.Parameter(torch.ones(emb_dim))
    self.shift= nn.Parameter(torch.zeros(emb_dim))

  def forward(self,x):
    mean= x.mean(dim=-1,keepdim=True)
    var= x.var(dim=-1,keepdim=True,unbiased=False)
    norm_x= (x-mean)/torch.sqrt(var+self.eps)
    return self.scale * norm_x + self.shift

In [31]:
batch_example= torch.randn(2,5)
layer= nn.Sequential(nn.Linear(5,6),nn.ReLU())
out= layer(batch_example)
print(out)
print(batch_example)

tensor([[0.2756, 0.0000, 0.0000, 0.0781, 0.0000, 0.6024],
        [0.5110, 0.0000, 0.0000, 0.0000, 0.8750, 0.0000]],
       grad_fn=<ReluBackward0>)
tensor([[-1.1605,  0.4938, -0.1759,  0.2549, -0.9464],
        [-0.4554, -0.8502, -0.5240, -2.0425, -0.6820]])


In [32]:
ln = LayerNorm(emb_dim=5)
out_ln= ln(batch_example)
mean= out_ln.mean(dim=-1,keepdim=True)
var= out_ln.var(dim=-1,keepdim=True,unbiased=False)
print(mean)
print(var)

tensor([[-1.1921e-08],
        [ 0.0000e+00]], grad_fn=<MeanBackward1>)
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [33]:
class GELU(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self,x):
    return 0.5*x*(1+torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)))*(x+0.044715*torch.pow(x,3)))

In [34]:
class FeedForward(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.layers= nn.Sequential(
        nn.Linear(cfg["emb_dim"],4*cfg["emb_dim"]),
        GELU(),
        nn.Linear(4*cfg["emb_dim"],cfg["emb_dim"])
    )

  def forward(self,x):
    return self.layers(x)

In [35]:
ffn=FeedForward(GPT_CONFIG_124M)
x= torch.rand(3,2,768)
print(x)

tensor([[[0.0563, 0.7045, 0.8232,  ..., 0.6787, 0.5999, 0.9497],
         [0.8256, 0.7222, 0.5211,  ..., 0.8207, 0.0151, 0.7893]],

        [[0.0714, 0.0829, 0.0808,  ..., 0.7235, 0.4363, 0.7851],
         [0.8723, 0.3467, 0.2456,  ..., 0.2798, 0.8494, 0.2363]],

        [[0.7729, 0.4657, 0.6138,  ..., 0.8990, 0.6104, 0.7986],
         [0.2826, 0.1841, 0.5102,  ..., 0.1858, 0.7505, 0.5958]]])


In [36]:
out= ffn(x)
print(out)
print(out.shape)

tensor([[[-0.1300,  0.0582, -0.0312,  ..., -0.1637, -0.0172, -0.2211],
         [ 0.0030, -0.0003, -0.0150,  ..., -0.1140, -0.0395, -0.1370]],

        [[-0.0498, -0.0039,  0.0254,  ..., -0.0521, -0.0844, -0.2441],
         [-0.0862, -0.0684, -0.0536,  ..., -0.1231, -0.0444, -0.1629]],

        [[-0.0774,  0.0292, -0.1643,  ..., -0.0753, -0.1293, -0.2449],
         [-0.0506,  0.1334, -0.0506,  ...,  0.0008, -0.0290, -0.1960]]],
       grad_fn=<ViewBackward0>)
torch.Size([3, 2, 768])


In [37]:
#shortcut connections
class ExampleDeepNeuralNetwork(nn.Module):
  def __init__(self,layer_sizes,use_shortcut):
    super().__init__()
    self.use_shortcut=use_shortcut
    self.layers= nn.ModuleList([
        nn.Sequential(nn.Linear(layer_sizes[0],layer_sizes[1]),GELU()),
        nn.Sequential(nn.Linear(layer_sizes[1],layer_sizes[2]),GELU()),
        nn.Sequential(nn.Linear(layer_sizes[2],layer_sizes[3]),GELU()),
        nn.Sequential(nn.Linear(layer_sizes[3],layer_sizes[4]),GELU()),
        nn.Sequential(nn.Linear(layer_sizes[4],layer_sizes[5]),GELU()),
    ])

  def forward(self,x):
    for layer in self.layers:
      layer_output= layer(x)
      if self.use_shortcut and x.shape==layer_output.shape:
        x=x+layer_output
      else:
        x= layer_output

    return x


In [38]:
layer_sizes=[3,3,3,3,3,1]
sample_input= torch.tensor([[1.,0.,-1.]])
model_without_shortcut= ExampleDeepNeuralNetwork(
    layer_sizes,use_shortcut=False
)

In [39]:
def print_gradients(model,x):
  output= model(x)
  target= torch.tensor([[0.]])
  loss=nn.MSELoss()
  loss=loss(output,target)
  loss.backward()
  for name,param in model.named_parameters():
    if 'weight' in name:
      print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [40]:
print(print_gradients(model_without_shortcut,sample_input))

layers.0.0.weight has gradient mean of 4.243206058163196e-05
layers.1.0.weight has gradient mean of 3.883019962813705e-05
layers.2.0.weight has gradient mean of 0.00016868537932168692
layers.3.0.weight has gradient mean of 0.0001764867192832753
layers.4.0.weight has gradient mean of 0.0031230065505951643
None


In [41]:
model_without_shortcut= ExampleDeepNeuralNetwork(
    layer_sizes,use_shortcut=True
)
print(print_gradients(model_without_shortcut,sample_input))

layers.0.0.weight has gradient mean of 0.0030419996473938227
layers.1.0.weight has gradient mean of 0.005177794024348259
layers.2.0.weight has gradient mean of 0.0069147138856351376
layers.3.0.weight has gradient mean of 0.008061745204031467
layers.4.0.weight has gradient mean of 0.07722031325101852
None


In [42]:
print(GPT_CONFIG_124M)

{'vocab_size': 50257, 'context_length': 1024, 'emb_dim': 768, 'n_heads': 12, 'n_layers': 12, 'drop_rate': 0.1, 'qkv_bias': False}


In [43]:
class MultiHeadAttention(nn.Module):
  def __init__(self,d_in,d_out,context_length,dropout,num_heads,qkv_bias=False):
    super().__init__()
    assert(d_out%num_heads==0),\
    "d_out must be divisible by num_heads"
    self.d_out= d_out
    self.num_heads= num_heads
    self.head_dim= d_out//num_heads
    self.W_query= nn.Linear(d_in,d_out,bias=qkv_bias)
    self.W_key= nn.Linear(d_in,d_out,bias=qkv_bias)
    self.W_value= nn.Linear(d_in,d_out,bias=qkv_bias)
    self.out_proj=nn.Linear(d_out,d_out) #linear layer to combine head outputs
    self.dropout= nn.Dropout(dropout)
    self.register_buffer("mask",torch.triu(torch.ones(context_length,context_length),diagonal=1))

  def forward(self,x):
    b,num_tokens,d_in= x.shape
    keys= self.W_key(x) # (b,num_tokens,d_out)
    queries= self.W_query(x)
    values= self.W_value(x)
    # we implicitly split matrix by adding num_heads dim
    keys= keys.view(b,num_tokens,self.num_heads,self.head_dim)
    queries= queries.view(b,num_tokens,self.num_heads,self.head_dim)
    values= values.view(b,num_tokens,self.num_heads,self.head_dim)
    #transpose (b,num_tokens,num_heads,head_dim)->(b,num_heads,num_tokens,head_dim)
    #This dimension helps in parallel computation
    keys= keys.transpose(1,2)
    queries=queries.transpose(1,2)
    values= values.transpose(1,2)
    attn_scores= queries @ keys.transpose(2,3) #(b,num_heads,num_tokens,num_tokens)
    mask_bool= self.mask.bool()[:num_tokens,:num_tokens]
    attn_scores.masked_fill_(mask_bool,-torch.inf)
    attn_weights= torch.softmax(attn_scores/keys.shape[-1]**0.5,dim=-1)
    attn_weights= self.dropout(attn_weights)
    #context vec before tanspose (b,num_heads,num_token,head_dim)
    #after transpose (b,num_tokens,num_heads,head_dim)
    #merging the heads back
    context_vec= (attn_weights @ values).transpose(1,2)
    context_vec= context_vec.contiguous().view(b,num_tokens,self.d_out)
    context_vec= self.out_proj(context_vec)
    return context_vec


In [44]:
class TransformerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.att= MultiHeadAttention(
        d_in= cfg["emb_dim"],
        d_out= cfg["emb_dim"],
        context_length= cfg["context_length"],
        num_heads= cfg["n_heads"],
        dropout= cfg["drop_rate"],
        qkv_bias= cfg["qkv_bias"]
    )
    self.ff= FeedForward(cfg)
    self.norm1= LayerNorm(cfg["emb_dim"])
    self.norm2= LayerNorm(cfg["emb_dim"])
    self.drop_shortcut= nn.Dropout(cfg["drop_rate"])

  def forward(self,x):
    shortcut=x
    x= self.norm1(x)
    x= self.att(x)
    x= self.drop_shortcut(x)
    x=shortcut+x

    shortcut=x
    x= self.norm2(x)
    x= self.ff(x)
    x= self.drop_shortcut(x)
    x=x+shortcut

    return x

In [45]:
x= torch.rand(2,4,768)
block = TransformerBlock(GPT_CONFIG_124M)
output= block(x)
print("Input shape:",x.shape)
print("Output shape:",output.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [46]:
class GPTModel(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.tok_emb= nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
    self.pos_emb= nn.Embedding(cfg["context_length"],cfg["emb_dim"])
    self.drop_emb= nn.Dropout(cfg["drop_rate"])
    self.trf_blocks=nn.Sequential(
        *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    )
    self.final_norm= LayerNorm(cfg["emb_dim"])
    self.out_head= nn.Linear(cfg["emb_dim"],cfg["vocab_size"],bias=False)

  def forward(self,in_idx):
    batch_size,seq_len= in_idx.shape
    tok_embeds= self.tok_emb(in_idx)
    pos_embeds= self.pos_emb(torch.arange(seq_len,device=in_idx.device))
    x= tok_embeds+pos_embeds
    x=self.drop_emb(x)
    x= self.trf_blocks(x)
    x= self.final_norm(x)
    logits= self.out_head(x)
    return logits

In [47]:
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [48]:
model= GPTModel(GPT_CONFIG_124M)
out= model(batch)
print("input shape:",batch.shape)
print("output shape:",out.shape)

input shape: torch.Size([2, 4])
output shape: torch.Size([2, 4, 50257])


In [49]:
total_params= sum(p.numel() for p in model.parameters())
print("Total parameters:",total_params)

Total parameters: 163009536


In [50]:
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])


In [51]:
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

Number of trainable parameters considering weight tying: 124,412,160


In [52]:
total_size_bytes = total_params * 4 #A
total_size_mb = total_size_bytes / (1024 * 1024) #B
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB
