### Layer normalization
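Done to ensure that the outputs of a layer have a mean of 0 and a variance of 1, which helps with training stability. In the GPT built below, each transformer block layer-normalizes the input to its attention sub-layer and again the input to its feed-forward sub-layer. The model also applies one final layer normalization before the output (token decoding) layer.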

But take all this with a pinch of salt. We want to use layer normalization (mean=0, variance=1) only if it really helps. What if it doesn't? That's the beauty of scale and shift below - these are trainable parameters that the model can update during training to adjust the layer norm output to best suit the training needs.

Kinda beautiful if you think about it. Fence sitting. But still beautiful.

In [12]:
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a trainable scale and shift.

    Each vector along the last dimension of the input is normalized to
    mean 0 and (biased) variance 1, then multiplied element-wise by
    ``scale`` and offset by ``shift``. Both start as the identity transform
    (ones / zeros) and are updated during training.

    Args:
        embedded_dim: Size of the last (embedding) dimension of the input.
        epsilon: Small constant added to the variance before the square
            root so that a zero-variance input does not divide by zero.
    """

    def __init__(self, embedded_dim, epsilon=1e-5):
        super().__init__()
        self.epsilon = epsilon
        self.scale = nn.Parameter(torch.ones(embedded_dim))
        self.shift = nn.Parameter(torch.zeros(embedded_dim))

    def forward(self, x):
        """Return ``x`` normalized over its last dimension, then scaled and shifted."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N - 1 (population variance).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm = (x - mean) / torch.sqrt(var + self.epsilon)
        return norm * self.scale + self.shift


# Sanity check: normalize a random 2 x 5 batch over its last dimension.
batch = torch.randn(2, 5)
l = LayerNorm(batch.shape[-1])
out = l(batch)
out

# Turn off scientific notation so the near-zero means print as 0.0000.
torch.set_printoptions(sci_mode=False)
# Per-row mean and biased variance of the output; the captured output shows 0 and 1.
out.mean(dim=-1, keepdim=True), out.var(dim=-1, keepdim=True, unbiased=False)

(tensor([[    0.0000],
         [    0.0000]], grad_fn=<MeanBackward1>),
 tensor([[1.0000],
         [1.0000]], grad_fn=<VarBackward0>))

### GELU activation function
Gaussian error linear unit

GELU(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))   (tanh approximation)

Preferred over the ReLU. No elbows. Smoooooth. 


In [13]:
class GELU(nn.Module):
    """Gaussian error linear unit, computed with the tanh approximation.

    GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))

    The module has no parameters or state of its own.
    """

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        root_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(root_two_over_pi * cubic_term)
        return 0.5 * x * gate

### Feed forward layer
Now this one required understanding the GPT model as a whole after trying to train and use it. But think of it as an expansion in the dimensional space to give the model a chance to tease out connections in a much higher dimensional space than its embedding space, and then of course a projection back to the embedding dimensions. The activation function here is GELU. The code is pretty straightforward but the concept is rich.

In [14]:
class FeedFoward(nn.Module):
    """Feed-forward sub-layer: expand to 4 * emb_dim, apply GELU, project back.

    A configuration dict is taken instead of loose positional arguments so
    that call sites stay readable; only ``cfg["emb_dim"]`` is read here.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        # Layers are built in the order they run, so the Sequential indices
        # are 0 = expansion, 1 = activation, 2 = projection.
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> project stack."""
        return self.layers(x)

##### Multihead Attention implemented previously

In [29]:
import torch
import torch.nn as nn

## So here, the d_out has to be a multiple of the number of heads.
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of ``d_out // num_heads`` dimensions each,
    attended per head under a causal mask, then re-joined and passed through
    an output projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the square causal mask that is
            pre-built; ``forward`` slices it down to the actual token count.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.

    Raises:
        ValueError: If ``d_out`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length, dropout,
                 num_heads, qkv_bias=False):
        super().__init__()
        # Fail fast: otherwise the head split in forward() fails later inside
        # .view() with an opaque shape error.
        if d_out % num_heads != 0:
            raise ValueError(
                f"d_out ({d_out}) must be divisible by num_heads ({num_heads})"
            )
        self.d_in = d_in
        self.d_out = d_out
        self.num_heads = num_heads
        # Number of dimensions handled by each head.
        self.head_dim = self.d_out // self.num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the "future" positions that a
        # token is not allowed to attend to.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1))

    def forward(self, x):
        """Return context vectors of shape (b, num_tokens, d_out).

        ``x`` is unpacked as (b, num_tokens, d_in).
        """
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Split the last dimension into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        # Rearrange to (b, num_heads, num_tokens, head_dim) so that each head
        # is processed as its own batch entry by the matmuls below.
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # From here on the hierarchy is batches -> heads -> tokens -> vectors.
        # Scores have shape (b, num_heads, num_tokens, num_tokens).
        attn_scores = queries @ keys.transpose(2, 3)

        # Slice the pre-built mask down to the current sequence length.
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # -inf scores become exactly 0 after the softmax, hiding future tokens.
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scaled dot product: divide by sqrt(head_dim) before the softmax.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        # Apply the dropout if there is one specified.
        attn_weights = self.dropout(attn_weights)

        # Calculate the context vectors: (b, num_heads, num_tokens, head_dim).
        context_vectors = attn_weights @ values

        # Go back to (b, num_tokens, num_heads, head_dim) ...
        context_vectors = context_vectors.transpose(1, 2)

        # ... and merge the heads into one vector per token:
        # (b, num_tokens, d_out). transpose() leaves the tensor
        # non-contiguous, so contiguous() is needed before view().
        context_vectors = context_vectors.contiguous().view(b, num_tokens, self.d_out)

        # Trainable linear projection that mixes the per-head results.
        context_vectors = self.out_proj(context_vectors)

        return context_vectors
        

        


### And now we actually have all the components we need to put together the transformer block

Each Transformer block:

-> Layer Norm -> Multihead attention -> Dropout -> Layer Norm -> Feed forward -> Dropout ->
|                 (shortcut)                    |                  (shortcut)             |
|-----------------------------------------------|-----------------------------------------|

### A couple of points to remember...

...because I know you will be confused O(months) from now when you look at this code again, Somindra.

So, why do we need two normalizers, when all they do is exactly the same ... normalization. Remember that each of them has trainable weight parameters, which makes them position specific in the pipeline. Scale and Shift and all that.

What's with the shortcut + x thing, if it's a shortcut? The shortcut is not an either/or path that gets "chosen": it is always applied, in both the forward pass and backprop. The input to each sub-layer is saved and added back to that sub-layer's output (x = x + shortcut), so the sub-layer only has to learn a change on top of its input. During backprop that addition gives the gradient a direct route around the sub-layer, which keeps it from fading away across a deep stack of blocks.... think physics again and not cs.

In [24]:
class TransformerBlock(nn.Module):
    """One transformer block: two pre-normalized sub-layers with shortcuts.

    Sub-layer 1: LayerNorm -> multi-head attention -> dropout, added to its input.
    Sub-layer 2: LayerNorm -> feed forward -> dropout, added to its input.

    Reads ``emb_dim``, ``context_length``, ``drop_rate``, ``n_heads`` and
    ``qkv_bias`` from ``cfg``. The same dropout module serves both sub-layers.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]

        self.layer_norm1 = LayerNorm(emb_dim)
        self.attn = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            dropout=drop_rate,
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.dropout = nn.Dropout(drop_rate)
        self.layer_norm2 = LayerNorm(emb_dim)
        self.feed_forward = FeedFoward(cfg)

    def forward(self, x):
        """Return the block output; the addition implies the same shape as ``x``."""
        # Attention sub-layer plus its shortcut.
        attended = self.attn(self.layer_norm1(x))
        x = self.dropout(attended) + x

        # Feed-forward sub-layer plus its shortcut.
        transformed = self.feed_forward(self.layer_norm2(x))
        return self.dropout(transformed) + x
        

In [22]:
# Shared hyperparameter dict; the model classes in this file read it through
# cfg["..."] lookups, so the key names must stay exactly as spelled here.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size (rows of the token embedding, width of the output logits)
    "context_length": 1024,  # Context length (rows of the positional embedding, side of the causal mask)
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads (768 / 12 = 64 dimensions per head)
    "n_layers": 12,          # Number of transformer blocks stacked in the model
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias
}

In [30]:
### Let's try it out

# Fix the RNG seed so the random weights and the random input are reproducible.
torch.manual_seed(123)

block = TransformerBlock(GPT_CONFIG_124M)

x = torch.randn(2, 4, 768) #batch size 2, each with 4 input tokens, 768 embedded dimensions

output = block(x)

# The block preserves the input shape: both lines print torch.Size([2, 4, 768]).
print(x.shape)
print(output.shape)

torch.Size([2, 4, 768])
torch.Size([2, 4, 768])


### And finally, we are in a position to code the GPT itself.

Tokenized Text is fed into:

Token embedding + Positional embedding -> Dropout -> Transformer Block 1 -> Transformer Block 2 -> ... -> Transformer Block 12 -> Final Layer Normalization -> Linear Output Layer

In [31]:
class GPTModule(nn.Module):
    """GPT model: embeddings -> dropout -> transformer blocks -> norm -> logits.

    Reads ``vocab_size``, ``emb_dim``, ``context_length``, ``drop_rate`` and
    ``n_layers`` from ``cfg``; the whole ``cfg`` is also handed to every
    ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.positional_embedding = nn.Embedding(cfg["context_length"], emb_dim)

        self.dropout = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)

        # Maps each token's final vector to one score per vocabulary entry.
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits for a 2-D tensor of token ids.

        ``in_idx`` is unpacked as (batch_size, seq_len); the result has one
        ``vocab_size``-wide logit vector per input position.
        """
        _, seq_len = in_idx.shape

        # Position ids 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)

        # The positional embeddings broadcast across the batch dimension.
        x = self.token_embedding(in_idx) + self.positional_embedding(positions)

        x = self.dropout(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)

        return self.out_head(x)

In [45]:
### Let's try this out
# Fix the RNG seed so the randomly initialized model weights are reproducible.
torch.manual_seed(123)

import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2))) #batch is a list
# torch.stack needs equally sized tensors; both prompts encode to 4 tokens here.
batch = torch.stack(batch, dim=0) #converts from a list of two tensors to a tensor of shape 2,4

gpt = GPTModule(GPT_CONFIG_124M)

out = gpt(batch)

out.shape #[2, 4, 50257] i.e. 2 sequences in the batch, 4 tokens each, one logit per vocabulary entry (vocab_size = 50257)

torch.Size([2, 4, 50257])

In [71]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` token ids.

    Args:
        model: Callable mapping a (batch, tokens) id tensor to logits of
            shape (batch, tokens, vocab).
        idx: Starting token ids, shape (batch, tokens).
        max_new_tokens: Number of ids to append to every row.
        context_size: Maximum number of trailing ids fed to the model at
            each step.

    Returns:
        ``idx`` with the generated ids concatenated along dimension 1.
    """
    tokens = idx
    for _ in range(max_new_tokens):
        # Feed only the most recent context_size ids when the sequence is longer.
        window = tokens[:, -context_size:]

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction at the last position decides the next token.
        last_probs = all_logits[:, -1, :].softmax(dim=-1)
        next_token = last_probs.argmax(dim=-1, keepdim=True)

        tokens = torch.cat([tokens, next_token], dim=1)

    return tokens

In [74]:
context = "Hello, I am"

encoded_context = tokenizer.encode(context)

#print(encoded_context) #just a list

encoded_context_tensor = torch.tensor(encoded_context)
#print(encoded_context_tensor) #just a tensor with the same info as the list

#we want this to look like (batch_size, tokens), so add a leading batch dimension of size 1
encoded_context_tensor = encoded_context_tensor.unsqueeze(0)
#print(encoded_context_tensor)
#print(encoded_context_tensor.shape)

# Switch to evaluation mode so the dropout layers are inactive during generation.
gpt.eval()
out = generate_text_simple(gpt, encoded_context_tensor, 6, context_size=GPT_CONFIG_124M["context_length"])

print(out)

# Drop the batch dimension and turn the ids back into text.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())

decoded_text

#'Hello, I am Featureiman Byeswickattribute argue'
# classic - gibberish is expected: no training happens in the visible cells, so the weights are still random


tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])


'Hello, I am Featureiman Byeswickattribute argue'