In [7]:
! pip3 install tiktoken




[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Print the installed PyTorch version string
import torch
print(torch.__version__)

2.6.0+cu126


In [5]:
# Check whether PyTorch reports CUDA as available
torch.cuda.is_available()

True

In [6]:
# Show the name of CUDA device 0
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 4070 Ti SUPER'

In [8]:
import importlib
import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [9]:
# LLM configuration: hyperparameters read by the model classes in this notebook
GPT_CONFIG_124M = {
    # Vocabulary size (number of rows in the token-embedding table)
    "vocab_size": 50_257,
    # Context length (number of rows in the position-embedding table / mask size)
    "context_length": 1_024,
    # Embedding dimension
    "emb_dim": 768,
    # Number of attention heads
    "n_heads": 12,
    # Number of Transformer layers
    "n_layers": 12,
    # Dropout rate
    "dropout_rate": 0.1,
    # Whether the query/key/value projections use a bias term
    "qkv_bias": False,
}

In [11]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of size ``dim_output // num_heads``, attended with a
    causal (upper-triangular) mask so that a token cannot attend to later
    tokens, and finally recombined through an output projection.

    Args:
        dim_input: Size of the last dimension of the input.
        dim_output: Size of the last dimension of the output; must be
            divisible by ``num_heads``.
        context_length: Side length of the pre-built causal mask.
        num_heads: Number of attention heads.
        dropout_rate: Dropout probability applied to the attention weights.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, dim_input, dim_output, context_length,  num_heads, dropout_rate, qkv_bias):
        super().__init__()
        assert ( dim_output % num_heads == 0), "dim_output must be divisible by num_heads"
        self.dim_output = dim_output
        self.num_heads = num_heads
        self.head_dim = dim_output // num_heads

        # Query / key / value projections
        self.weight_query = nn.Linear(dim_input, dim_output, bias=qkv_bias)
        self.weight_key = nn.Linear(dim_input, dim_output, bias=qkv_bias)
        self.weight_value = nn.Linear(dim_input, dim_output, bias=qkv_bias)

        self.out_projection = nn.Linear(dim_output, dim_output)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout_rate)
        # Ones strictly above the diagonal mark the "future" positions to hide.
        # Registered as a buffer so it moves with the module across devices.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape (batch_size, num_tokens, dim_input).

        Returns:
            Tensor of shape (batch_size, num_tokens, dim_output).
        """
        batch_size, num_tokens, _ = x.shape

        keys = self.weight_key(x)  # Shape (batch_size, num_tokens, dim_output)
        queries = self.weight_query(x)
        values = self.weight_value(x)

        # Implicitly split into heads by unrolling the last dim:
        # (b, num_tokens, dim_output) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(batch_size, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(batch_size, num_tokens, self.num_heads, self.head_dim)
        values = values.view(batch_size, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Dot product of queries and keys for each head
        attention_scores = queries @ keys.transpose(2, 3)

        # Truncate the mask to the actual number of tokens in this input
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Fill the upper triangle with -inf so softmax gives those positions
        # zero weight. `masked_fill` is out-of-place, so the result must be
        # assigned back; otherwise the mask is silently ignored.
        attention_scores = attention_scores.masked_fill(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax
        attention_weights = torch.softmax(attention_scores / keys.shape[-1] ** 0.5, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # Context vectors:
        # (batch_size, num_heads, num_tokens, head_dim) -> (batch_size, num_tokens, num_heads, head_dim)
        context_vector = (attention_weights @ values).transpose(1, 2)

        # Combine heads, where self.dim_output = self.num_heads * self.head_dim
        context_vector = context_vector.contiguous().view(batch_size, num_tokens, self.dim_output)
        context_vector = self.out_projection(context_vector)  # Output projection

        return context_vector
    


<div class="alert alert-block alert-info">

Step 1: Reduce the projection dim to match desired output dim

Step 2: Use a Linear layer to combine head outputs

Step 3: Tensor shape: (b, num_tokens, d_out)

Step 4: We implicitly split the matrix by adding a `num_heads` dimension, then unroll the last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)

Step 5: Transpose from shape (b, num_tokens, num_heads, head_dim) to (b, num_heads, num_tokens, head_dim)

Step 6: Compute dot product for each head

Step 7: Mask truncated to the number of tokens

Step 8: Use the mask to fill attention scores

Step 9: Tensor shape: (b, num_tokens, n_heads, head_dim)

Step 10: Combine heads, where self.dim_output = self.num_heads * self.head_dim

Step 11: Add an optional linear projection
</div>

In [12]:
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = 0.044715 * torch.pow(x, 3)
        tanh_term = torch.tanh(sqrt_two_over_pi * (x + cubic_term))
        return 0.5 * x * (1 + tanh_term)

In [13]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x ``emb_dim``, apply GELU, project back."""

    def __init__(self, config):
        super().__init__()
        emb_dim = config['emb_dim']
        hidden_dim = 4 * emb_dim  # hidden width is four times the embedding width

        expand = nn.Linear(emb_dim, hidden_dim)    # expand the dimension 4x
        activation = GELU()                        # non-linearity
        contract = nn.Linear(hidden_dim, emb_dim)  # contract back to the original dimension
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> contract stack."""
        return self.layers(x)

In [14]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        dim_input: Size of the last dimension of the tensors to normalize.
    """

    def __init__(self, dim_input):
        super().__init__()
        self.eps = 1e-5  # added to the variance to avoid division by zero
        self.scale = nn.Parameter(torch.ones(dim_input))
        self.shift = nn.Parameter(torch.zeros(dim_input))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply scale and shift."""
        feature_mean = x.mean(dim=-1, keepdim=True)
        # Population (biased) variance: divide by N rather than N - 1
        feature_var = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = (x - feature_mean) / torch.sqrt(feature_var + self.eps)
        return self.scale * normalized + self.shift

In [None]:
class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward sub-layers with shortcuts.

    Each sub-layer is applied as ``x + dropout(sublayer(layer_norm(x)))``,
    i.e. the normalization happens before the sub-layer.

    Args:
        config: Mapping providing "emb_dim", "context_length", "n_heads",
            "dropout_rate" and "qkv_bias".
    """

    def __init__(self, config):
        super().__init__()
        self.attention = MultiHeadAttention(
            dim_input = config["emb_dim"],
            dim_output = config["emb_dim"],
            context_length = config["context_length"],
            num_heads = config["n_heads"],
            dropout_rate = config["dropout_rate"],
            qkv_bias = config["qkv_bias"])
        # No trailing commas here: `self.x = Module(),` would store a 1-tuple,
        # which is neither callable nor registered as a submodule.
        self.feed_forward = FeedForward(config)
        self.layer_norm1 = LayerNorm(config["emb_dim"])
        self.layer_norm2 = LayerNorm(config["emb_dim"])
        self.drop_shortcut = nn.Dropout(config["dropout_rate"])

    def forward(self, x):
        """Apply the block to ``x``; the output has the same shape as the input."""
        # Shortcut connection for the attention block
        shortcut = x
        x = self.layer_norm1(x)
        x = self.attention(x)  # Shape [batch_size, num_tokens, emb_dim]
        x = self.drop_shortcut(x)
        x = x + shortcut   # Add the original input back (shortcut connection)

        # Shortcut connection for the feed forward block
        shortcut = x
        x = self.layer_norm2(x)
        x = self.feed_forward(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x



In [None]:
import torch
import torch.nn as nn 

class GPT_Skeleton(nn.Module):
    """GPT-style model: embeddings -> transformer blocks -> final norm -> logits.

    Args:
        config: Mapping providing "vocab_size", "context_length", "emb_dim",
            "dropout_rate" and "n_layers", plus the keys read by
            ``TransformerBlock``.
    """

    def __init__(self, config):
        super().__init__()
        self.token_embedding = nn.Embedding(config["vocab_size"], config["emb_dim"])
        self.position_embedding = nn.Embedding(config["context_length"], config["emb_dim"])
        self.dropout_embedding = nn.Dropout(config["dropout_rate"])

        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config["n_layers"])]
        )
        self.final_norm_layer = LayerNorm(config["emb_dim"])
        self.out_head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, input):
        """Compute next-token logits.

        Args:
            input: Integer token ids of shape (batch_size, num_tokens).

        Returns:
            Logits of shape (batch_size, num_tokens, vocab_size).
        """
        batch_size, num_tokens = input.shape
        token_embedding = self.token_embedding(input)  # Shape (batch_size, num_tokens, emb_dim)
        # Positions 0..num_tokens-1, created on the same device as the input.
        # (`torch.arange`; `torch.arrange` does not exist.)
        positions = torch.arange(num_tokens, device=input.device)
        positional_embedding = self.position_embedding(positions)  # Shape (num_tokens, emb_dim)
        x = token_embedding + positional_embedding  # broadcast over the batch dimension
        x = self.dropout_embedding(x)   # Dropout on the summed embeddings
        x = self.transformer_blocks(x)  # Stack of transformer blocks
        x = self.final_norm_layer(x)    # Final layer norm
        logits = self.out_head(x)

        return logits

In [None]:
# Build a small demo batch of token ids so this cell runs on a fresh kernel
# (`batch` is not defined by any earlier cell in this notebook).
tokenizer = tiktoken.get_encoding("gpt2")
batch = torch.stack(
    [
        torch.tensor(tokenizer.encode("Every effort moves you")),
        torch.tensor(tokenizer.encode("Every day holds a")),
    ],
    dim=0,
)  # torch.stack requires both prompts to encode to the same number of tokens

# Fix the seed so the randomly initialized weights (and dropout) are reproducible
torch.manual_seed(123)
model = GPT_Skeleton(GPT_CONFIG_124M)
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

In [None]:
def text_generation(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping token ids of shape (batch, n_tokens) to logits
            of shape (batch, n_tokens, vocab_size).
        idx: Token ids of shape (batch, n_tokens) used as the starting context.
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens passed to ``model``.

    Returns:
        Token ids of shape (batch, n_tokens + max_new_tokens).
    """
    for _ in range(max_new_tokens):
        # Crop the context if it exceeds the supported size, e.g. if the model
        # supports only 5 tokens and the sequence has 10, use only the last 5.
        idx_crop = idx[:, -context_size:]

        # Get the logits for the next token (no gradients needed for inference)
        with torch.no_grad():
            logits = model(idx_crop)

        # Focus only on the last time step:
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Apply softmax to get the probabilities
        probs = torch.softmax(logits, dim=-1)    # (batch, vocab_size)

        # Index of the most probable next token. keepdim=True keeps the result
        # 2-D, (batch, 1), so it can be concatenated along dim=1 below.
        idx_next = torch.argmax(probs, dim=-1, keepdim=True)

        # Append the new token to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens + 1)

    return idx