In [1]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, but what I'm really doing is making a human-readable document. There are other languages, but those are"},
 {'generated_text': "Hello, I'm a language model, not a syntax model. That's why I like it. I've done a lot of programming projects.\n"},
 {'generated_text': "Hello, I'm a language model, and I'll do it in no time!\n\nOne of the things we learned from talking to my friend"},
 {'generated_text': "Hello, I'm a language model, not a command line tool.\n\nIf my code is simple enough:\n\nif (use (string"},
 {'generated_text': "Hello, I'm a language model, I've been using Language in all my work. Just a small example, let's see a simplified example."}]

In [2]:
generator("The stars and galaxies are", max_length=20, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The stars and galaxies are visible because our light was emitted so fast that we only see them when we'}]

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2") # 124M parameters

for k, v in model.state_dict().items():
    print(k, v.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

# **Token Embeddings**

Token embeddings wte.weights torch.Size([50257, 768]) are basically a lookup table that contains the embeddings of each token in the vocabulary. The vocabulary size is 50257, and the embedding size is 768.



In [4]:
model.state_dict()['transformer.wte.weight'].shape

torch.Size([50257, 768])

In [5]:
from transformers import GPT2Tokenizer
import json
from collections import OrderedDict

def load_and_analyze_vocab():
    # Load the GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    
    # Get the vocabulary as a dictionary
    vocab = tokenizer.get_vocab()
    
    # Sort vocabulary by token index
    sorted_vocab = OrderedDict(sorted(vocab.items(), key=lambda x: x[1]))
    
    # Create a more detailed vocabulary analysis
    vocab_analysis = {
        'token_to_id': sorted_vocab,
        'id_to_token': {v: k for k, v in sorted_vocab.items()},
        'special_tokens': {
            'pad_token': tokenizer.pad_token,
            'eos_token': tokenizer.eos_token,
            'unk_token': tokenizer.unk_token,
            'bos_token': tokenizer.bos_token
        },
        'vocab_size': len(vocab),
        'sample_encodings': {
            'Hello': tokenizer.encode('Hello'),
            'World': tokenizer.encode('World'),
            'Hello World': tokenizer.encode('Hello World')
        }
    }
    
    return vocab_analysis

# Load and analyze vocabulary
vocab_info = load_and_analyze_vocab()

# Save vocabulary to JSON file
with open('gpt2_vocabulary.json', 'w', encoding='utf-8') as f:
    json.dump(vocab_info, f, ensure_ascii=False, indent=2)

# Print some basic statistics and examples
print(f"Vocabulary size: {vocab_info['vocab_size']}")
print("\nFirst 10 tokens:")
for i, (token, idx) in enumerate(list(vocab_info['token_to_id'].items())[:10]):
    print(f"{idx}: {repr(token)}")

print("\nSpecial tokens:")
for token_type, token in vocab_info['special_tokens'].items():
    print(f"{token_type}: {repr(token)}")

print("\nSample encodings:")
for text, encoding in vocab_info['sample_encodings'].items():
    print(f"{text}: {encoding}")

Vocabulary size: 50257

First 10 tokens:
0: '!'
1: '"'
2: '#'
3: '$'
4: '%'
5: '&'
6: "'"
7: '('
8: ')'
9: '*'

Special tokens:
pad_token: None
eos_token: '<|endoftext|>'
unk_token: '<|endoftext|>'
bos_token: '<|endoftext|>'

Sample encodings:
Hello: [15496]
World: [10603]
Hello World: [15496, 2159]


# **Tokenize using byte-pair encoding**

In [5]:
import json
import regex as re
from typing import List

class SimpleGPT2Tokenizer:
    def __init__(self):
        # Load GPT-2 encoder, decoder and byte_encoder
        with open('utils/vocab/gpt2_vocabulary.json', 'r', encoding='utf-8') as f:
            self.encoder = json.load(f)['token_to_id']
        self.decoder = {v:k for k,v in self.encoder.items()}
        
        # Initialize byte-level encodings
        self._initialize_byte_encodings()
        
        # GPT-2 regex pattern
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
    
    def _initialize_byte_encodings(self):
        bytes_to_unicode_list = list(range(ord('!'), ord('~')+1)) + \
                              list(range(ord('¡'), ord('¬')+1)) + \
                              list(range(ord('®'), ord('ÿ')+1))
        cs = bytes_to_unicode_list.copy()
        n = 0
        
        # Add missing bytes
        for b in range(256):
            if b not in bytes_to_unicode_list:
                bytes_to_unicode_list.append(b)
                cs.append(256 + n)
                n += 1
                
        cs = [chr(n) for n in cs]
        self.byte_encoder = dict(zip(bytes_to_unicode_list, cs))
        self.byte_decoder = {v:k for k,v in self.byte_encoder.items()}

    def byte_encode(self, text: str) -> str:
        """Convert text to bytes then to unicode representation"""
        return ''.join([self.byte_encoder[b] for b in text.encode('utf-8')])

    def byte_decode(self, text: str) -> str:
        """Convert unicode representation back to text"""
        return bytes(self.byte_decoder[c] for c in text).decode('utf-8', errors='replace')

    def encode(self, text: str) -> List[int]:
        """Convert text to tokens using greedy longest match algorithm"""
        bpe_tokens = []
        byte_encoded = self.byte_encode(text)
        i = 0
        
        # Longest match algorithm - sliding window approach from left to right
        while i < len(byte_encoded):
            longest_match = None
            longest_length = 0
            
            # Try to find the longest matching token starting at position i
            for j in range(i + 1, len(byte_encoded) + 1):
                current_substr = byte_encoded[i:j]
                if current_substr in self.encoder:
                    if len(current_substr) > longest_length:
                        longest_match = current_substr
                        longest_length = len(current_substr)
            
            if longest_match:
                # Add the longest matching token
                bpe_tokens.append(self.encoder[longest_match])
                i += longest_length
            else:
                # Handle unknown tokens character by character
                char = byte_encoded[i]
                if char in self.encoder:
                    bpe_tokens.append(self.encoder[char])
                i += 1
        
        return bpe_tokens

    def decode(self, tokens: List[int]) -> str:
        """Convert tokens back to text"""
        text = ''.join(self.decoder[token] for token in tokens)
        return self.byte_decode(text)


# Example usage
tokenizer = SimpleGPT2Tokenizer()
text = "Deep learning is one of the revolutionary technologies in the 21st century!"

# Encode and decode
tokens = tokenizer.encode(text)
decoded_text = tokenizer.decode(tokens)

print(f"Original text: {text}")
print(f"Tokens: {tokens}")
print(f"Decoded text: {decoded_text}")

Original text: Deep learning is one of the revolutionary technologies in the 21st century!
Tokens: [29744, 4673, 318, 530, 286, 262, 12253, 8514, 287, 262, 2310, 301, 4289, 0]
Decoded text: Deep learning is one of the revolutionary technologies in the 21st century!


In [6]:
# Get the corresponding embeddings for each token from the lookup table
token_embeddings = model.state_dict()['transformer.wte.weight'][tokens]

In [7]:
token_embeddings.shape

torch.Size([14, 768])

In [8]:
# Include the batch dimension shape (1, num_tokens, embedding_dim)
token_embeddings = token_embeddings.unsqueeze(0)

token_embeddings.shape

torch.Size([1, 14, 768])

In [9]:
import torch
# Get the positional embeddings for each token

position_ids = torch.arange(len(tokens)).unsqueeze(0)

position_embeddings = model.state_dict()['transformer.wpe.weight'][position_ids]
position_embeddings.shape

torch.Size([1, 14, 768])

In [10]:
# Concatenate token embeddings and positional embeddings

embeddings = token_embeddings + position_embeddings
embeddings.shape

torch.Size([1, 14, 768])

In [11]:
embeddings

tensor([[[-0.0830, -0.2365,  0.1785,  ..., -0.1328,  0.0512,  0.1872],
         [ 0.0478, -0.0467, -0.0417,  ..., -0.0668,  0.0224, -0.0033],
         [-0.0055, -0.0747,  0.1101,  ...,  0.1342, -0.0187, -0.0468],
         ...,
         [ 0.0711, -0.0996,  0.0806,  ..., -0.0234, -0.1299, -0.1057],
         [-0.0567,  0.0671,  0.2829,  ..., -0.0525, -0.0687, -0.2013],
         [-0.1060, -0.0190,  0.1377,  ..., -0.1406,  0.0136,  0.0454]]])

# **Layer Normalization in Transformers**

Layer Normalization is a technique used to normalize the activations in neural network layers, particularly in transformers.


Given an input vector $$ x = (x_1, x_2, \dots, x_H) $$ for a particular sample with $$ (H) $$ features:

1. **Compute Mean:**  
   $$ \mu = \frac{1}{H} \sum_{i=1}^{H} x_i $$

2. **Compute Variance:**  
   $$ \sigma^2 = \frac{1}{H} \sum_{i=1}^{H} (x_i - \mu)^2 $$

3. **Normalize Each Feature:**  
   $$ \hat{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} $$  
   where \( \epsilon \) is a small constant for numerical stability.

4. **Apply Scale and Shift:**  
   $$ y_i = \gamma \hat{x}_i + \beta $$  
   where \( \gamma \) (scale) and \( \beta \) (shift) are learnable parameters.

---

### **LayerNorm in Transformer Architecture**

In a Transformer block, LayerNorm is typically applied as:

$$ \text{Output} = \text{LayerNorm}(x + \text{SubLayer}(x)) $$

Where:
- \( x \) is the input to the sub-layer.
- "SubLayer" could be the self-attention mechanism or a feed-forward layer.
- LayerNorm ensures stable gradient flow and better performance.


In [19]:
ln1_weight = model.state_dict()['transformer.h.0.ln_1.weight']
ln1_bias = model.state_dict()['transformer.h.0.ln_1.bias']

print(ln1_weight.shape)
print(ln1_bias.shape)
# Define layer normalization function
def layer_norm(x, weight, bias, eps=1e-5):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    norm_x = (x - mean) / (std + eps)

    # Scaling and shifiing using weight and bias parameters
    return norm_x * weight + bias

# Apply layer normalization
ln1_output = layer_norm(embeddings, ln1_weight, ln1_bias)
ln1_output

torch.Size([768])
torch.Size([768])


tensor([[[-4.9977e-02, -8.4150e-02,  9.8113e-03,  ..., -7.5416e-02,
           1.2459e-02,  8.0012e-02],
         [ 4.6974e-02, -9.2930e-03, -9.1281e-02,  ..., -6.6427e-02,
           6.2249e-03, -1.6448e-02],
         [-5.2359e-03, -4.0788e-02,  2.8857e-02,  ...,  1.2823e-01,
          -2.8203e-02, -5.8558e-02],
         ...,
         [ 8.2508e-02, -6.6363e-02,  2.8833e-03,  ..., -3.2398e-02,
          -1.3710e-01, -1.1933e-01],
         [-7.0218e-02,  9.3280e-02,  1.6860e-01,  ..., -6.3457e-02,
          -8.0316e-02, -2.1896e-01],
         [-1.3806e-01,  7.8144e-03,  5.6480e-02,  ..., -1.6285e-01,
           2.1035e-04,  3.3637e-02]]])

# **Self Attention Mechanism in Transformers**

Self-attention mechanism computes a weighted sum of all values based on the similarity between queries and keys. The main steps are:

1. **Input Projections**  
    Query, Key, Value matrices are obtained by projecting input embeddings:
    $$ Q = XW_Q, K = XW_K, V = XW_V $$
    where $X \in \mathbb{R}^{L \times d_{model}}$ is the input and $W_Q, W_K, W_V \in \mathbb{R}^{d_{model} \times d_k}$ are learnable parameters.

2. **Multi-head Splitting**  
    Split Q, K, V into h heads:
    $$ Q_i, K_i, V_i \in \mathbb{R}^{L \times (d_k/h)} $$
    where $i \in [1,h]$ represents each attention head.

3. **Scaled Dot-Product Attention**  
    For each head:
    $$ \text{Attention}(Q_i, K_i, V_i) = \text{softmax}\left(\frac{Q_iK_i^T}{\sqrt{d_k}}\right)V_i $$
    where:
    - $Q_iK_i^T$ computes similarity scores
    - $\sqrt{d_k}$ scales to prevent vanishing gradients
    - softmax normalizes attention weights

4. **Multi-head Concatenation**  
    Concatenate outputs from all heads:
    $$ \text{MultiHead}(Q,K,V) = \text{Concat}(\text{head}_1,...,\text{head}_h)W_O $$
    where $W_O \in \mathbb{R}^{hd_v \times d_{model}}$ is the output projection.

5. **Final Output**  
    $$ \text{Attention}(X) = \text{MultiHead}(XW_Q, XW_K, XW_V) $$

The complete attention mechanism allows the model to attend to different aspects of the input simultaneously through multiple heads.


In [31]:
NUM_HEADS = 12
EMBEDDING_DIM = 768

def scaled_dot_product_attention(query, key, value, mask=None, dropout=None):
    # Compute the dot product of query and key
    scores = torch.matmul(query, key.transpose(-2, -1))

    # Scale the scores
    scores = scores / query.size(-1)**0.5

    # Apply the mask (if any)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    # Apply the softmax function
    attention_weights = torch.nn.functional.softmax(scores, dim=-1)

    # Apply dropout (if any)
    if dropout is not None:
        attention_weights = dropout(attention_weights)

    # Compute the weighted sum of values
    output = torch.matmul(attention_weights, value)
    return output, attention_weights


attn_weights = model.state_dict()['transformer.h.0.attn.c_attn.weight']
attn_biases = model.state_dict()['transformer.h.0.attn.c_attn.bias']

# The ln1_output [1, 14, 768] is projected to [1, 14, 768*3] using the attention weights
linear_projection = torch.matmul(ln1_output, attn_weights)
linear_projection = linear_projection + attn_biases

query, key, value = linear_projection.chunk(3, dim=-1)
query.shape, key.shape, value.shape


(torch.Size([1, 14, 768]), torch.Size([1, 14, 768]), torch.Size([1, 14, 768]))

## Splitting to multiple heads

In [32]:
# Split the query, key, and value into multiple heads
query = query.view(1, 14, NUM_HEADS, EMBEDDING_DIM // NUM_HEADS).transpose(1, 2)
key = key.view(1, 14, NUM_HEADS, EMBEDDING_DIM // NUM_HEADS).transpose(1, 2)
value = value.view(1, 14, NUM_HEADS, EMBEDDING_DIM // NUM_HEADS).transpose(1, 2)

query.shape, key.shape, value.shape

(torch.Size([1, 12, 14, 64]),
 torch.Size([1, 12, 14, 64]),
 torch.Size([1, 12, 14, 64]))

In [38]:
# Apply scaled dot-product attention
attn_output, attn_weights = scaled_dot_product_attention(query, key, value)
attn_output.shape, attn_weights.shape

(torch.Size([1, 12, 14, 64]), torch.Size([1, 12, 14, 14]))

In [40]:
# Merge the heads

attn_output = attn_output.transpose(1, 2).contiguous().view(1, 14, EMBEDDING_DIM)
attn_proj_weight = model.state_dict()['transformer.h.0.attn.c_proj.weight']
attn_proj_bias = model.state_dict()['transformer.h.0.attn.c_proj.bias']

# Apply the projection layer
attn_proj = torch.matmul(attn_output, attn_proj_weight) + attn_proj_bias
attn_proj.shape

torch.Size([1, 14, 768])

In [41]:
# Compute the residual connection
attn_output = attn_proj + ln1_output
attn_output.shape

torch.Size([1, 14, 768])

In [42]:
# Apply layer normalization
ln2_weight = model.state_dict()['transformer.h.0.ln_2.weight']
ln2_bias = model.state_dict()['transformer.h.0.ln_2.bias']

ln2_output = layer_norm(attn_output, ln2_weight, ln2_bias)
ln2_output.shape

torch.Size([1, 14, 768])

# **MultiLayer Perceptron**

The MultiLayer Perceptron (MLP) in GPT-2 consists of two linear transformations with a GELU activation in between. The computation flow is:

$$ h_{intermediate} = \text{GELU}(xW_1 + b_1) $$
$$ h_{output} = h_{intermediate}W_2 + b_2 $$

where:
- $x \in \mathbb{R}^{L \times d_{model}}$ is the input
- $W_1 \in \mathbb{R}^{d_{model} \times 4d_{model}}$ expands dimensions
- $W_2 \in \mathbb{R}^{4d_{model} \times d_{model}}$ projects back
- $b_1, b_2$ are bias terms

The GELU (Gaussian Error Linear Unit) activation is defined as:

$$ \text{GELU}(x) = x \cdot \Phi(x) $$

where $\Phi(x)$ is the cumulative distribution function of the standard normal distribution:

$$ \Phi(x) = \frac{1}{2}\left[1 + \text{erf}\left(\frac{x}{\sqrt{2}}\right)\right] $$

GELU can be approximated as:

$$ \text{GELU}(x) \approx 0.5x\left(1 + \tanh\left[\sqrt{2/\pi}(x + 0.044715x^3)\right]\right) $$

In GPT-2:
- Input dimension ($d_{model}$): 768
- Intermediate dimension ($4d_{model}$): 3072
- Output dimension ($d_{model}$): 768

The complete MLP computation in a transformer block:

$$ \text{MLP}(x) = \text{LayerNorm}(x + \text{FFN}(x)) $$

where FFN is the feed-forward network:

$$ \text{FFN}(x) = \text{GELU}(xW_1 + b_1)W_2 + b_2 $$

In [43]:
mlp_weight1 = model.state_dict()['transformer.h.0.mlp.c_fc.weight']
mlp_bias1 = model.state_dict()['transformer.h.0.mlp.c_fc.bias']

mlp_weight2 = model.state_dict()['transformer.h.0.mlp.c_proj.weight']
mlp_bias2 = model.state_dict()['transformer.h.0.mlp.c_proj.bias']

mlp_weight1.shape, mlp_bias1.shape, mlp_weight2.shape, mlp_bias2.shape

(torch.Size([768, 3072]),
 torch.Size([3072]),
 torch.Size([3072, 768]),
 torch.Size([768]))

In [46]:
# Apply the first linear layer
mlp_output = torch.matmul(ln2_output, mlp_weight1) + mlp_bias1
# Apply the GELU activation function
mlp_output = torch.nn.functional.gelu(mlp_output)
# Apply the second linear layer
mlp_output = torch.matmul(mlp_output, mlp_weight2) + mlp_bias2
mlp_output.shape

torch.Size([1, 14, 768])

In [49]:
block_output = mlp_output + ln2_output
block_output.shape

torch.Size([1, 14, 768])

In [51]:
block_output.shape

torch.Size([1, 14, 768])

In [3]:
# Extracting parameters and saving them as a file

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import os

model = GPT2LMHeadModel.from_pretrained("gpt2") # 124M parameters

output_dir = 'parameters/gpt2'
os.makedirs(output_dir, exist_ok=True)

for k, v in model.state_dict().items():
    print(k, v.shape)
    np.save(os.path.join(output_dir, k), v.numpy())

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 