In [6]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import math
from torch.nn import functional as F
import numpy as np
import tiktoken

# E0 - Fetching the original model

Let's print the weights of the original model:
<br><br><br>
**Examples**:
**Token embeddings**: 
`transformer.wte.weight torch.Size([50257, 768])`: Basically a lookup table for the vocabulary used, `50257` tokens, with an embedding dimension of `768` for each token.
<br><br>
**Positional embeddings**
`transformer.wpe.weight torch.Size([1024, 768])`: Basically a lookup table for each of the `1024` positions, with an embedding dimension of `768` per position.


In [171]:
from transformers import GPT2LMHeadModel

# Download / load the pretrained GPT-2 checkpoint (the 124M-parameter variant).
model_hf = GPT2LMHeadModel.from_pretrained('gpt2')
sd_hf = model_hf.state_dict()

# List every parameter name next to the shape of its tensor.
for name, tensor in sd_hf.items():
    print(name, tensor.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

<img src="images/trans_arch.jpg" alt="Alt text" width="500"/>

# E1 - Linear layers

A **linear layer** (or **fully connected layer**) in a neural network is a layer where each input neuron is connected to each output neuron with a certain weight. This layer performs a linear transformation on the input vector.

Given an input vector $\mathbf{x} \in \mathbb{R}^n$ and a weight matrix $\mathbf{W} \in \mathbb{R}^{m \times n}$, and a bias vector $\mathbf{b} \in \mathbb{R}^m$, the output $\mathbf{y}$ of a linear layer can be computed as:

$$
\mathbf{y} = \mathbf{W} \mathbf{x} + \mathbf{b}
$$

- **Input Vector** $\mathbf{x}$: $\begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{bmatrix}$
- **Weight Matrix** $\mathbf{W}$: $\begin{bmatrix} w_{11} & w_{21} & \cdots & w_{n1} \\ w_{12} & w_{22} & \cdots & w_{n2} \\ \vdots & \vdots & \ddots & \vdots \\ w_{1m} & w_{2m} & \cdots & w_{nm} \end{bmatrix}$
- **Bias Vector** $\mathbf{b}$: $\begin{bmatrix} b_1 \\ b_2 \\ \vdots \\ b_m \end{bmatrix}$
- **Output Vector** $\mathbf{y}$: $\begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_m \end{bmatrix}$

![Alt text](images/q_k_v.png)

If we create the following:

`nn.Linear(3, 4)`

We get a weight matrix with one row per output neuron $y$ (4 rows) and one column per input (3 columns):

- **Input Vector** $\mathbf{x}$: $\begin{bmatrix} x_1 \\ x_2 \\ x_3 \end{bmatrix}$
- **Weight Matrix** $\mathbf{W} = \begin{bmatrix}
w_{11} & w_{21} & w_{31} \\
w_{12} & w_{22} & w_{32} \\
w_{13} & w_{23} & w_{33} \\
w_{14} & w_{24} & w_{34}
\end{bmatrix}$
- **Bias Vector** $\mathbf{b}$: $\begin{bmatrix} b_1 \\ b_2 \\ b_3 \\ b_4 \end{bmatrix}$
- **Output Vector** $\mathbf{y}$: $\begin{bmatrix} y_1 \\ y_2 \\ y_3 \\ y_4 \end{bmatrix}$



In [19]:
class MyLinearLayer(nn.Module):
    """Thin wrapper around a single fully connected layer, y = W x + b."""

    def __init__(self, input_size, output_size):
        """Create the wrapped ``nn.Linear`` mapping ``input_size`` features to ``output_size`` features."""
        super().__init__()
        # The attribute name fixes the state_dict keys: 'linear.weight' and 'linear.bias'.
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear transformation to ``x`` and return the result."""
        projected = self.linear(x)
        return projected

input_size, output_size = 3, 9

# One random sample with `input_size` features
x = torch.randn((1, input_size))

model = MyLinearLayer(input_size, output_size)
output = model(x)

state_dict = model.state_dict()

# Names of all parameter tensors stored in the module
print(state_dict.keys())

# Weight matrix of the wrapped linear layer
print(state_dict['linear.weight'])

# The weight matrix has exactly the dimensions we expected from the math above:
# one row per output neuron, one column per input feature.

odict_keys(['linear.weight', 'linear.bias'])
tensor([[-0.3509, -0.1814, -0.2050],
        [ 0.3712,  0.2231,  0.5085],
        [-0.5417,  0.5079, -0.1018],
        [ 0.5199,  0.3205, -0.4828],
        [-0.0970,  0.5154,  0.3764],
        [ 0.3046,  0.5113, -0.1598],
        [-0.4367, -0.1647,  0.3665],
        [-0.1620,  0.2772,  0.1710],
        [-0.1128, -0.3102, -0.1087]])


# E2 - Mask

In [20]:
# Maximum length of the input sequence the mask has to cover
block_size = 10

# Start from an all-ones square matrix ...
ones_matrix = torch.ones((block_size, block_size))
# ... keep only the main diagonal and everything below it ...
lower_triangular_matrix = ones_matrix.tril()
# ... and prepend two singleton dimensions, giving shape (1, 1, block_size, block_size).
mask = lower_triangular_matrix.view(1, 1, block_size, block_size)

print(f"""{ones_matrix}, \n\n {lower_triangular_matrix} \n\n {mask}""")

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]), 

 tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 

# E3 - Splitting

In [21]:
batch_size = 4           # Number of examples in the batch
sequence_length = 3      # Tokens per example, e.g. "I love dogs" -> "I", "love", "dogs" (assuming one token per word)
embedding_dimension = 5  # Size of each token's embedding vector

# Random stand-in for a batch of embedded token sequences
input_tensor = torch.randn((batch_size, sequence_length, embedding_dimension))
print("\n\nInput tensor \n\n", input_tensor)

B, T, C = input_tensor.shape

# A single linear layer produces queries, keys and values at once,
# so its output is three times as wide as the embedding.
lin_layer = nn.Linear(in_features=embedding_dimension, out_features=3 * embedding_dimension)
qkv = lin_layer(input_tensor)
print("\n\nWeights linear layer \n\n", lin_layer.weight)

print("\n\nqkv tensor \n\n", qkv)
# Cut the last dimension into three pieces of size `embedding_dimension` each.
q, k, v = torch.split(qkv, embedding_dimension, dim=2)



Input tensor 

 tensor([[[ 0.8794,  0.0162, -0.0192,  0.2003, -0.7611],
         [ 1.0562, -0.5795, -0.5137,  1.1017,  1.6426],
         [ 0.6179, -0.3312, -0.2809,  1.2661, -0.7663]],

        [[ 2.1343,  0.2184,  0.4019,  0.7011, -0.7549],
         [ 0.0954, -0.9134, -0.2705,  0.0145,  1.8395],
         [ 0.1218, -0.3264,  0.8258, -0.9379,  1.2374]],

        [[-0.8918, -0.0756,  0.2246, -0.9340,  0.6094],
         [ 0.6002, -1.3578,  0.9987, -0.9037,  0.3855],
         [-0.4717, -1.1441,  0.0813,  0.0183,  0.6153]],

        [[ 1.0018,  2.1894, -0.4847,  0.9124,  1.0803],
         [-1.0512,  0.0516,  0.9709, -0.0821, -0.7628],
         [-1.0259, -1.8742, -0.3618, -0.0867, -1.1570]]])


Weights linear layer 

 Parameter containing:
tensor([[ 0.1323, -0.4311, -0.3484, -0.3857, -0.2718],
        [-0.2823, -0.4313, -0.2143,  0.0630, -0.2421],
        [ 0.0306, -0.1347,  0.4031,  0.0497,  0.1724],
        [ 0.0738,  0.0929, -0.2537, -0.3381,  0.0239],
        [-0.2294, -0.3157,  0.1039

In [22]:
n_head = 1
# Shape legend: B = batch size, T = sequence length, C = embedding dimension, n_head = number of heads.
# Each of q, k and v is cut along its last dimension into n_head chunks of size C // n_head
# (one chunk per head, as in multi-head attention). The head axis is then moved in front of the
# sequence axis: (B, T, n_head, C // n_head) -> (B, n_head, T, C // n_head).
k = torch.transpose(k.view(B, T, n_head, C // n_head), 1, 2)  # Dimensions are 0-indexed, so this swaps T and n_head.
q = torch.transpose(q.view(B, T, n_head, C // n_head), 1, 2)
v = torch.transpose(v.view(B, T, n_head, C // n_head), 1, 2)
k

tensor([[[[-0.1881, -0.9082, -0.3982, -0.1748,  0.4831],
          [-0.4870,  0.3338,  0.6576,  1.1507, -0.3581],
          [ 0.3891, -0.3131,  0.1238,  0.3145,  0.7347]]],


        [[[-0.4980, -1.2683, -0.7886,  0.0757,  1.1504],
          [-0.6843,  0.2379,  0.7497,  0.5966, -1.1216],
          [-1.2493, -0.6761, -0.1807, -0.4770, -0.8084]]],


        [[[-0.6575, -0.3624, -0.0052, -0.7125, -0.9024],
          [-0.8934, -1.1339, -0.0502, -0.3495, -0.5188],
          [-0.1730, -0.0621,  0.6239,  0.0634, -0.6826]]],


        [[[-0.9399,  0.0199, -0.7161,  0.1496,  0.4105],
          [-0.1005, -0.6379, -0.3483, -1.1627,  0.2624],
          [ 0.8259, -0.3492,  0.8266, -0.1793, -0.4273]]]],
       grad_fn=<TransposeBackward0>)

#E4 - Attention

In [23]:
# Layout of the tensors after the head split: (batch, number of heads, sequence length, embedding dimension per head)
print(q.size(), k.size(), v.size())


print(q)
print("\n\n")
# Swapping the last two axes of k turns (..., T, d) into (..., d, T),
# which is what makes the matrix product of q and k possible.
print(torch.transpose(k, -2, -1))


torch.Size([4, 1, 3, 5]) torch.Size([4, 1, 3, 5]) torch.Size([4, 1, 3, 5])
tensor([[[[ 0.5620, -0.4771, -0.1193, -0.1770, -0.0912],
          [ 0.0133, -0.6892,  0.2262, -0.3412,  0.0587],
          [ 0.3586, -0.1289, -0.1339, -0.5227, -0.0447]]],


        [[[ 0.2992, -0.9787,  0.0876, -0.3416, -0.4438],
          [ 0.3113, -0.4422,  0.3197, -0.1326,  0.5185],
          [ 0.2108, -0.8520,  0.5322, -0.0466,  0.4928]]],


        [[[ 0.3473, -0.3930,  0.1170,  0.0381,  0.5481],
          [ 0.8769, -0.3709,  0.6103, -0.1829,  0.6759],
          [ 0.5444,  0.0385,  0.2644, -0.3156,  0.6890]]],


        [[[-0.9717, -1.7501, -0.2431, -0.0448, -0.8145],
          [ 0.0558, -0.1769,  0.2017, -0.4720,  0.4686],
          [ 1.4625,  1.0273, -0.1435, -0.3189,  0.9108]]]],
       grad_fn=<TransposeBackward0>)



tensor([[[[-0.1881, -0.4870,  0.3891],
          [-0.9082,  0.3338, -0.3131],
          [-0.3982,  0.6576,  0.1238],
          [-0.1748,  1.1507,  0.3145],
          [ 0.4831, -0.3581,  

In [24]:
# Look at batch element 0 only: k[0] next to its transpose.
# The transpose is what allows us to multiply the q and k tensors.
print(f"{k[0]}, \n\n {k.transpose(-2, -1)[0]}")

# Scaled dot-product scores. torch.matmul is the function form of the `@` operator;
# the product is scaled by 1 / sqrt(d), where d = k.size(-1) is the length of each (per-head) embedding.
att = torch.matmul(q, k.transpose(-2, -1)).mul(1.0 / math.sqrt(k.size(-1)))

tensor([[[-0.1881, -0.9082, -0.3982, -0.1748,  0.4831],
         [-0.4870,  0.3338,  0.6576,  1.1507, -0.3581],
         [ 0.3891, -0.3131,  0.1238,  0.3145,  0.7347]]],
       grad_fn=<SelectBackward0>), 

 tensor([[[-0.1881, -0.4870,  0.3891],
         [-0.9082,  0.3338, -0.3131],
         [-0.3982,  0.6576,  0.1238],
         [-0.1748,  1.1507,  0.3145],
         [ 0.4831, -0.3581,  0.7347]]], grad_fn=<SelectBackward0>)


# E5 - Mask

In [25]:

# Apply the causal mask: every score whose mask entry is zero is replaced with '-inf'.
# The mask was built for the maximum sequence length of the overall setting (e.g. 1024);
# slicing it with [:T, :T] crops it to the sequence length of the current batch.
att = att.masked_fill(mask[:, :, :T, :T].eq(0), float('-inf'))
print(att)

tensor([[[[ 0.1619,    -inf,    -inf],
          [ 0.2779, -0.2243,    -inf],
          [ 0.0772, -0.3985, -0.0152]]],


        [[[ 0.2177,    -inf,    -inf],
          [ 0.3310, -0.3306,    -inf],
          [ 0.5006, -0.2364, -0.0714]]],


        [[[-0.2720,    -inf,    -inf],
          [-0.4136, -0.3042,    -inf],
          [-0.3444, -0.3535, -0.1887]]],


        [[[ 0.3181,    -inf,    -inf],
          [-0.0352,  0.3169,    -inf],
          [-0.4137, -0.0638,  0.1782]]]], grad_fn=<MaskedFillBackward0>)


# E6 - Merging

### Step 1 - Check for contiguity
Contiguous Memory:  `[   Book 1   ] [   Book 2   ] [   Book 3   ] [   Book 4   ] [   Book 5   ]`

Non-Contiguous Memory:  `[         ] [   Book 3   ] [         ] [   Book 1   ] [   Book 2   ]`

Think of `.contiguous()` as a librarian who reorganizes the books on the shelf:
`[   Book 1   ] [   Book 2   ] [   Book 3   ] [   Book 4   ] [   Book 5   ]`

### Step 2 - Merging the `heads`
We started out with `n_head = 1`. In reality we would use several heads to capture different "perspectives".
In order to merge the heads we apply `.view(B, T, C)` to our four-dimensional tensor, which has the shape `(B, T, n_head, C // n_head)` after transposing it back.
We are essentially flattening (combining) the last two dimensions (`n_head` and `C // n_head`) into the single embedding dimension `C`.

In [26]:
# Normalise the masked scores along the last axis into attention weights
att = torch.softmax(att, dim=-1)
# Matrix multiplication attention * values
y = torch.matmul(att, v)
# Swap the head and sequence axes back, make the memory contiguous, then fold the heads into C
y = y.transpose(1, 2).contiguous().view(B, T, C)
print(y)
# Output projection back to the embedding dimension
c_proj = nn.Linear(in_features=embedding_dimension, out_features=embedding_dimension)
y = c_proj(y)

print(y)

tensor([[[ 0.0430,  0.4818, -0.7781, -0.1399,  0.3219],
         [-0.2011,  0.5236, -0.6125, -0.3346,  0.3160],
         [-0.0851,  0.4677, -0.7423, -0.3454,  0.1781]],

        [[ 0.2435,  0.8231, -1.1731, -0.8928,  0.5210],
         [-0.0878,  0.6865, -0.6798, -0.5871,  0.4745],
         [-0.2452,  0.6413, -0.3379, -0.4157,  0.5949]],

        [[-0.8175,  0.1464,  0.6494,  0.6556,  0.6400],
         [-0.2834,  0.4094,  0.2859,  0.3151,  0.5472],
         [-0.2951,  0.3427,  0.2294,  0.3045,  0.3887]],

        [[-1.6209,  0.4503,  0.0465, -0.6641,  1.2012],
         [-0.8728,  0.2269,  0.2086, -0.0842,  0.7600],
         [-0.2434,  0.1376, -0.1420,  0.3246,  0.1729]]],
       grad_fn=<ViewBackward0>)
tensor([[[ 0.0402,  0.5025,  0.7070, -0.5162,  0.6892],
         [-0.1152,  0.4472,  0.7358, -0.3647,  0.6219],
         [-0.0133,  0.4823,  0.7441, -0.4807,  0.5555]],

        [[ 0.1551,  1.1552,  0.9167, -0.3060,  0.7705],
         [-0.0968,  0.6976,  0.7757, -0.2331,  0.7158],
      

In [27]:
# The output tensor has the same dimensions as the input tensor:
# batch_size=4, sequence_length=3, embedding_dimension=5
print(input_tensor.size(), y.size())

torch.Size([4, 3, 5]) torch.Size([4, 3, 5])


# E7 - Stacking blocks

In [28]:
# Placeholder: the original code puts Block(config) objects in this list instead of 'test'.
# The result is a list of n_layer blocks, e.g. n_layer = 3 --> 3 blocks stacked on each other.
blocks = ['test'] * 3

# E8 - ModuleDict

`ModuleDict` lets us create a dict with layers as items.
It therefore allows us to store a collection of modules in a single object.

In [29]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    """Two linear layers stored in an ``nn.ModuleDict`` and applied one after the other."""

    def __init__(self):
        """Register the layers under the keys 'lin1' and 'lin2'."""
        super().__init__()
        self.layers = nn.ModuleDict(dict(
            lin1=nn.Linear(5, 20),    # Input size is 5, output size is 20
            lin2=nn.Linear(20, 100),  # Input size is 20, output size is 100
        ))

    def forward(self, x):
        """Pass ``x`` through 'lin1' and then through 'lin2'."""
        for key in ('lin1', 'lin2'):
            x = self.layers[key](x)
        return x

# Dummy input: a batch of 3 samples with 5 features each
dummy_input = torch.randn((3, 5))

model = MyModel()
output = model(dummy_input)
print("Output shape:", output.shape)

Output shape: torch.Size([3, 100])


# E9 - Positional Embeddings

In [96]:
import numpy as np

block_size = 200  # The maximum length of the input sequence
n_embd = 5        # Embedding dimension
T = 100           # Sequence length

# torch.arange works like Python's built-in range but returns a 1-D tensor
# instead of a range object: 0, 1, ..., T - 1 (T itself is excluded).
pos = torch.arange(T, dtype=torch.long, device='cpu')  # Shape (T)
wpe = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
pos_emd = wpe(pos)

numpy_array = pos_emd.detach().numpy()

# Position 1 is embedded into a 5-dimensional vector.
print(pos[1], numpy_array[1])

tensor(1) [-0.28834486 -0.42112446 -0.90688705 -1.8311492  -1.5700469 ]


# E10 - Regular Embeddings

In [112]:
import tiktoken

enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode("I like dogs!")
print(tokens) # Check tokens created here: https://tiktokenizer.vercel.app/?model=gpt2

# Add a leading batch dimension; "I like dogs!" is encoded into 4 tokens.
tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)  # Shape: (1, 4)

n_embd = 5  # Embedding dimension; adjust as needed
# 50257 is the number of tokens in the GPT-2 vocabulary. The table size needs to match it.
wte = nn.Embedding(50257, n_embd)
embeddings = wte(tokens)  # Shape: (1, 4, n_embd)

# Embedding vectors of the first (and only) sequence in the batch
embeddings[0]


[40, 588, 6844, 0]


tensor([[ 2.7135e-01, -2.1596e+00,  3.2757e-01,  9.1524e-01,  9.1762e-01],
        [ 2.0205e-03, -7.0155e-01, -7.5294e-01, -5.6002e-01, -7.3883e-01],
        [-2.8199e-01, -1.6643e+00,  6.9581e-01, -4.5845e-01,  3.5994e-01],
        [-1.1862e+00,  3.1415e-01, -8.7480e-01, -1.6340e+00,  4.5589e-01]],
       grad_fn=<SelectBackward0>)

# E11 - Multinomial distribution

If we have a set of probabilities, sampling from a multinomial distribution introduces some randomness.
We see, for example, that the outcome with p=0.8 is not picked exactly 80% of the time. In the context of an LLM we want this kind of randomness:
if we only chose the most probable token at each step, this would lead to repetitive and predictable text.

In [170]:
from scipy.stats import multinomial

# Parameters of the distribution
n = 100              # number of trials per sample
p = [0.1, 0.1, 0.8]  # probability of each of the three outcomes

# Draw 5 samples; each row counts how often every outcome occurred in n trials
samples = multinomial.rvs(n=n, p=p, size=5)

print(samples)

[[ 9 10 81]
 [ 7  8 85]
 [14  7 79]
 [13 11 76]
 [11  3 86]]


# E12 - weight sharing

# TODO

# E13 - Residual stream

The residual stream adds the input of a layer to its output.<br>
We are not talking about weights here. We are talking about the actual inputs and outputs.
<br>

Example:<br>
`attn_output, _ = self.attention(x, x, x)`<br>
`x = x + attn_output # Residual connection: add the input 'x' back to the output`<br>
`x = self.norm1(x)`<br>

<br>
The problem with this approach is that the standard deviation of the output explodes as more and more layers add to the stream.

<img src="images/res.png" alt="Alt text" width="200"/>

In [100]:
n = 100

# Accumulate n draws with mean 0 and standard deviation 1 on top of each other.
x = torch.zeros(30)
for _ in range(n):
    x.add_(torch.randn(30))
# After just 100 iterations the std has moved to >10!
print(x.std())

# Fix: scale every contribution by 1 / sqrt(n), the square root of the number of iterations.
x = torch.zeros(30)
for _ in range(n):
    x.add_(n**-0.5 * torch.randn(30))
print(x.std())

tensor(11.1456)
tensor(0.9557)


# E14 - Quantization

Quantization is used to speed up training and to use memory more efficiently.
It works as follows:

<img src="images/prec2.png" alt="Alt text" width="400"/>

Different procedures are available with `TF32` being the gold standard for training.

<img src="images/prec1.png" alt="Alt text" width="400"/>

What is missing here is the sign bit (0 for positive and 1 for negative).

### `.unsqueeze()`

In [139]:
"""
Adds a new dimension to the tensor at the specified position. In this case, 0 indicates that the new dimension will be added at the beginning.
Adding this extra dimension is often necessary for batching in PyTorch. Transformer models expect input tensors to have a specific shape, typically [batch_size, sequence_length].
By unsqueezing the tensor, we create a batch dimension, even if there's only one sequence in the batch.
"""

tokens = [101, 2009, 2003, 1037, 3722, 102]
tens = torch.tensor(tokens)

# A 1D tensor with 6 elements has the shape (6,).
# Our model needs a batch dimension, so we add a dimension at the beginning.
print(tens.shape)

tens2 = torch.unsqueeze(tens, 0)
print(tens2.shape)

# Repeat the batched tensor 5 times along the first dimension
tens2.repeat(5, 1)

torch.Size([6])
torch.Size([1, 6])


tensor([[ 101, 2009, 2003, 1037, 3722,  102],
        [ 101, 2009, 2003, 1037, 3722,  102],
        [ 101, 2009, 2003, 1037, 3722,  102],
        [ 101, 2009, 2003, 1037, 3722,  102],
        [ 101, 2009, 2003, 1037, 3722,  102]])

### `@classmethod`

In [136]:
class Dog():
    """Example class with a regular instance method."""

    def bark(self):
        """Print the dog sound; must be called on an instance."""
        print("Woof!")


class Cat():
    """Example class with a class method."""

    @classmethod
    def miau(cls):
        """Print the cat sound; callable on the class itself, no instance needed."""
        print("Miau!")

# Using the Dog class to create an instance of a dog and call the bark method
doggi = Dog()
doggi.bark()

# While the Dog class has an instance method, the Cat class has a class method.
# This means that the Cat class method can be called without creating an instance of the class.
Cat.miau()
try:
    Dog.bark()
except TypeError:
    # Calling an instance method on the class leaves `self` unfilled, which raises TypeError.
    # Catching only TypeError keeps unrelated errors visible.
    print("Can't call method without instance")

Woof!
Miau!
Can't call method without instance


### `state_dict()`

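`state_dict()` returns a dictionary that maps the name of every model parameter to its tensor.

`h.#` stands for the individual transformer blocks (layers), e.g. `h.0` is the first of the 12 blocks. It does not refer to the attention heads.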

In [150]:
from transformers import GPT2LMHeadModel

model_hf = GPT2LMHeadModel.from_pretrained("gpt2") # Load the GPT-2 model
sd_hf = model_hf.state_dict() # State dict are the raw tensors

print("Model information:", model_hf.config)
print("Number of heads:", model_hf.config.num_attention_heads)
print("Number of layers:", model_hf.config.num_hidden_layers)

# Dedicated loop names: `k` and `v` are already used for the key and value tensors
# of the attention walkthrough, and reusing them here would silently overwrite those.
for param_name, param_tensor in sd_hf.items():
    print(param_name, param_tensor.shape)

Model information: GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

Number of heads: 12
Number of layers: 12
transformer.wte.weight torch.Size([50257, 768])
t

In [151]:
# Inspect a single layer

# The positional embedding table covers a context length `n_ctx` of 1024 positions,
# with an embedding dimension of 768 per position.
print(sd_hf["transformer.wpe.weight"].size())

# Row 0 of the table is the embedding of the first position.
sd_hf["transformer.wpe.weight"][0]



torch.Size([1024, 768])


tensor([-1.8821e-02, -1.9742e-01,  4.0267e-03,  1.1347e-02,  6.3824e-02,
        -1.0501e-01,  3.6937e-02, -1.6803e-01, -4.9111e-02, -5.6461e-02,
        -2.4560e-03,  1.3503e-02, -4.1711e-03,  1.5115e-02,  1.6595e-02,
        -1.3808e-01, -6.3314e-03, -4.6150e-02,  2.6675e-02, -2.0417e-01,
         1.3454e-02, -3.6267e-02,  1.9301e-02, -2.5931e-02,  8.0243e-03,
         8.4712e-03, -1.9906e-02,  6.6802e-02,  7.1151e-03, -2.6618e-02,
         2.0829e-02, -3.3732e-02, -8.2898e-03,  9.8622e-03, -2.7369e-02,
        -9.9118e-02, -7.5254e-01,  2.3550e-02, -3.0513e-02,  7.7456e-02,
         3.4301e-03,  7.1132e-03,  2.6479e-02, -1.2113e-03,  1.1219e-01,
        -2.0606e-03, -2.2458e-02, -2.2287e-02,  2.3570e-02,  3.9777e-01,
         1.8856e-02,  2.0280e-02,  6.3043e-01,  2.3146e-02, -4.6894e-02,
         4.0653e+00, -1.7403e-02, -5.1683e-02,  7.2271e-02, -7.9312e-02,
         4.0248e-02,  1.9908e-02, -4.6380e-02, -2.8380e-02,  7.2535e-03,
         2.6772e-02,  1.4972e-03, -2.9892e-01, -1.1

# Flash attention (Fa) [E14]

Fa is a kernel fusion algorithm. Fa needs more flops than the regular procedure but it's way more mindful of the memory (reads and writes).<br>

<img src="images/flash.png" alt="Alt text" width="800"/><br><br>


Our original code looks like this:

<code>
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))<br>
att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))<br>
att = F.softmax(att, dim=-1)<br>
y = att @ v
</code>

<br>

Instead we are using:

`F.scaled_dot_product_attention(q, k, v, is_causal=True)`



# Discussion

### D1

Every head captures a different dimension. We split the embedding by the number of heads:

1) `k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)`

We need to do so as we concatenate the attention outputs of all heads afterwards:

2) `y = y.transpose(1, 2).contiguous().view(B, T, C)`

We see in #2 that we reshape the tensor `y` from four to three dimensions. The transformation to three dimensions merges the outputs of the `h` attention heads.

<img src="images/trans_head.jpg" alt="Alt text" width="300"/>

In [166]:
import numpy as np
from scipy.stats import multinomial

# Parameters of the distribution
n = 100              # number of trials per sample
p = [0.1, 0.1, 0.8]  # probability of each of the three outcomes

# Draw 10 samples; each row counts how often every outcome occurred in n trials
samples = multinomial.rvs(n=n, p=p, size=10)

print(samples)


[[11  6 83]
 [ 8  9 83]
 [13 15 72]
 [ 9 11 80]
 [17  9 74]
 [13 14 73]
 [10 13 77]
 [12  9 79]
 [12 12 76]
 [ 9  8 83]]


# Training

### Dataset

In [8]:
# Read the whole training text. The encoding is stated explicitly so that the
# result does not depend on the platform's default encoding.
with open("input.txt", "r", encoding="utf-8") as file:
    text = file.read()

data = text[:1000]  # Get the first 1k characters.
print(data[:100])   # Preview the first 100 of them.

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


### Data preparation

We need to generate training examples: sentences.
As we are using self-supervised learning we only need to create batches of sentences.

In [21]:
# The gpt2 tokenizer has a compression ratio of about 3:1
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode(data)
print(tokens[:24])

# Convert the first 24 tokens to a tensor and arrange them as a 4x6 tensor,
# i.e. 4 sequences with 6 tokens each.
# The problem though is that the last token has no 'label' (no following token to predict).
buf = torch.tensor(tokens[:24])
x = buf.reshape(4, 6)
print(x)

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13]
tensor([[ 5962, 22307,    25,   198,  8421,   356],
        [ 5120,   597,  2252,    11,  3285,   502],
        [ 2740,    13,   198,   198,  3237,    25],
        [  198,  5248,   461,    11,  2740,    13]])


In [20]:
# Take one extra token so that every input position has a label.
buf = torch.tensor(tokens[:24 + 1])
print(buf)
# x: everything except the last token.
# y: the same sequence moved one position ahead, so y[i] is the token that follows x[i].
x, y = buf[:-1].view(4, 6), buf[1:].view(4, 6)
print(x, y, sep="\n")

tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198])
tensor([[ 5962, 22307,    25,   198,  8421,   356],
        [ 5120,   597,  2252,    11,  3285,   502],
        [ 2740,    13,   198,   198,  3237,    25],
        [  198,  5248,   461,    11,  2740,    13]])
tensor([[22307,    25,   198,  8421,   356,  5120],
        [  597,  2252,    11,  3285,   502,  2740],
        [   13,   198,   198,  3237,    25,   198],
        [ 5248,   461,    11,  2740,    13,   198]])


### `buf = torch.tensor(tokens[:B*T+1])`

This only works with texts that contain at least `B*T+1` tokens.

In [57]:
B = 5 # Batch size: number of sequences in the batch
T = 10 # Sequence length of each example

enc = tiktoken.get_encoding('gpt2')
sentence = """
The dog (Canis familiaris or Canis lupus familiaris) is a domesticated descendant of the wolf.
Also called the domestic dog, it was domesticated from an extinct population of Pleistocene wolves over 14,000 years ago.
The dog was the first species to be domesticated by humans.
Experts estimate that hunter-gatherers domesticated dogs more than 15,000 years ago, which was before the development of agriculture.
Due to their long association with humans, dogs have expanded to a large number of domestic individuals and gained the ability to thrive on a starch-rich diet that would be inadequate for other canids.
"""
tokens = enc.encode(sentence)

# B*T tokens fill the batch, plus one extra token that serves as the label of the last position.
n_required = B * T + 1
if len(tokens) < n_required:
    # Without this check, .view(B, T) below fails with a far less helpful shape error.
    raise ValueError(f"Need at least {n_required} tokens for a ({B}, {T}) batch, got {len(tokens)}")

buf = torch.tensor(tokens[:n_required])
x = buf[:-1].view(B,T)  # inputs
y = buf[1:].view(B,T)   # targets: for every input position, the token that follows it
print(buf,"\n")
print(x, "\n")
print(y, "\n")




tensor([  198,   464,  3290,   357,  6090,   271,  5385,   271,   393,  1680,
          271,   300,   929,   385,  5385,   271,     8,   318,   257, 26026,
         3474, 45923,   286,   262, 17481,    13,   220,   198,  7583,  1444,
          262,  5928,  3290,    11,   340,   373, 26026,  3474,   422,   281,
        28881,  3265,   286, 18063,   396, 34973, 23214,   625,  1478,    11,
          830]) 

tensor([[  198,   464,  3290,   357,  6090,   271,  5385,   271,   393,  1680],
        [  271,   300,   929,   385,  5385,   271,     8,   318,   257, 26026],
        [ 3474, 45923,   286,   262, 17481,    13,   220,   198,  7583,  1444],
        [  262,  5928,  3290,    11,   340,   373, 26026,  3474,   422,   281],
        [28881,  3265,   286, 18063,   396, 34973, 23214,   625,  1478,    11]]) 

tensor([[  464,  3290,   357,  6090,   271,  5385,   271,   393,  1680,   271],
        [  300,   929,   385,  5385,   271,     8,   318,   257, 26026,  3474],
        [45923,   286,   262,

In [59]:
a, b = 1, 2

# Rebinding: `a` now refers to the same value as `b`.
a = b
print(a)

2


# FAQ

### F1 - Block size / sequence length
- Sequence length (T) is the length of the input sequence, which can be less than or equal to the block size.
- The block size is the maximum length of input sequences that the model can process.

### F2 - Batch optimization
- Always use powers of two, e.g. a batch size of 16, 32, 64 etc.
- This is most efficient for the GPU
- Always max out the maximum batch size that fits on your GPU

#