In [1]:
import math

import matplotlib.pyplot as plt
import torch
import torch.nn as nn

# E1 - Linear layers

A **linear layer** (or **fully connected layer**) in a neural network is a layer where each input neuron is connected to each output neuron with a certain weight. This layer performs a linear transformation on the input vector.

Given an input vector $\mathbf{x} \in \mathbb{R}^n$, a weight matrix $\mathbf{W} \in \mathbb{R}^{m \times n}$, and a bias vector $\mathbf{b} \in \mathbb{R}^m$, the output $\mathbf{y} \in \mathbb{R}^m$ of a linear layer is computed as:

$$
\mathbf{y} = \mathbf{W} \mathbf{x} + \mathbf{b}
$$

- **Input Vector** $\mathbf{x}$: $\begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{bmatrix}$
- **Weight Matrix** $\mathbf{W}$: $\begin{bmatrix} w_{11} & w_{12} & \cdots & w_{1n} \\ w_{21} & w_{22} & \cdots & w_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m1} & w_{m2} & \cdots & w_{mn} \end{bmatrix}$
- **Bias Vector** $\mathbf{b}$: $\begin{bmatrix} b_1 \\ b_2 \\ \vdots \\ b_m \end{bmatrix}$
- **Output Vector** $\mathbf{y}$: $\begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_m \end{bmatrix}$

![q, k, v diagram](images/q_k_v.png)

If we create the following:

`nn.Linear(3, 4)`

We get a weight matrix with one row per output neuron of $\mathbf{y}$ and one column per input feature of $\mathbf{x}$, i.e. $\mathbf{W} \in \mathbb{R}^{4 \times 3}$:

- **Input Vector** $\mathbf{x}$: $\begin{bmatrix} x_1 \\ x_2 \\ x_3 \end{bmatrix}$
- **Weight Matrix** $\mathbf{W}$: $\begin{bmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \\ w_{31} & w_{32} & w_{33} \\ w_{41} & w_{42} & w_{43} \end{bmatrix}$
- **Bias Vector** $\mathbf{b}$: $\begin{bmatrix} b_1 \\ b_2 \\ b_3 \\ b_4 \end{bmatrix}$
- **Output Vector** $\mathbf{y}$: $\begin{bmatrix} y_1 \\ y_2 \\ y_3 \\ y_4 \end{bmatrix}$



In [2]:
class MyLinearLayer(nn.Module):
    """Minimal module that wraps a single fully connected layer.

    The wrapped layer is stored as ``self.linear``, so its parameters show up
    in the state dict under ``linear.weight`` and ``linear.bias``.
    """

    def __init__(self, input_size: int, output_size: int) -> None:
        """Create the wrapped ``nn.Linear(input_size, output_size)`` layer."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer to ``x`` and return the result."""
        projected = self.linear(x)
        return projected

# Layer dimensions for the demo: 3 input features are mapped to 9 output features.
input_size = 3
output_size = 9

x = torch.randn(1, input_size)  # Random input tensor: one sample with input_size features

model = MyLinearLayer(input_size, output_size)
output = model(x)

state_dict = model.state_dict()

# Print the keys of the state_dict
print(state_dict.keys())

# Access the weights of a specific layer
print(state_dict['linear.weight'])

# The weight matrix has exactly the dimensions we expect based on the math above:
# one row per output neuron and one column per input feature.
# Check it explicitly instead of relying on reading the printout.
assert state_dict['linear.weight'].shape == (output_size, input_size)
assert output.shape == (1, output_size)

odict_keys(['linear.weight', 'linear.bias'])
tensor([[ 0.1082, -0.1936, -0.4974],
        [-0.1222,  0.1985,  0.0237],
        [-0.1828, -0.2036,  0.5703],
        [-0.4793,  0.0586, -0.4492],
        [ 0.5626,  0.5330, -0.3480],
        [-0.1288,  0.1013,  0.1506],
        [-0.2883, -0.3445,  0.0686],
        [ 0.1738,  0.3542,  0.4543],
        [ 0.5254, -0.1385, -0.2611]])


# E2 - Mask

In [3]:
block_size = 10  # Example block size

# Start from a square matrix of ones ...
ones_matrix = torch.ones((block_size, block_size))
# ... keep only the diagonal and everything below it (upper triangle becomes 0) ...
lower_triangular_matrix = ones_matrix.tril()
# ... and add two leading axes of size 1, giving shape (1, 1, block_size, block_size).
reshaped_tensor = lower_triangular_matrix.view(1, 1, *lower_triangular_matrix.shape)

# Show all three stages one after another.
print(f"{ones_matrix}, \n\n {lower_triangular_matrix} \n\n {reshaped_tensor}")





tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]), 

 tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 

# E3 - Splitting

In [4]:
# Toy dimensions for the example batch.
batch_size = 4  # Number of examples in the batch
sequence_length = 3  # Tokens per example, assuming "I love dogs" is tokenized as "I", "love", "dogs"
embedding_dimension = 5  # Dimension of the embedding space

# Random stand-in for a batch of token embeddings.
rand_tensor = torch.randn((batch_size, sequence_length, embedding_dimension))
print("\n\nRand tensor \n\n", rand_tensor)

B, T, C = rand_tensor.shape

# A single linear layer whose output is three times as wide as the embedding,
# so one forward pass produces the values for q, k and v side by side.
lin_layer = nn.Linear(in_features=embedding_dimension, out_features=3 * embedding_dimension)
qkv = lin_layer(rand_tensor)
print("\n\nWeights linear layer \n\n", lin_layer.weight)

print("\n\nqkv tensor \n\n", qkv)
# Cut the last axis into three consecutive chunks of width embedding_dimension.
q, k, v = torch.split(qkv, embedding_dimension, dim=-1)



Rand tensor 

 tensor([[[-1.7575,  0.7193, -1.9861,  0.5738,  0.5189],
         [ 0.9489, -2.5567, -0.5070,  0.5043, -0.2296],
         [ 0.6315,  2.3076, -0.7179,  1.4599, -1.1427]],

        [[ 1.7366, -1.2806,  0.4704, -0.5723, -0.7280],
         [ 0.8246,  1.1727,  1.3063, -0.5635, -1.9792],
         [-0.2773,  0.5857, -1.4455, -0.7405,  0.9141]],

        [[ 0.1281,  0.0568, -0.7959, -0.4782,  0.7437],
         [ 1.0682, -0.3148, -0.1492, -0.8080,  1.0871],
         [-0.5311,  0.0943,  0.4168,  0.4737,  0.0230]],

        [[-0.4430,  0.3532, -0.9002, -0.1892,  1.3197],
         [-1.0167,  1.1315, -0.9219,  1.6219, -0.6803],
         [ 1.8157,  0.6416, -0.0551,  0.8636, -1.8395]]])


Weights linear layer 

 Parameter containing:
tensor([[-2.0065e-01, -1.9127e-01,  1.9535e-01, -2.4680e-01,  4.0346e-01],
        [-1.4478e-01,  4.0237e-01,  4.8295e-02, -2.1675e-01,  3.5998e-02],
        [-1.4243e-02,  3.1968e-01, -3.2070e-01,  3.4785e-01,  1.8699e-01],
        [-9.4957e-02,  3.2350e

In [5]:
n_head = 1
# Shape legend:
#   B = batch size, T = sequence length, C = embedding dimension, n_head = number of heads
# Each projection is first viewed as (B, T, n_head, C // n_head): the embedding axis is cut
# into n_head chunks, one per head, which is the usual first step of multi-head attention.
# transpose(1, 2) then swaps the T and head axes (axes are counted from zero),
# giving (B, n_head, T, C // n_head).
k, q, v = (
    projection.view(B, T, n_head, C // n_head).transpose(1, 2)
    for projection in (k, q, v)
)
k

tensor([[[[ 0.5970,  0.7426,  0.7730,  0.2325,  0.4139],
          [ 0.6774, -0.3361, -0.5261,  0.5069,  0.3542],
          [ 0.4941,  0.7538, -0.4417,  0.0328, -0.5142]]],


        [[[ 0.1475, -0.9339, -0.9463, -0.2146, -0.3045],
          [-0.7788, -0.2067, -0.7124, -0.2632, -0.5228],
          [ 0.6370, -0.4917,  0.1583, -0.6259, -0.2261]]],


        [[[ 0.6242, -0.6364, -0.0892, -0.5008, -0.2814],
          [ 0.7822, -1.3659, -0.5189, -0.8683, -0.6754],
          [ 0.2109, -0.0777,  0.0032,  0.0694, -0.1184]]],


        [[[ 0.8273, -0.5531,  0.1761, -0.5193, -0.2953],
          [ 0.3803,  1.1396,  0.2775,  0.5924,  0.2095],
          [ 0.2071,  0.2213, -1.0140,  0.1385, -0.4151]]]],
       grad_fn=<TransposeBackward0>)

# E4 - Attention

In [6]:
# After the head split the axes are: batch, number of heads, sequence length, per-head embedding size.
print(*(tensor.shape for tensor in (q, k, v)))


print(q)
print("\n\n")
# Swap the last two axes of k so that its shape lines up with q for matrix multiplication.
print(k.transpose(-1, -2))

# With the last two axes of k swapped, q @ k^T is a valid matrix product.


torch.Size([4, 1, 3, 5]) torch.Size([4, 1, 3, 5]) torch.Size([4, 1, 3, 5])
tensor([[[[-2.3078e-01,  2.4920e-01,  1.6170e+00,  5.5065e-01, -8.3204e-01],
          [-1.4309e-01, -1.4012e+00, -1.0733e-01, -4.9145e-01, -5.5607e-01],
          [-1.6553e+00,  3.5177e-01,  1.6815e+00,  1.5193e+00, -1.3839e+00]]],


        [[[-2.8969e-01, -7.3919e-01, -4.9175e-01, -1.2807e-01, -2.1139e-01],
          [-9.1966e-01,  3.7340e-01, -1.9342e-01,  1.1599e+00, -3.3098e-01],
          [ 8.7197e-02,  3.0634e-01,  9.9657e-01,  7.2647e-02, -3.2579e-01]]],


        [[[ 1.0044e-01,  3.2152e-03,  6.7274e-01, -1.8531e-02, -3.3423e-01],
          [ 3.2914e-01, -1.6734e-01,  2.8266e-01, -3.5214e-01, -1.3639e-01],
          [-6.3327e-02, -5.9950e-02,  5.0159e-01,  4.6701e-01, -5.4604e-01]]],


        [[[ 2.9903e-01,  1.5822e-01,  1.0173e+00,  6.6900e-04, -4.1335e-01],
          [-9.9289e-01,  8.8861e-02,  1.5373e+00,  1.1744e+00, -1.2438e+00],
          [-1.5787e+00, -3.5388e-01,  5.8180e-01,  1.0117e+00, -1.

In [7]:
# The @ operator is used for matrix multiplication.
# k.size(-1) is the size of the last axis of k, i.e. the per-head embedding length (C // n_head).
# The raw q @ k^T scores are multiplied by 1 / sqrt(k.size(-1)).
# `math` is imported in the import cell at the top of the notebook.
scale = 1.0 / math.sqrt(k.size(-1))
att = (q @ k.transpose(-2, -1)) * scale

k.size