In [86]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# E1 - Linear layers

A **linear layer** (or **fully connected layer**) in a neural network is a layer where each input neuron is connected to each output neuron with a certain weight. This layer performs a linear transformation on the input vector.

Given an input vector $\mathbf{x} \in \mathbb{R}^n$, a weight matrix $\mathbf{W} \in \mathbb{R}^{m \times n}$, and a bias vector $\mathbf{b} \in \mathbb{R}^m$, the output $\mathbf{y} \in \mathbb{R}^m$ of a linear layer is computed as:

$$
\mathbf{y} = \mathbf{W} \mathbf{x} + \mathbf{b}
$$

- **Input Vector** $\mathbf{x}$: $\begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{bmatrix}$
- **Weight Matrix** $\mathbf{W}$: $\begin{bmatrix} w_{11} & w_{12} & \cdots & w_{1n} \\ w_{21} & w_{22} & \cdots & w_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m1} & w_{m2} & \cdots & w_{mn} \end{bmatrix}$
- **Bias Vector** $\mathbf{b}$: $\begin{bmatrix} b_1 \\ b_2 \\ \vdots \\ b_m \end{bmatrix}$
- **Output Vector** $\mathbf{y}$: $\begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_m \end{bmatrix}$

![q, k, v illustration](/Users/soeren/code/gpt2/images/q_k_v.png)

If we create the following:

`nn.Linear(3, 4)`

We get a weight matrix with one row per output neuron $y_i$ and one column per input feature $x_j$, i.e. $\mathbf{W} \in \mathbb{R}^{4 \times 3}$:

- **Input Vector** $\mathbf{x}$: $\begin{bmatrix} x_1 \\ x_2 \\ x_3 \end{bmatrix}$
- **Weight Matrix** $\mathbf{W} = \begin{bmatrix}
w_{11} & w_{12} & w_{13} \\
w_{21} & w_{22} & w_{23} \\
w_{31} & w_{32} & w_{33} \\
w_{41} & w_{42} & w_{43}
\end{bmatrix}$
- **Bias Vector** $\mathbf{b}$: $\begin{bmatrix} b_1 \\ b_2 \\ b_3 \\ b_4 \end{bmatrix}$
- **Output Vector** $\mathbf{y}$: $\begin{bmatrix} y_1 \\ y_2 \\ y_3 \\ y_4 \end{bmatrix}$



In [87]:
class MyLinearLayer(nn.Module):
    """Minimal module that wraps a single fully connected layer.

    Args:
        input_size: Number of features in each input vector.
        output_size: Number of features in each output vector.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Stored under the attribute name `linear`, so its parameters show up
        # in the state dict with the prefix `linear.` (e.g. `linear.weight`).
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Pass `x` through the wrapped linear layer and return the result."""
        projected = self.linear(x)
        return projected

# Layer sizes: each input vector has 3 features and is mapped to 9 outputs.
input_size = 3
output_size = 9

# A single random example of shape (1, input_size).
x = torch.randn((1, input_size))

model = MyLinearLayer(input_size, output_size)
output = model(x)

# The state dict maps parameter names to the tensors that hold them.
state_dict = model.state_dict()

# List the names of the parameters the model holds.
print(state_dict.keys())

# Show the weight matrix of the wrapped linear layer.
print(state_dict["linear.weight"])

# The printed weight matrix has exactly the dimensions we expect from the math
# above: one row per output neuron and one column per input feature, i.e.
# (output_size, input_size) = (9, 3).

AttributeError: 'MyLinearLayer' object has no attribute 'weight'

# E2 - Splitting

In [88]:
# Dimensions of the toy example.
batch_size = 4           # number of examples in the batch
sequence_length = 3      # tokens per example, e.g. "I love dogs" -> "I", "love", "dogs"
embedding_dimension = 5  # size of the embedding vector of each token

# Random values standing in for a batch of embedded sequences.
rand_tensor = torch.randn((batch_size, sequence_length, embedding_dimension))
print("\n\nRand tensor \n\n", rand_tensor)

B, T, C = rand_tensor.shape

# A single linear layer whose output is three times as wide as the embedding,
# so that its result can be cut into three equally sized parts afterwards.
lin_layer = nn.Linear(embedding_dimension, 3 * embedding_dimension)
qkv = lin_layer(rand_tensor)
print("\n\nWeights linear layer \n\n", lin_layer.weight)

print("\n\nqkv tensor \n\n", qkv)
# Cut the last axis (dim=2) into chunks of width `embedding_dimension`.
q, k, v = torch.split(qkv, embedding_dimension, dim=2)



Rand tensor 

 tensor([[[-1.1030e+00, -1.5641e+00, -2.4768e-01, -3.3742e-02,  2.4014e+00],
         [-9.6585e-01, -1.5485e+00, -1.6011e+00,  2.0549e-01, -3.6359e-01],
         [ 6.9281e-01, -1.1421e+00, -1.4178e+00, -1.3838e+00,  1.5762e+00]],

        [[-2.8017e-01,  5.7541e-01, -2.3022e-01,  8.5519e-01,  2.9211e-02],
         [-1.0060e+00, -7.4851e-02,  1.3897e+00,  9.9862e-01, -1.9336e-02],
         [ 1.9113e+00, -5.3601e-01, -8.4473e-01, -2.5183e-01, -2.7436e-01]],

        [[ 7.8483e-02, -5.7096e-01, -2.8123e-01,  2.7012e-01, -8.8670e-01],
         [ 4.0694e-04, -2.6898e-01,  1.0925e+00,  2.1568e-01,  3.5610e-02],
         [ 7.5496e-01,  4.7470e-01, -1.0231e-01,  6.8636e-02,  9.7441e-01]],

        [[ 4.6341e-01, -8.9418e-02, -2.9654e-01, -7.6041e-02, -5.3760e-01],
         [-4.5800e-01,  4.5955e-01,  8.8806e-01, -1.4692e+00, -1.0137e+00],
         [ 8.3687e-01, -9.5285e-01, -1.0146e+00, -2.6947e-01,  1.1127e+00]]])


Weights linear layer 

 Parameter containing:
tensor([[-0.338

In [91]:
head = 1  # number of attention heads

# Shape legend:
#   B    = batch size
#   T    = sequence length
#   head = number of heads
#   C    = embedding dimension
#
# The embedding axis of k is divided into `head` chunks of size C // head.
# This is a common step in multi-head attention, where the input is projected
# into multiple subspaces corresponding to different heads.
assert C % head == 0, "embedding dimension C must be divisible by the number of heads"

# Take the key slice straight from qkv instead of reshaping `k` in place, so
# that re-running this cell always starts from the original (B, T, C) tensor
# rather than from an already transposed result of a previous run.
k_flat = qkv.split(embedding_dimension, dim=2)[1]

# (B, T, C) -> (B, T, head, C // head) -> (B, head, T, C // head).
# Dimensions are indexed from zero, so transpose(1, 2) swaps the T and head axes.
k = k_flat.view(B, T, head, C // head).transpose(1, 2)
k

tensor([[[[-0.2475, -0.2184, -0.0242,  1.0634, -0.3952],
          [-0.0235, -0.2712, -0.0025,  0.3450, -1.0928],
          [-0.4788, -0.2057, -0.3942,  1.1117, -0.2122]]],


        [[[-0.2046, -0.3435,  0.2523, -0.0026, -0.1321],
          [-0.2916,  0.2952,  0.8161, -0.3996,  0.1911],
          [-0.6116,  0.1143, -0.4544,  0.1183, -0.3818]]],


        [[[-0.3192,  0.1676,  0.2610, -0.2489, -0.3608],
          [-0.4922,  0.4425,  0.5608, -0.2528,  0.3034],
          [-0.4797, -0.2091, -0.0386,  0.3662,  0.2024]]],


        [[[-0.4019,  0.0911,  0.2222, -0.1092, -0.0548],
          [-0.4735,  0.6095,  1.3262, -0.4480,  1.1590],
          [-0.4522, -0.1879, -0.4716,  0.7236, -0.4857]]]],
       grad_fn=<TransposeBackward0>)

# E3 - Mask

In [102]:
block_size = 10  # Example block size

# Square matrix filled with ones.
ones_matrix = torch.ones((block_size, block_size))
# Keep the main diagonal and everything below it; zero out the rest.
lower_triangular_matrix = ones_matrix.tril()
# Prepend two singleton axes: (block_size, block_size) -> (1, 1, block_size, block_size).
reshaped_tensor = lower_triangular_matrix.view(1, 1, *lower_triangular_matrix.shape)

print(f"""{ones_matrix}, \n\n {lower_triangular_matrix} \n\n {reshaped_tensor}""")





tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]), 

 tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 

In [94]:
# Show the lower-triangular matrix on its own; as the cell's last expression it is rendered as the output.
lower_triangular_matrix

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])