<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>


# Chapter 3 Exercise solutions

# Exercise 3.1

In [2]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

d_in, d_out = 3, 2

In [3]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key   = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value
        
        attn_scores = queries @ keys.T # omega
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)

In [4]:
class SelfAttention_v2(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key   = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        
        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=1)

        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(123)
sa_v2 = SelfAttention_v2(d_in, d_out)

In [5]:
# sa_v1.W_query

In [6]:
# sa_v2.W_query.weight.T


In [7]:
# sa_v1.W_query = nn.Parameter(sa_v2.W_query.weight.T)

In [8]:
# sa_v1.W_query

In [42]:
class CausalSelfAttention(nn.Module):

    def __init__(self, d_in, d_out, dropout_rate=0.5):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key   = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)
        self.dropout_rate = dropout_rate

    def forward(self, x):

        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        
        attn_scores = queries @ keys.T
        # mask the unnormalized attention_scores
        attn_mask = torch.triu(torch.ones(attn_scores.shape), diagonal=1)
        attn_scores = attn_scores.masked_fill(attn_mask.bool(), -torch.inf)
        #dropout_mask = torch.nn.Dropout(self.dropout_rate)
        #attn_scores = dropout_mask(attn_scores)
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=1)

        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(123)
sa_v3 = CausalSelfAttention(d_in, d_out)

In [43]:
sa_v3.forward(inputs)

tensor([[-0.4519,  0.2216],
        [-0.5874,  0.0058],
        [-0.6300, -0.0632],
        [-0.5675, -0.0843],
        [-0.5526, -0.0981],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)

In [11]:
W_query = nn.Linear(d_in, d_out, bias=False)
W_key   = nn.Linear(d_in, d_out, bias=False)
keys = W_key(inputs)
queries = W_query(inputs)

attn_scores = queries @ keys.T

In [15]:
context_length = attn_scores.shape[1]
attn_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
attn_mask

tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])

In [16]:
torch.triu(torch.ones(context_length, context_length), diagonal=1)

tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])

In [17]:
context_length = attn_scores.shape[0]

In [18]:
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
print(masked)

tensor([[-0.2327,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.2396,  0.1015,    -inf,    -inf,    -inf,    -inf],
        [-0.2323,  0.1004,  0.1045,    -inf,    -inf,    -inf],
        [-0.1344,  0.0502,  0.0523,  0.0470,    -inf,    -inf],
        [-0.0349,  0.0520,  0.0538,  0.0331,  0.0708,    -inf],
        [-0.2142,  0.0650,  0.0679,  0.0668,  0.1004,  0.0395]],
       grad_fn=<MaskedFillBackward0>)


In [19]:
attn_mask

tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])

In [20]:
attn_scores

tensor([[-0.2327,  0.1055,  0.1098,  0.0913,  0.1549,  0.0521],
        [-0.2396,  0.1015,  0.1057,  0.0902,  0.1501,  0.0518],
        [-0.2323,  0.1004,  0.1045,  0.0885,  0.1481,  0.0507],
        [-0.1344,  0.0502,  0.0523,  0.0470,  0.0753,  0.0272],
        [-0.0349,  0.0520,  0.0538,  0.0331,  0.0708,  0.0174],
        [-0.2142,  0.0650,  0.0679,  0.0668,  0.1004,  0.0395]],
       grad_fn=<MmBackward0>)

In [21]:
attn_scores.masked_fill(attn_mask == 0, float('-inf'))

tensor([[  -inf, 0.1055, 0.1098, 0.0913, 0.1549, 0.0521],
        [  -inf,   -inf, 0.1057, 0.0902, 0.1501, 0.0518],
        [  -inf,   -inf,   -inf, 0.0885, 0.1481, 0.0507],
        [  -inf,   -inf,   -inf,   -inf, 0.0753, 0.0272],
        [  -inf,   -inf,   -inf,   -inf,   -inf, 0.0174],
        [  -inf,   -inf,   -inf,   -inf,   -inf,   -inf]],
       grad_fn=<MaskedFillBackward0>)

In [33]:
torch.manual_seed(123)
dropout = torch.nn.Dropout(0.8)
example = torch.ones(6, 6)
print(dropout(example))

tensor([[0., 0., 0., 5., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [5., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])


In [35]:
print(sum(sum(dropout(example))))

tensor(30.)


In [44]:
class CausalAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length,
                dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
           'mask',
           torch.triu(torch.ones(context_length, context_length),
           diagonal=1)
        )
 
    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
 
        attn_scores = queries @ keys.transpose(1, 2)
        attn_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf) 
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )
        attn_weights = self.dropout(attn_weights)
 
        context_vec = attn_weights @ values
        return context_vec

# Exercise 3.2

In [45]:
batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape)

torch.Size([2, 6, 3])


In [127]:
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])

In [46]:
batch

tensor([[[0.4300, 0.1500, 0.8900],
         [0.5500, 0.8700, 0.6600],
         [0.5700, 0.8500, 0.6400],
         [0.2200, 0.5800, 0.3300],
         [0.7700, 0.2500, 0.1000],
         [0.0500, 0.8000, 0.5500]],

        [[0.4300, 0.1500, 0.8900],
         [0.5500, 0.8700, 0.6600],
         [0.5700, 0.8500, 0.6400],
         [0.2200, 0.5800, 0.3300],
         [0.7700, 0.2500, 0.1000],
         [0.0500, 0.8000, 0.5500]]])

In [47]:
torch.manual_seed(123)
context_length = batch.shape[1]
ca = CausalAttention(d_in, d_out, context_length, 0.0)
context_vecs = ca(batch)
print("context_vecs.shape:", context_vecs.shape)

context_vecs.shape: torch.Size([2, 6, 2])


In [48]:
class MultiHeadAttentionWrapper(nn.Module):
    def __init__(self, d_in, d_out, context_length,
                 dropout, num_heads, qkv_bias=False):
        super().__init__()
        self.heads = nn.ModuleList(
            [CausalAttention(d_in, d_out, context_length, dropout, qkv_bias) 
             for _ in range(num_heads)]
        )
 
    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)

In [161]:
torch.manual_seed(123)
context_length = batch.shape[1] # This is the number of tokens
d_in, d_out = 3, 4
mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.0, num_heads=1)
context_vecs_wrapper = mha(batch)
 
print(context_vecs_wrapper)
print("context_vecs.shape:", context_vecs_wrapper.shape)

tensor([[[-0.3132, -0.2272,  0.4772,  0.1063],
         [-0.2320,  0.0293,  0.5789,  0.3056],
         [-0.2068,  0.1162,  0.6118,  0.3695],
         [-0.1635,  0.1328,  0.5457,  0.3531],
         [-0.1687,  0.1813,  0.5315,  0.3400],
         [-0.1411,  0.1727,  0.5063,  0.3432]],

        [[-0.3132, -0.2272,  0.4772,  0.1063],
         [-0.2320,  0.0293,  0.5789,  0.3056],
         [-0.2068,  0.1162,  0.6118,  0.3695],
         [-0.1635,  0.1328,  0.5457,  0.3531],
         [-0.1687,  0.1813,  0.5315,  0.3400],
         [-0.1411,  0.1727,  0.5063,  0.3432]]], grad_fn=<CatBackward0>)
context_vecs.shape: torch.Size([2, 6, 4])


In [154]:
batch.shape

torch.Size([2, 6, 3])

If we want to have an output dimension of 2, as earlier in single-head attention, we can have to change the projection dimension `d_out` to 1:

```python
torch.manual_seed(123)

d_out = 1
mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.0, num_heads=2)

context_vecs = mha(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)
```

In [259]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, 
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
 
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            'mask',
             torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )
 
    def forward(self, x):
        num_inputs, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
 
        keys = keys.view(num_inputs, num_tokens, num_heads, head_dim) # (num_inputs x num_tokens x num_heads x head_dim)
        queries = queries.view(num_inputs, num_tokens, num_heads, head_dim) # (num_inputs x num_tokens x num_heads x head_dim)
        values = values.view(num_inputs, num_tokens, num_heads, head_dim) # (num_inputs x num_tokens x num_heads x head_dim)


        # transpose inputs
        # (num_inputs, num_tokens, num_heads, head_dim) -> (num_inputs, num_heads, num_tokens, head_dim)
        keys= keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # compute attention scores
        # (num_inputs x num_heads x num_tokens x num_tokens)
        attn_scores = queries @ keys.transpose(2, 3) 

        # apply mask
        # (num_inputs x num_heads x num_tokens x num_tokens)
        self.mask.bool()[:num_tokens, :num_tokens]
        masked = attn_scores.masked_fill(self.mask.bool(), -torch.inf)
        

        # apply softmax (on last dimension)
        # (num_inputs x num_heads x num_tokens x num_tokens)
        attn_weights = torch.softmax(masked / keys.shape[1] ** 0.5, dim=-1)

        # apply random dropout
        # (num_inputs x num_heads x num_tokens x num_tokens)
        attn_weights = self.dropout(attn_weights)

        # generate contet vector
        # values already transposed
        context_vec = (attn_weights @ values).transpose(1,2) # (num_inputs x num_tokens * num_heads x head_dim)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out) # (num_inputs, num_tokens, d_out)
        context_vec = self.out_proj(context_vec) # (num_inputs, num_tokens, d_out)
        return context_vec

torch.Size([2, 6, 2, 1])

In [200]:
#batch.shape = 

batch = batch[1:,:,:]

In [201]:
batch.shape

torch.Size([1, 6, 3])

In [223]:
x = batch
num_tokens = batch.shape[1] # 6
d_in = batch.shape[2] #. 3
d_out = 8
num_heads=2
b, num_tokens, d_in = batch.shape
W_query = nn.Linear(d_in, d_out, bias=False)
W_key = nn.Linear(d_in, d_out, bias=False)
W_value = nn.Linear(d_in, d_out, bias=False)
head_dim = d_out // num_heads
keys= W_key(x)
queries = W_query(x)
values = W_value(x)
num_inputs = b # 2


In [221]:
print(f"d_in: {d_in}")
print(f"d_out: {d_out}")
print(f"num_inputs: {num_inputs}")
print(f"num_heads : {num_heads}")
print(f"dim_head : {d_out // num_heads}")
print(f"num_tokens : {num_tokens}")

d_in: 3
d_out: 8
num_inputs: 1
num_heads : 2
dim_head : 4
num_tokens : 6


In [260]:
mha = MultiHeadAttention(d_in, d_out, context_length=num_tokens, dropout=0.5, num_heads=num_heads, qkv_bias=False)


tensor([[[-0.3499,  0.2475,  0.7858, -0.1963,  0.1205,  0.1612,  0.5282,
           0.0608],
         [-0.0854,  0.3290,  0.4207, -0.0790,  0.0129,  0.1088,  0.4140,
          -0.0376],
         [-0.3457,  0.4110,  0.9467, -0.1131, -0.0625,  0.3940,  0.5060,
           0.2879],
         [-0.0151,  0.4282,  0.7362, -0.0475, -0.1657,  0.4405,  0.3618,
           0.5702],
         [-0.1075,  0.3970,  0.7199, -0.0820, -0.1167,  0.3618,  0.4300,
           0.3607],
         [-0.0122,  0.3797,  0.4348, -0.0277, -0.0550,  0.1627,  0.3461,
           0.1996]]], grad_fn=<ViewBackward0>)

In [262]:
mha(batch).shape # num_inputs * num_tokens * d_out

torch.Size([1, 6, 8])

In [224]:

keys = keys.view(num_inputs, num_tokens, num_heads, head_dim) # (num_inputs x num_tokens x num_heads x head_dim)
queries = queries.view(num_inputs, num_tokens, num_heads, head_dim) # (num_inputs x num_tokens x num_heads x head_dim)
values = values.view(num_inputs, num_tokens, num_heads, head_dim) # (num_inputs x num_tokens x num_heads x head_dim)


# transpose inputs
# (num_inputs, num_tokens, num_heads, head_dim) -> (num_inputs, num_heads, num_tokens, head_dim)
keys= keys.transpose(1, 2)
queries = queries.transpose(1, 2)
values = values.transpose(1, 2)

# compute attention scores
# (num_inputs x num_heads x num_tokens x num_tokens)
attn_scores = queries @ keys.transpose(2, 3) 

# apply mask
# (num_inputs x num_heads x num_tokens x num_tokens)
#self.mask.bool()[:num_tokens, :num_tokens]
#masked = attn_scores.masked_fill(self.mask.bool(), -torch.inf)


# apply softmax (on last dimension)
# (num_inputs x num_heads x num_tokens x num_tokens)
attn_weights = torch.softmax(masked / keys.shape[1] ** 0.5, dim=-1)

# apply random dropout
# (num_inputs x num_heads x num_tokens x num_tokens)
# attn_weights = Dropout(attn_weights)

# generate contet vector
# values already transposed
context_vec = (attn_weights @ values).transpose(1,2 ) # (num_inputs x num_tokens * num_heads x head_dim)
context_vec = context_vec.contiguous().view(num_inputs, num_tokens, d_out) # num_inputs * num_tokens * d_out


In [225]:
# matrix per input, head of num_tokensxnum_tokens
attn_scores.shape # num_inputs * num_heads * num_tokens * num_tokens 

torch.Size([1, 2, 6, 6])

In [244]:
# Define the matrix with arbitrary integer values
data = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]
matrix = torch.tensor(data, dtype=torch.int32)
data2 = [
    [1, 2],
    [1, 2],
    [1, 2]
] 
matrix2 = torch.tensor(data2, dtype=torch.int32)
matrix @ matrix2


tensor([[ 6, 12],
        [15, 30],
        [24, 48]], dtype=torch.int32)

In [246]:
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
print(masked.shape)
attn_weights = torch.softmax(masked / keys.shape[1] ** 0.5, dim=-1)


torch.Size([1, 2, 6, 6])


In [247]:
attn_weights.shape # (num_inputs * num_heads * num_tokens * num_tokens)

torch.Size([1, 2, 6, 6])

In [248]:
values.shape # values (after transpose) is num_inputs * num_heads * num_tokens * dim_head

torch.Size([1, 2, 6, 4])

In [250]:
context_vec = (attn_weights @ values).transpose(1, 2)

In [254]:
context_vec.shape # num_inputs * num_tokens * d_out

torch.Size([1, 6, 8])

In [237]:
context_vec

tensor([[[[-0.1257,  0.6342, -0.6926,  0.2984],
          [-0.1871, -0.0844, -0.3135, -0.1658]],

         [[-0.0660,  0.6053, -0.6154,  0.2087],
          [-0.3221, -0.2609, -0.3324, -0.0850]],

         [[-0.0417,  0.5938, -0.5909,  0.1743],
          [-0.3644, -0.3148, -0.3316, -0.0635]],

         [[-0.0360,  0.5115, -0.4934,  0.1453],
          [-0.3362, -0.3028, -0.3050, -0.0335]],

         [[ 0.0157,  0.4784, -0.4930,  0.0895],
          [-0.3234, -0.2758, -0.2138, -0.0773]],

         [[-0.0085,  0.4477, -0.4341,  0.1015],
          [-0.3193, -0.2886, -0.2518, -0.0360]]]],
       grad_fn=<TransposeBackward1>)

In [238]:
context_vec.shape # num_inputs * num_heads *

torch.Size([1, 6, 2, 4])

In [191]:
context_vec.transpose(1, 2).shape

torch.Size([2, 6, 1, 4])

In [172]:
attn_scores = queriesv @ keysv.transpose(2, 3)

In [173]:
attn_scores.shape # (num_inputs x num_heads x num_tokens x num_tokens)

torch.Size([2, 1, 6, 6])

torch.Size([2, 1, 6, 6])


In [183]:
keys.shape

torch.Size([2, 6, 1, 4])

RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [12, 6] but got: [12, 1].

# Exercise 3.3

```python
context_length = 1024
d_in, d_out = 768, 768
num_heads = 12

mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads)
```

In [None]:
batch 

In [265]:
context_length = 1024
d_in, d_out = 768, 768
num_heads = 12

mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads)
mha(batch)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (6x3 and 768x768)

In [66]:
inputs.shape

torch.Size([6, 3])

Optionally, the number of parameters is as follows:

```
2360064  # (2.36 M)
```