# Terminologies

## Encoder
- Maps discrete numeric tokens to continuous dense representations that encode semantic information.

## Decoder
- Produces an output sequence from the input/dense representation, generating one token at a time.

## Attention 
- Dynamically adjusts each token's representation by adding up the weighted vectors of the tokens present in its vicinity.

## Multi-Head Attention
- Works similarly to having multiple convolution kernels each learning different semantics at the feature-level.

# Self-Attention Impl.

In [None]:
# He went to the bank to deposit money
# He went to the bank of the river

In [3]:
import torch
# Seed the global RNG so the random embeddings and weight matrices created below are reproducible.
# (The trailing semicolon keeps the notebook from echoing the returned generator object.)
torch.manual_seed(123);

In [4]:
# Toy corpus; the vocabulary is every distinct whitespace-separated word
# (periods stripped), numbered in sorted order.
corpus = "Saarbrücken is situated in Saarland. It has a University called Saarland University."
dictionary = dict(
    (word, index)
    for index, word in enumerate(sorted(set(corpus.replace('.', '').split())))
)
print(dictionary)

{'It': 0, 'Saarbrücken': 1, 'Saarland': 2, 'University': 3, 'a': 4, 'called': 5, 'has': 6, 'in': 7, 'is': 8, 'situated': 9}


In [None]:
# special tokens a full vocabulary would usually also contain: <unk>, <pad>, <s>, </s>

In [142]:
# vocabulary size: number of distinct tokens in `dictionary`
len(dictionary)

10

In [5]:
# tokenized sequence: look every word of the sentence up in the vocabulary
sequence = 'Saarland University is in Saarbrücken.'
tokenized_sequence = torch.tensor(
    list(map(dictionary.__getitem__, sequence.replace('.', '').split()))
)
tokenized_sequence
# q -> Saarland          (the token currently being attended from)
# K -> [Saarland, ...]   (every token of the sequence, including itself)

tensor([2, 3, 8, 7, 1])

In [None]:
#word2vec
# Saarland -> [0.1, 0.2, 0.22, 0.44, 0.1] = v1

# attention
# Saarland -> 0.5 * v1 + 0.2 *v2 ... + 0.08 * vn

# he went to the bank of the river
# To the bank of the river he went

In [None]:
# attention
# v1 = [....] -> Saarland
## Queries= W_q*v1, Keys=W_k*v1, Values=W_v*v1

In [7]:
# attention hyperparams
d = 16    # embedding dimension
# queries and keys are compared with a dot product, so d_q must equal d_k;
# the value dimension d_v can be chosen independently
d_q = 24
d_k = 24
d_v = 28

In [8]:
# One trainable d-dimensional vector per vocabulary entry; indexing the table
# with the token ids yields a [n_seq, d] matrix.
embeddings = torch.nn.Embedding(num_embeddings=len(dictionary), embedding_dim=d)
sequence_embeddings = embeddings(tokenized_sequence)
sequence_embeddings.size()

torch.Size([5, 16])

In [149]:
# Randomly initialised projection matrices, created in the order
# query [d_q, d], key [d_k, d], value [d_v, d].
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(rows, d)) for rows in (d_q, d_k, d_v)
)

## Computing the Unnormalized Attention Weights

<img height=500 width=300 src='https://sebastianraschka.com/images/blog/2023/self-attention-from-scratch/query.png'>

In [63]:
# projection matrix is [d_q, d]; the embedded sequence is [n_seq, d]
W_query.shape, sequence_embeddings.shape

(torch.Size([24, 16]), torch.Size([5, 16]))

In [98]:
# Project every token embedding with each weight matrix. The product is
# computed feature-major ([d_*, d] x [d, n_seq]) and then transposed back so
# that each row belongs to one token.
queries = (W_query @ sequence_embeddings.t()).t()   # [n_seq, d_q]
keys = (W_key @ sequence_embeddings.t()).t()        # [n_seq, d_k]
values = (W_value @ sequence_embeddings.t()).t()    # [n_seq, d_v]

## Normalized Attention Scores

Here, scaling the scores by `1/sqrt(d_k)` keeps the Euclidean length of the weight vectors at approximately the same magnitude regardless of `d_k`, which prevents the softmax from saturating when the dot products grow large.

In [None]:
# Saarland is good
#q * K.T/sqrt(d_k) [1, n_seq] -> [0.5, 0.4, 0.1]
# [0.9, 0.1, 0.0]

In [159]:
# raw scores: [n_seq, d_q] x [d_k, n_seq] -> [n_seq, n_seq]
kq = queries @ keys.t()
# scale by sqrt(d_k), then turn every row into a probability distribution
kq_norm = kq / torch.tensor(d_k).sqrt()
kq_norm_softmax = torch.softmax(kq_norm, dim=1)

assert kq_norm_softmax.shape == (len(tokenized_sequence),) * 2, "unequal dimensions"

In [166]:
## attended weights
# (V^T x A^T)^T is the attention-weighted sum of the value vectors, one row
# per token: [n_seq, d_v]
attention_res = torch.matmul(values.t(), kq_norm_softmax.t()).t()

## Multi-headed Attention

In [1]:
# Typical Transformer sizes: the model width is split evenly across the heads,
# so every head works on d_model // n_heads features.
d_model = 512
n_heads = 8
d_model // n_heads

64

In [12]:
n_heads = 3
# extend the input to work with all heads: one [d, n_seq] copy per head
stacked_inputs = sequence_embeddings.t().repeat(n_heads, 1, 1)

# one projection matrix per head, created in the order query, key, value
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(n_heads, rows, d)) for rows in (d_q, d_k, d_v)
)

In [13]:
# per-head weights are [n_heads, d_q, d]; the stacked inputs are [n_heads, d, n_seq]
W_query.shape, stacked_inputs.shape

(torch.Size([3, 24, 16]), torch.Size([3, 16, 5]))

In [14]:


# Per-head projections of the stacked inputs.
# Queries come from W_query and keys from W_key (the two were swapped before,
# which silently transposed the score matrix).
# queries: [n_heads, n_seq, d_q]
multihead_queries = torch.bmm(W_query, stacked_inputs).permute(0, 2, 1)
# keys stay feature-major, [n_heads, d_k, n_seq], i.e. already transposed for
# the queries x keys score product
multihead_keys = torch.bmm(W_key, stacked_inputs)
# values: [n_heads, n_seq, d_v]
multihead_values = torch.bmm(W_value, stacked_inputs).permute(0, 2, 1)

In [15]:
# queries are token-major and keys feature-major, so they can be multiplied directly with bmm
multihead_queries.shape, multihead_keys.shape

(torch.Size([3, 5, 24]), torch.Size([3, 24, 5]))

In [16]:
# Scaled dot-product attention, computed for all heads at once.
# scores: [n_heads, n_seq, d_q] x [n_heads, d_k, n_seq] -> [n_heads, n_seq, n_seq]
kq_multihead = multihead_queries.bmm(multihead_keys)
kq_multihead_norm = kq_multihead/torch.sqrt(torch.tensor(d_k))
# normalise over the key axis so each query's weights sum to 1
# (this step was missing; the raw scaled scores were used as weights)
kq_multihead_norm_softmax = torch.nn.functional.softmax(kq_multihead_norm, dim=-1)
# weighted sum of the values: [n_heads, n_seq, d_v]
attention_result_multihead = kq_multihead_norm_softmax.bmm(multihead_values)



In [None]:
# concatenating the heads: [n_heads=8, n_seq=5, d_head=64] -> [n_seq=5, n_heads * d_head=512]

In [18]:
# one [n_seq, d_v] attention output per head
attention_result_multihead.shape

torch.Size([3, 5, 28])

In [None]:
attention_concat = attention_result_multihead.reshape(n_heads* len(tokenized_sequence), d_v)

In [201]:
attention_concat.shape

torch.Size([15, 28])

In [197]:
linear = torch.nn.Linear(28, 64)

In [202]:
linear(attention_concat).shape

torch.Size([15, 64])

## Cross-Attention

In [None]:
# "Saarland has Saarbrücken." -> memory; Softmax({Saarland, Saarbrücken}) ->

In [19]:
# now we have another sentence for performing cross attention
# NOTE: a decoder would normally start from '<s>', but that token is not in
# `dictionary`, so an in-vocabulary sentence is used instead. The previous code
# assigned sequence2 and then tokenized `sequence` again by mistake.
sequence2 = 'Saarbrücken is situated in Saarland.'
tokenized_sequence2 = torch.tensor([dictionary[token] for token in sequence2.replace('.','').split()])
sequence_embeddings2 = embeddings(tokenized_sequence2)

In [208]:
# extend the input to work with all heads: [n_heads, d, n_seq2]
stacked_inputs2 = sequence_embeddings2.T.repeat(n_heads, 1, 1)

# cross-attention queries come from the second sequence and must be projected
# with the query weights (W_key was used here before): [n_heads, n_seq2, d_q]
multihead_queries2 = torch.bmm(W_query, stacked_inputs2).permute(0, 2, 1)

In [210]:
kq_multihead2 = multihead_queries2.bmm(multihead_keys)
kq_multihead_norm2 = kq_multihead2/torch.sqrt(torch.tensor(d_k))
attention_result_multihead2 = kq_multihead_norm2.bmm(multihead_values)

attention_concat2 = attention_result_multihead2.reshape(n_heads* len(tokenized_sequence2), d_v)
attention_concat2.shape

torch.Size([15, 28])

In [211]:
linear = torch.nn.Linear(28, 64)

In [212]:
linear(attention_concat2).shape

torch.Size([15, 64])