### Setup

In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math

In [4]:
# Hyperparameters 
N_ENCODER_LAYERS = 6
N_DECODER_LAYERS = 6
D_MODEL = 512
D_QUERY_KEY = 256
D_VALUE = 256
D_EMBED = 256

# Other settings 
BATCH_DIMENSION = 0 # number of sentences
SEQUENCE_DIMENSION = 1 # number of words per sentence
FEATURE_DIMENSION = 2 # word vector dimensionality

def get_device():
    """Pick the torch device to run on: CUDA when it is available, otherwise the CPU."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)


Index into the batch dimension to get a given sentence. 
The batch dimension is so named because taking it as a whole gives the entire batch, which is made up of sentences.

Index into the sequence dimension to get a given word vector 
Index into the feature dimension to get a given number in the word vector

### Tokenize

In [5]:
batch_one_sentences = [
    "hello there, you are funny",
    "why is my watter bottle brown?",
    "who pooped on the floor!?",
    "peaches, peaches, peaches, peaches"
]

batch_two_senteces = [
    "why so serious?",
    "darkness is my ally",
    "who pooped on the floor!?"
]

batches = [batch_one_sentences, batch_two_senteces]

In [6]:
# Build the vocabulary: every distinct whitespace-separated token receives the
# next free integer id, in order of first appearance across all batches.
# NOTE(review): the first token seen gets id 0, which is also the value used for
# padding in the padding cell, so a `!= 0` mask treats that word as padding.
word_to_index = {}
for sentence_batch in batches:
    for sentence in sentence_batch:
        for word in sentence.split():
            # len() is evaluated before the insert, so a new word gets the next id
            # and an already-known word keeps the id it was given first.
            word_to_index.setdefault(word, len(word_to_index))

# Same nesting as `batches` (batch -> sentence -> token), with ids in place of words.
tokenized_sentences = []
for sentence_batch in batches:
    tokenized_sentences.append(
        [[word_to_index[word] for word in sentence.split()] for sentence in sentence_batch]
    )

print(tokenized_sentences)

[[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9, 10], [11, 12, 13, 14, 15], [16, 16, 16, 17]], [[5, 18, 19], [20, 6, 7, 21], [11, 12, 13, 14, 15]]]


### Data Prep (padding)

In [7]:
print([[1]] + [[0]] * 5)

[[1], [0], [0], [0], [0], [0]]


In [8]:
# Pad every sentence with trailing 0s up to the longest sentence of its own batch,
# so that each batch is rectangular and can be turned into a tensor.
# Batches are padded independently: they keep their own sentence count and their
# own width (the longest sentence in that batch).
padded_tokenized_sentences = []
for batch_tokens in tokenized_sentences:
    # default=0 keeps an empty batch from raising; it simply stays empty.
    max_len = max((len(sentence) for sentence in batch_tokens), default=0)
    padded_tokenized_sentences.append(
        [sentence + [0] * (max_len - len(sentence)) for sentence in batch_tokens]
    )

In [9]:
padded_tokenized_sentences

[[[0, 1, 2, 3, 4, 0],
  [5, 6, 7, 8, 9, 10],
  [11, 12, 13, 14, 15, 0],
  [16, 16, 16, 17, 0, 0]],
 [[5, 18, 19, 0, 0], [20, 6, 7, 21, 0], [11, 12, 13, 14, 15]]]

In [49]:
torch.tensor(padded_tokenized_sentences[0]).shape

torch.Size([4, 6])

In [50]:
(torch.tensor(padded_tokenized_sentences[0]) != 0).unsqueeze(-2).shape

torch.Size([4, 1, 6])

In [10]:
(torch.tensor(padded_tokenized_sentences[0]) != 0).unsqueeze(-2)

tensor([[[False,  True,  True,  True,  True, False]],

        [[ True,  True,  True,  True,  True,  True]],

        [[ True,  True,  True,  True,  True, False]],

        [[ True,  True,  True,  True, False, False]]])

In [11]:
torch_training_batches = [torch.tensor(batch, dtype=torch.long) for batch in padded_tokenized_sentences]
torch_training_batches

[tensor([[ 0,  1,  2,  3,  4,  0],
         [ 5,  6,  7,  8,  9, 10],
         [11, 12, 13, 14, 15,  0],
         [16, 16, 16, 17,  0,  0]]),
 tensor([[ 5, 18, 19,  0,  0],
         [20,  6,  7, 21,  0],
         [11, 12, 13, 14, 15]])]

### Embedding

In [12]:
#Map inputs to embeddings


vocab_size = len(word_to_index)
embedding_dim = 10

#matrix of size vocab_size x embedding_dim is initialized with random weights
#In training, the embeddings are adjusted with the rest of the parameters in the model
embed = nn.Embedding(vocab_size, embedding_dim)

# Always embed from the padded token ids rather than overwriting the previous
# contents of torch_training_batches in place: the cell can then be re-run safely
# (embedding an already-embedded float tensor would raise).
torch_training_batches = [
    embed(torch.tensor(batch_tokens, dtype=torch.long))
    for batch_tokens in padded_tokenized_sentences
]

print(torch_training_batches[0].shape)

torch.Size([4, 6, 10])


In [13]:
embed

Embedding(22, 10)

### practice working with fake training data

In [14]:
# give me the embedding for second word in the second sentence of the second batch
torch_training_batches[1][1][1]

tensor([-1.3601, -0.5811, -0.7539, -0.1906,  0.7925,  0.3993,  1.7720, -0.3877,
         0.0407,  1.6659], grad_fn=<SelectBackward0>)

In [15]:
# shape of the first batch (index 0) after embedding
torch_training_batches[0].shape

torch.Size([4, 6, 10])

### Positional Encoding

In [16]:
#Create the positional Encoding Layer 

### Attention

#### Figuring out matrix multiplication dimensions with dummy data

In [17]:
# Toy sizes used to check the matmul/broadcast shapes by hand.
n_heads = 2
n_sentences = 2
n_words = 3
d_embed = 4
d_q_k = 2
d_v = 3


# Dummy input of shape (2, 3, 4), i.e. (n_sentences, n_words, d_embed).
batch = torch.tensor([[[1, 1, 1, 1],
                        [2, 2, 2, 2],
                        [3, 3, 3, 3]],
                        [[0, 0, 0, 0],
                         [-1, -1, -1, -1],
                         [-2, -2, -2, -2]]], dtype=torch.float32)


# One weight matrix per head, shape (4, 7): d_embed rows and
# d_q_k + d_q_k + d_v columns, so a single matmul yields the query, key and value
# columns side by side (a later cell separates them with torch.split).
head_1_q_k_v_weights = torch.tensor([
    [-1, -1, -2, -2, -3, -3, -3],
    [-1, -1, -2, -2, -3, -3, -3],
    [-1, -1, -2, -2, -3, -3, -3],
    [-1, -1, -2, -2, -3, -3, -3]
], dtype=torch.float32)

head_2_q_k_v_weights = torch.tensor([
    [1, 1, 2, 2, 3, 3, 3],
    [1, 1, 2, 2, 3, 3, 3],
    [1, 1, 2, 2, 3, 3, 3],
    [1, 1, 2, 2, 3, 3, 3]
], dtype=torch.float32)

# Project the batch with each head separately, then stack the two results along
# a new leading (head) dimension; this is the reference for the batched version.
head_1_q_k_v = torch.matmul(batch, head_1_q_k_v_weights)
head_2_q_k_v = torch.matmul(batch, head_2_q_k_v_weights)
manual_result = torch.stack([head_1_q_k_v, head_2_q_k_v])
manual_result

tensor([[[[ -4.,  -4.,  -8.,  -8., -12., -12., -12.],
          [ -8.,  -8., -16., -16., -24., -24., -24.],
          [-12., -12., -24., -24., -36., -36., -36.]],

         [[  0.,   0.,   0.,   0.,   0.,   0.,   0.],
          [  4.,   4.,   8.,   8.,  12.,  12.,  12.],
          [  8.,   8.,  16.,  16.,  24.,  24.,  24.]]],


        [[[  4.,   4.,   8.,   8.,  12.,  12.,  12.],
          [  8.,   8.,  16.,  16.,  24.,  24.,  24.],
          [ 12.,  12.,  24.,  24.,  36.,  36.,  36.]],

         [[  0.,   0.,   0.,   0.,   0.,   0.,   0.],
          [ -4.,  -4.,  -8.,  -8., -12., -12., -12.],
          [ -8.,  -8., -16., -16., -24., -24., -24.]]]])

In [18]:
head_weights = torch.stack([head_1_q_k_v_weights, head_2_q_k_v_weights])
matrix_result = torch.matmul(batch, torch.unsqueeze(head_weights, 1))
matrix_result

tensor([[[[ -4.,  -4.,  -8.,  -8., -12., -12., -12.],
          [ -8.,  -8., -16., -16., -24., -24., -24.],
          [-12., -12., -24., -24., -36., -36., -36.]],

         [[  0.,   0.,   0.,   0.,   0.,   0.,   0.],
          [  4.,   4.,   8.,   8.,  12.,  12.,  12.],
          [  8.,   8.,  16.,  16.,  24.,  24.,  24.]]],


        [[[  4.,   4.,   8.,   8.,  12.,  12.,  12.],
          [  8.,   8.,  16.,  16.,  24.,  24.,  24.],
          [ 12.,  12.,  24.,  24.,  36.,  36.,  36.]],

         [[  0.,   0.,   0.,   0.,   0.,   0.,   0.],
          [ -4.,  -4.,  -8.,  -8., -12., -12., -12.],
          [ -8.,  -8., -16., -16., -24., -24., -24.]]]])

#### Figuring out how to add a bias to heads

In [19]:
torch.unsqueeze(torch.stack([torch.ones((1, 3)), torch.zeros((1, 3))]), dim = 1)

tensor([[[[1., 1., 1.]]],


        [[[0., 0., 0.]]]])

In [20]:
torch.stack([torch.ones((1, 3)), torch.zeros((1, 3))]).unsqueeze(dim=1).shape

torch.Size([2, 1, 1, 3])

In [21]:
torch.ones((2, 2, 2, 3)) + torch.unsqueeze(torch.stack([torch.ones((1, 3)), torch.zeros((1, 3))]), dim=1)

tensor([[[[2., 2., 2.],
          [2., 2., 2.]],

         [[2., 2., 2.],
          [2., 2., 2.]]],


        [[[1., 1., 1.],
          [1., 1., 1.]],

         [[1., 1., 1.],
          [1., 1., 1.]]]])

In [22]:
print("batch shape: ", batch.shape)
print("weight shape: ", torch.unsqueeze(head_weights, 1).shape)
print("result shape: ", torch.matmul(batch, torch.unsqueeze(head_weights, 1)).shape)

batch shape:  torch.Size([2, 3, 4])
weight shape:  torch.Size([2, 1, 4, 7])
result shape:  torch.Size([2, 2, 3, 7])


In [23]:
torch.tensor((n_heads, 1, 4, d_q_k * 2 + d_v), dtype=torch.float32)

tensor([2., 1., 4., 7.])

In [24]:
head_weights.shape

torch.Size([2, 4, 7])

In [25]:
torch.unsqueeze(torch.stack([torch.ones((1, d_q_k * 2 + d_v)), torch.zeros((1, d_q_k * 2 + d_v))]), dim=1)

tensor([[[[1., 1., 1., 1., 1., 1., 1.]]],


        [[[0., 0., 0., 0., 0., 0., 0.]]]])

#### Figuring out how to split the query, key, value result

In [26]:
q, k, v = torch.split(matrix_result, [d_q_k, d_q_k, d_v], dim=-1)

In [27]:
print("q shape: ", q.shape)
print("k shape: ", k.shape)
print("v shape: ", v.shape)

q shape:  torch.Size([2, 2, 3, 2])
k shape:  torch.Size([2, 2, 3, 2])
v shape:  torch.Size([2, 2, 3, 3])


#### How to use transpose to swap dimensions

In [28]:
torch.transpose(k, 2, 3).shape

torch.Size([2, 2, 2, 3])

#### Figuring out the softmax dimension

In [29]:
s = nn.Softmax(dim=-1)
t = torch.stack([torch.tensor([1, 1], dtype=torch.float), torch.tensor([3, 5], dtype=torch.float)])

In [30]:
s(t)

tensor([[0.5000, 0.5000],
        [0.1192, 0.8808]])

#### Actual modules

In [31]:
# comments to add back in for testing purposes if needed

# self.weight = nn.Parameter(head_weights.unsqueeze(dim=1))
# self.bias = nn.Parameter(torch.unsqueeze(torch.stack([torch.ones((1, d_q_k * 2 + d_v)), torch.zeros((1, d_q_k * 2 + d_v))]), dim=1)) if bias else None
#         self.bias = nn.Parameter(torch.rand((n_heads, 1, 1, d_q_k * 2 + d_v), dtype=torch.float32)) if bias else None

        # if self.bias is not None:
        #     fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
        #     bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        #     nn.init.uniform_(self.bias, -bound, bound)

        # if self.bias != None:
        #     output += self.bias

In [32]:
class MultiHeadLinearLayer(nn.Module):
    """Linear projection applied once per attention head in a single batched matmul.

    The weight has shape (n_heads, 1, d_model, d_out); the singleton second
    dimension broadcasts against the leading dimension of the input, so an input
    of shape (batch, seq, d_model) yields an output of shape
    (n_heads, batch, seq, d_out).

    Parameters:
        n_heads: number of independent projections (heads).
        d_model: size of the last dimension of the input.
        d_out: size of the last dimension of each head's output.
        bias: when True, a per-head bias of shape (n_heads, 1, 1, d_out) is added.
    """

    def __init__(self, n_heads, d_model, d_out, bias=False):
        super(MultiHeadLinearLayer, self).__init__()
        # Stored explicitly rather than by slicing locals(), whose ordering would
        # silently break as soon as a parameter or local is added.
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_out = d_out

        # Values are filled in by reset_parameters().
        self.weight = nn.Parameter(torch.empty((n_heads, 1, d_model, d_out), dtype=torch.float32))
        self.bias = nn.Parameter(torch.empty((n_heads, 1, 1, d_out), dtype=torch.float32)) if bias else None

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise each head like an nn.Linear(d_model, d_out): U(-1/sqrt(d_model), 1/sqrt(d_model)).

        The fan-in is taken from d_model directly. Asking nn.init for the fan-in of
        the 4-D weight would treat (d_model, d_out) as a convolution receptive field
        and report d_model * d_out, shrinking the initial weights by sqrt(d_out).
        """
        bound = 1 / math.sqrt(self.d_model) if self.d_model > 0 else 0
        nn.init.uniform_(self.weight, -bound, bound)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        """Project `input` (..., d_model) with every head; returns (n_heads, ..., d_out)."""
        output = torch.matmul(input, self.weight)
        if self.bias is not None:
            output = output + self.bias

        return output

In [33]:
#Create the Multi head attention layer

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries are always projected from `x`. Keys and values are projected from
    `x_encoder` when it is given (cross-attention) and from `x` otherwise
    (self-attention). The per-head results are concatenated along the feature
    dimension and mapped back to `d_model` by a final linear layer.

    Parameters:
        n_heads: number of attention heads.
        d_model: feature size of the input and of the output.
        d_q_k: per-head size of the query and key projections.
        d_v: per-head size of the value projection.

    Returns (from forward):
        Tensor with the same leading dimensions as `x` and `d_model` features.

    Example:
        attention = MultiHeadAttention(2, 4, 2, 3)
        attention(torch.rand(2, 3, 4)).shape  # torch.Size([2, 3, 4])
    """

    def __init__(self, n_heads: int, d_model: int, d_q_k: int, d_v: int):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_q_k = d_q_k
        self.d_v = d_v

        self.softmax = nn.Softmax(dim=-1)

        self.query_weights = MultiHeadLinearLayer(n_heads, d_model, d_q_k)
        self.key_weights = MultiHeadLinearLayer(n_heads, d_model, d_q_k)
        self.value_weights = MultiHeadLinearLayer(n_heads, d_model, d_v)

        self.linear_out = nn.Linear(in_features=n_heads * d_v, out_features=d_model, bias=True)

    def _scaled_dot_product_attention(self, q, k, v):
        """Return softmax(q @ k^T / sqrt(d_k)) @ v, with d_k the last dimension of q."""
        # A plain Python scalar avoids building a CPU tensor on every call.
        scores = torch.matmul(q, torch.transpose(k, -2, -1)) / math.sqrt(q.shape[-1])
        return torch.matmul(self.softmax(scores), v)

    def forward(self, x, x_encoder=None):
        q = self.query_weights(x)

        # Compare against None explicitly: truth-testing a tensor with more than
        # one element raises, which made the cross-attention path unusable.
        source = x if x_encoder is None else x_encoder
        k = self.key_weights(source)
        v = self.value_weights(source)

        values = self._scaled_dot_product_attention(q, k, v)
        # values carries the head dimension first; place the heads next to each
        # other along the feature dimension before the output projection.
        concatenated = torch.cat(torch.unbind(values, dim=0), dim=-1)

        return self.linear_out(concatenated)
    

In [34]:
class AddAndNorm(nn.Module):
    """Residual addition followed by layer normalisation over the feature dimension.

    The normalisation has no learnable scale or shift; each position's feature
    vector (last dimension) is normalised on its own.
    """

    def __init__(self):
        super(AddAndNorm, self).__init__()
        self.norm = nn.functional.layer_norm

    def forward(self, prev, curr):
        """Return layer_norm(prev + curr), normalised over the last dimension only."""
        summed = prev + curr
        # Normalising over the full shape would mix statistics across the batch
        # and sequence dimensions, making one sentence's output depend on the
        # other sentences in the batch.
        return self.norm(summed, summed.shape[-1:])

In [35]:
class FeedForwardLayer(nn.Module):
    """Two linear layers with a ReLU in between, applied to the last dimension of the input.

    Parameters:
        in_features: size of the last dimension of the input.
        hidden_features: size of the intermediate representation.
        out_features: size of the last dimension of the output.
    """

    def __init__(self, in_features: int, hidden_features: int, out_features: int):
        super().__init__()
        self.input_layer = nn.Linear(in_features, hidden_features)
        self.relu = nn.ReLU()
        self.output_layer = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Return output_layer(relu(input_layer(x)))."""
        hidden = self.input_layer(x)
        activated = self.relu(hidden)
        return self.output_layer(activated)
    

In [36]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network, each wrapped
    in a residual connection followed by normalisation.

    Parameters:
        n_heads: number of attention heads.
        d_model: feature size of the input and of the output.
        d_q_k: per-head size of the query and key projections.
        d_v: per-head size of the value projection.
        d_ff: hidden size of the feed-forward network.
    """

    def __init__(self, n_heads: int, d_model: int, d_q_k: int, d_v: int, d_ff: int):
        super(EncoderLayer, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_q_k = d_q_k
        self.d_v = d_v
        self.d_ff = d_ff

        self.multi_head_attention = MultiHeadAttention(n_heads, d_model, d_q_k, d_v)
        self.add_and_norm = AddAndNorm()
        self.feed_forward = FeedForwardLayer(in_features=d_model, hidden_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Apply the attention sublayer, then the feed-forward sublayer, to `x`."""
        attended = self.add_and_norm(x, self.multi_head_attention(x))
        # The feed-forward sublayer consumes the output of the attention sublayer;
        # feeding it the raw input would bypass attention inside that branch.
        return self.add_and_norm(attended, self.feed_forward(attended))

In [37]:
class Encoder(nn.Module):
    """A stack of `n_layers` EncoderLayer blocks applied one after another.

    Parameters:
        n_layers: number of encoder layers in the stack.
        n_heads: number of attention heads in every layer.
        d_model: feature size of the input and of the output.
        d_q_k: per-head size of the query and key projections.
        d_v: per-head size of the value projection.
        d_ff: hidden size of each layer's feed-forward network.
    """

    def __init__(self, n_layers: int, n_heads: int, d_model: int, d_q_k: int, d_v: int, d_ff: int):
        super(Encoder, self).__init__()
        # Explicit assignments instead of slicing locals(): the slice depended on
        # the exact order and number of local names in this function.
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_q_k = d_q_k
        self.d_v = d_v
        self.d_ff = d_ff

        self.layers = nn.ModuleList([EncoderLayer(n_heads, d_model, d_q_k, d_v, d_ff) for _ in range(n_layers)])

    def forward(self, x):
        """Feed `x` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer(x)
        return x

In [38]:
device = get_device()
m_h_attention = MultiHeadAttention(2, 4, 2, 3)
m_h_attention.to(device)
o = m_h_attention(batch.to(device))
o.shape

torch.Size([2, 3, 4])

In [39]:
batch.shape

torch.Size([2, 3, 4])

In [40]:
o.shape

torch.Size([2, 3, 4])

In [41]:
test = torch.rand((100, 50, 512))
encoder_layer = EncoderLayer(8, 512, 64, 64, 2048)
encoder_layer.to(device)
encoder_layer(test.to(device)).shape

torch.Size([100, 50, 512])

In [42]:
# Create Encoder Layer
encoder = Encoder(6, 8, 512, 64, 64, 2048)
encoder.to(device)
encoder(test.to(device)).shape

torch.Size([100, 50, 512])

In [43]:
total_parameters = sum(p.numel() for p in encoder.parameters() if p.requires_grad)
total_parameters

18892800

In [44]:
# Create the Masked Multi-head attention layer

class MaskedMultiHeadAttention(nn.Module):
    """Self-attention in which a position may only attend to itself and to earlier positions.

    The projections, softmax and output layer of the wrapped MultiHeadAttention
    are reused; only the attention scores are changed, by adding a mask that
    blocks later positions before the softmax.

    NOTE(review): the original `add_mask` was an empty stub, so the causal
    ("no looking ahead") mask implemented here is an interpretation of the
    intent -- confirm it matches the planned decoder design.

    Parameters:
        n_heads: number of attention heads.
        d_model: feature size of the input and of the output.
        d_q_k: per-head size of the query and key projections.
        d_v: per-head size of the value projection.
    """

    def __init__(self, n_heads: int, d_model: int, d_q_k: int, d_v: int):
        super(MaskedMultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_q_k = d_q_k
        self.d_v = d_v

        self.multi_head_attention = MultiHeadAttention(n_heads, d_model, d_q_k, d_v)

    def add_mask(self, x):
        """Build the additive mask for `x`, whose second-to-last dimension is the sequence.

        Returns a (seq, seq) tensor holding 0 where attention is allowed (the
        diagonal and below) and -inf where it is blocked (above the diagonal).
        """
        seq_len = x.shape[-2]
        blocked = torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=x.device)
        return torch.triu(blocked, diagonal=1)

    def forward(self, x):
        """Return masked self-attention over `x`, with the same shape as `x`."""
        attention = self.multi_head_attention

        q = attention.query_weights(x)
        k = attention.key_weights(x)
        v = attention.value_weights(x)

        scores = torch.matmul(q, torch.transpose(k, -2, -1)) / math.sqrt(q.shape[-1])
        # Adding -inf before the softmax gives blocked positions a weight of zero;
        # the diagonal is never blocked, so every row keeps at least one entry.
        weights = attention.softmax(scores + self.add_mask(x))
        values = torch.matmul(weights, v)

        # Heads sit on the leading dimension; join them along the feature dimension.
        concatenated = torch.cat(torch.unbind(values, dim=0), dim=-1)
        return attention.linear_out(concatenated)
    

In [45]:
# Create the Decoder Layer (Take the keys and values from the last encoder layer)

In [46]:
# Create the Linear Layer

In [47]:
# Create the Softmax Layer 

In [48]:
# put it all together 

# based on the dimension of the input, change the batch, sequence, and feature dimensions as needed. Set to none if non-existent