In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [82]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with configurable per-head sizes.

    Args:
        d_model: Size of the last dimension of the inputs and of the output.
        num_heads: Number of attention heads.
        d_q_k: Per-head size of the query/key projections. Defaults to
            ``d_model // num_heads`` when omitted.
        d_v: Per-head size of the value projection. Defaults to
            ``d_model // num_heads`` when omitted.
    """

    def __init__(self, d_model, num_heads, d_q_k=None, d_v=None):
        super(MultiHeadAttention, self).__init__()

        # The projections map d_model -> num_heads * head_size, so d_model only
        # has to be divisible by num_heads when a head size is derived from it.
        if d_q_k is None or d_v is None:
            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
            default_head_size = d_model // num_heads
            d_q_k = default_head_size if d_q_k is None else d_q_k
            d_v = default_head_size if d_v is None else d_v

        # Explicit assignments instead of slicing locals(), whose ordering is an
        # implementation detail.
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_q_k = d_q_k
        self.d_v = d_v

        self.W_q = nn.Linear(d_model, d_q_k * num_heads)
        self.W_k = nn.Linear(d_model, d_q_k * num_heads)
        self.W_v = nn.Linear(d_model, d_v * num_heads)
        self.W_o = nn.Linear(d_v * num_heads, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(d_q_k)) V.

        ``mask`` (optional) must broadcast against the score tensor; positions
        where it equals 0 are excluded from the softmax.
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_q_k)
        if mask is not None:
            # A large negative score becomes (numerically) zero probability
            # after the softmax.
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        return torch.matmul(attn_probs, V)

    def split_heads(self, x, d):
        """Reshape (batch, seq, num_heads * d) into (batch, num_heads, seq, d)."""
        batch_size, seq_length, _ = x.size()
        # view splits the last dimension into (num_heads, d); transpose moves
        # the head axis in front of the sequence axis.
        return x.view(batch_size, seq_length, self.num_heads, d).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape (batch, num_heads, seq, d_v) back into (batch, seq, num_heads * d_v)."""
        batch_size, _, seq_length, _ = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_v * self.num_heads)

    def forward(self, q, k, v, mask=None):
        """Project q, k and v, attend per head, then merge heads and project to d_model."""
        Q = self.split_heads(self.W_q(q), self.d_q_k)
        K = self.split_heads(self.W_k(k), self.d_q_k)
        V = self.split_heads(self.W_v(v), self.d_v)

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        return self.W_o(self.combine_heads(attn_output))

In [83]:
# 8 heads over a 6-wide model, with per-head query/key size 2 and per-head value size 4.
mha = MultiHeadAttention(d_model=6, num_heads=8, d_q_k=2, d_v=4)

In [84]:
# Toy input of shape (batch=2, seq=2, d_model=6) holding 1..24 in row-major order.
x = torch.arange(1, 25, dtype=torch.float).view(2, 2, 6)

In [86]:
# Self-attention: the same tensor is passed as query, key and value (no mask).
mha(x, x, x)

tensor([[[-2.9677, -1.4866, -2.8680, -4.3427, -0.9212,  1.6678],
         [-3.0570, -1.8421, -2.7074, -4.2729, -1.0963,  1.8373]],

        [[-6.9505, -0.9727, -7.2178, -9.3986, -2.5982,  4.5516],
         [-6.8928, -0.9621, -7.3239, -9.3857, -2.6173,  4.5706]]],
       grad_fn=<ViewBackward0>)

### Explain the split heads method


In [19]:
batch_size, seq_length, d_model = x.shape

# Split the embedding dimension into heads: an embedding of size 6 becomes
# 3 heads of size 2, giving shape (batch, seq, heads, head_dim).
x.view(batch_size, seq_length, 3, 2)

tensor([[[[ 1,  2],
          [ 3,  4],
          [ 5,  6]],

         [[ 7,  8],
          [ 9, 10],
          [11, 12]]],


        [[[13, 14],
          [15, 16],
          [17, 18]],

         [[19, 20],
          [21, 22],
          [23, 24]]]])

In [26]:
# Swap the sequence and head axes so that each head holds the embeddings of
# every token, instead of each token holding the embeddings of every head:
# (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim).
split_heads = x.view(batch_size, seq_length, 3, 2).permute(0, 2, 1, 3)
split_heads

tensor([[[[ 1,  2],
          [ 3,  4],
          [ 5,  6]],

         [[ 7,  8],
          [ 9, 10],
          [11, 12]]],


        [[[13, 14],
          [15, 16],
          [17, 18]],

         [[19, 20],
          [21, 22],
          [23, 24]]]])

### Explain the combine heads function

In [27]:
# (batch, head, seq, emb) -> (batch, seq, head, emb): swapping the two axes
# again undoes the transpose done when splitting the heads.
split_heads.transpose(1, 2)

tensor([[[[ 1,  2],
          [ 3,  4],
          [ 5,  6]],

         [[ 7,  8],
          [ 9, 10],
          [11, 12]]],


        [[[13, 14],
          [15, 16],
          [17, 18]],

         [[19, 20],
          [21, 22],
          [23, 24]]]])

In [30]:
# transpose() returns a view whose memory layout no longer matches its shape,
# and view() needs a compatible layout, so contiguous() first copies the data
# into the new order.
# Reshape back to the shape from before the split; this merges the heads.
split_heads.transpose(1, 2).contiguous().view(batch_size, seq_length, d_model)

tensor([[[ 1,  2,  3,  4,  5,  6],
         [ 7,  8,  9, 10, 11, 12]],

        [[13, 14, 15, 16, 17, 18],
         [19, 20, 21, 22, 23, 24]]])

### Position-Wise Feed Forward

In [56]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: Linear -> ReLU -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expand to the hidden width, then project back to the model width.
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply fc1, ReLU and fc2 in sequence and return the result."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

### Positional Encoding

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a (batch, seq, d_model) input.

    Args:
        d_model: Size of the encoding vector (last dimension of the input).
            Both even and odd values are supported.
        max_seq_length: Number of positions to precompute encodings for.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        # Placeholder for the encodings, one row per position.
        pe = torch.zeros(max_seq_length, d_model)

        # Column vector of positions 0..max_seq_length-1. The position scales
        # the argument of the sine/cosine, so it affects where on each wave a
        # token lands.
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # One frequency per pair of columns, shape (ceil(d_model / 2),). The
        # same frequency is shared by a sine column and the cosine column
        # interleaved next to it.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        # Angle for every (position, frequency) pair, computed once.
        angles = position * div_term

        # Sines go in the even-indexed columns, cosines in the odd-indexed ones.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the last frequency has no cosine partner and is dropped here.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer is not a parameter: the optimizer does not update it, but it
        # is saved in the state_dict and moves with the module across devices.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Raises:
            ValueError: If the sequence is longer than ``max_seq_length``.
        """
        seq_length = x.size(1)
        if seq_length > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_length} exceeds max_seq_length {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_length]

### Understand the positional encoding

In [74]:
# exp of the even indices 0, 2, ... below d_model, without the log(10000)/d_model scaling.
# d_model is whatever an earlier cell left in the kernel.
a = torch.exp(torch.arange(0, d_model, 2).float())
a

tensor([ 1.0000,  7.3891, 54.5981])

In [77]:
# The constant that multiplies the even indices inside PositionalEncoding's div_term exponent.
b = -(math.log(10000.0) / d_model)
b

-1.5350567286626973

In [78]:
# NOTE: this is exp(i) * b, which is not div_term; div_term applies the
# scaling inside the exponent, i.e. exp(i * b).
a * b

tensor([ -1.5351, -11.3426, -83.8113])

In [80]:
# Same unscaled exponential as before, for an encoding size of 10 (indices 0, 2, 4, 6, 8).
torch.exp(torch.arange(0, 10, 2).float())

tensor([1.0000e+00, 7.3891e+00, 5.4598e+01, 4.0343e+02, 2.9810e+03])

In [81]:
# The exponent scaling constant for an encoding size of 10.
-(math.log(10000.0) / 10)

-0.9210340371976183

In [79]:
# NOTE: the scaling is multiplied outside the exp here, so this is not
# div_term, which computes exp(index * scale).
torch.exp(torch.arange(0, 10, 2).float()) * -(math.log(10000.0) / 10)

tensor([-9.2103e-01, -6.8056e+00, -5.0287e+01, -3.7157e+02, -2.7456e+03])

In [64]:
# position (column vector) * div_term (row vector) broadcasts to one angle per
# (position, frequency) pair; these are the arguments passed to sin and cos.
max_seq_length = 5
torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1) * torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0000e+00, 4.6416e-02, 2.1544e-03],
        [2.0000e+00, 9.2832e-02, 4.3089e-03],
        [3.0000e+00, 1.3925e-01, 6.4633e-03],
        [4.0000e+00, 1.8566e-01, 8.6177e-03]])

In [63]:
# div_term exactly as computed in PositionalEncoding: exp(even_index * -(log(10000) / d_model)).
torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

tensor([1.0000, 0.0464, 0.0022])

### Encoder

Dropout: regularization technique where randomly selected neurons are ignored during training. This helps to reduce overfitting by making the neural network less sensitive to the specific weights of neurons. This forces the network to learn more distributed representations.

Residual connections: Help to avoid the vanishing gradient problem. 

Layer Normalization: normalizes the inputs across the features instead of the batch dimension. Ensures that the mean and variance are stable across the features which helps to stabilize the learning process. 

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalization.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward network.
        dropout: Dropout probability applied to each sublayer output.
        d_q_k: Per-head query/key size. Defaults to ``d_model // num_heads``.
        d_v: Per-head value size. Defaults to ``d_model // num_heads``.

    Raises:
        ValueError: If a head size has to be derived and ``d_model`` is not
            divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout, d_q_k=None, d_v=None):
        super(EncoderLayer, self).__init__()

        # MultiHeadAttention requires explicit per-head sizes; derive the usual
        # d_model / num_heads split when the caller does not provide them.
        if d_q_k is None or d_v is None:
            if d_model % num_heads != 0:
                raise ValueError("d_model must be divisible by num_heads")
            head_size = d_model // num_heads
            d_q_k = head_size if d_q_k is None else d_q_k
            d_v = head_size if d_v is None else d_v

        self.self_attn = MultiHeadAttention(d_model, num_heads, d_q_k, d_v)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run self-attention then the feed-forward network over ``x``.

        ``mask`` is forwarded unchanged to the attention module.
        """
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

### Decoder

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention over the encoder
    output, and a feed-forward network. Each sublayer output goes through
    dropout, is added to its input, and is then layer-normalized.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads.
        d_q_k: Per-head query/key size for both attention modules.
        d_v: Per-head value size for both attention modules.
        d_ff: Hidden size of the feed-forward network.
        dropout: Dropout probability applied to each sublayer output.
    """

    def __init__(self, d_model, num_heads, d_q_k, d_v, d_ff, dropout):
        super().__init__()
        attn_args = (d_model, num_heads, d_q_k, d_v)
        self.self_attn = MultiHeadAttention(*attn_args)
        self.cross_attn = MultiHeadAttention(*attn_args)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1, self.norm2, self.norm3 = (nn.LayerNorm(d_model) for _ in range(3))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Apply the three sublayers in order and return the result.

        ``tgt_mask`` is passed to the self-attention and ``src_mask`` to the
        cross-attention, whose keys and values are ``enc_output``.
        """
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        cross_attended = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        fed_forward = self.feed_forward(x)
        return self.norm3(x + self.dropout(fed_forward))

### Transformer

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source/target token ids to target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and size
            of the output logits.
        d_model: Embedding / model feature size.
        num_heads: Number of attention heads per attention module.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden size of the feed-forward networks.
        max_seq_length: Number of positions the positional encoding covers.
        dropout: Dropout probability.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError("d_model must be divisible by num_heads")
        # DecoderLayer takes explicit per-head sizes; use the d_model / num_heads split.
        head_size = d_model // num_heads

        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, head_size, head_size, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build boolean attention masks from (batch, seq) token-id tensors.

        Token id 0 is treated as padding. ``True`` marks a position that may be
        attended to.

        Returns:
            src_mask: shape (batch, 1, 1, src_len), hiding padded source keys.
            tgt_mask: shape (batch, 1, tgt_len, tgt_len), hiding padded target
                keys and every key that comes after the query position.
        """
        # Two singleton axes (heads, queries) let the masks broadcast against
        # attention scores of shape (batch, heads, queries, keys).
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)

        seq_length = tgt.size(1)
        # Lower-triangular (diagonal included) mask, created on the same device
        # as tgt so the `&` below does not mix devices.
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)).bool()

        # The final target mask combines the padding mask and the no-peek mask.
        tgt_mask = tgt_pad_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Encode ``src``, decode ``tgt`` against it, and return logits of
        shape (batch, tgt_len, tgt_vocab_size)."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc(dec_output)

In [139]:
# Sizes for the masking walkthrough below.
embedding_size, batch_size, seq_length = 4, 2, 3

In [143]:
# Random token ids in [1, 10), so no token starts out as the padding id 0.
# No seed is set in the visible cells, so the values change from run to run.
tokenized_batch = torch.randint(1, 10, (batch_size, seq_length))
tokenized_batch

tensor([[3, 3, 6],
        [4, 4, 2]])

In [145]:
# Overwrites the last token of every sequence in place; tokenized_batch itself is modified.
tokenized_batch[:, -1] = 0 #symbolic of padding
tokenized_batch

tensor([[3, 3, 0],
        [4, 4, 0]])

In [173]:
# True for real tokens, False for padding (id 0); the trailing axis of size 1
# lets the mask broadcast across the embedding dimension.
src_mask = tokenized_batch.ne(0).unsqueeze(-1)
src_mask

tensor([[[ True],
         [ True],
         [False]],

        [[ True],
         [ True],
         [False]]])

In [174]:
# Stand-in for embeddings: random integers of shape (batch_size, seq_length, embedding_size).
embedded_batch = torch.randint(1, 10, (batch_size, seq_length, embedding_size))
embedded_batch

tensor([[[7, 4, 6, 3],
         [4, 2, 6, 3],
         [4, 2, 2, 3]],

        [[4, 3, 1, 3],
         [6, 4, 9, 2],
         [3, 7, 3, 2]]])

In [175]:
# Replace the whole embedding row of every padded position with -1;
# masked_fill returns a new tensor and leaves embedded_batch untouched.
masked_batch = embedded_batch.masked_fill(~src_mask, -1)
masked_batch

tensor([[[ 7,  4,  6,  3],
         [ 4,  2,  6,  3],
         [-1, -1, -1, -1]],

        [[ 4,  3,  1,  3],
         [ 6,  4,  9,  2],
         [-1, -1, -1, -1]]])

In [168]:
# Display the token ids again (last column was set to the padding id 0).
tokenized_batch

tensor([[3, 3, 0],
        [4, 4, 0]])

In [169]:
# NOTE(review): the execution counts suggest this cell ran before the cell that
# currently defines embedded_batch, so the saved output may come from an
# earlier random draw - re-run in order to confirm.
embedded_batch

tensor([[[2, 1, 1, 4],
         [7, 3, 3, 2],
         [2, 2, 1, 4]],

        [[6, 9, 4, 9],
         [3, 4, 8, 6],
         [5, 4, 4, 9]]])

In [192]:
# Lower-triangular mask (diagonal included): row i is True for columns 0..i,
# so a position can see itself and earlier positions only.
nopeak_mask = torch.ones(1, seq_length, seq_length).tril().bool()
nopeak_mask

tensor([[[ True, False, False],
         [ True,  True, False],
         [ True,  True,  True]]])

In [None]:
class Batch:
    """Object for holding a batch of data with mask during training.

    Attributes set in ``__init__``:
        src: The source token ids as given.
        src_mask: ``src != pad`` with an extra axis inserted before the last one.
        trg: Target ids without the last token (only when ``trg`` is given).
        trg_y: Target ids without the first token (only when ``trg`` is given).
        trg_mask: Mask hiding padding and future positions in ``trg``.
        ntokens: Number of non-padding tokens in ``trg_y`` (a tensor).
    """

    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            # trg and trg_y are the same sequence offset by one token.
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).sum()

    @staticmethod
    def _subsequent_mask(size, device=None):
        "Boolean (1, size, size) mask that is True on and below the diagonal."
        upper = torch.triu(torch.ones(1, size, size, device=device), diagonal=1)
        return upper == 0

    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        # Built on tgt's device; plain tensors are used because Variable is
        # neither imported in this file nor needed by current PyTorch.
        causal_mask = Batch._subsequent_mask(tgt.size(-1), device=tgt.device)
        return tgt_mask & causal_mask