In [3]:
import tensorflow as tf
import random
import numpy as np
from keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout,TextVectorization, Embedding
# from keras.backend import softmax 
from tensorflow.keras.activations import softmax
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32

In [22]:
tf.__version__

'2.16.2'

In [21]:
tf.keras.__version__

'3.6.0'

### Positional embedding

In [4]:
class PositionEmbeddingFixedWeights(Layer):
    def __init__(self, seq_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)
        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
        pos_embedding_matrix = self.get_position_encoding(seq_length, output_dim)
        self.word_embedding_layer = Embedding(input_dim=vocab_size, output_dim=output_dim,weights=[word_embedding_matrix],trainable=False)
        self.position_embedding_layer = Embedding(input_dim=seq_length, output_dim=output_dim,weights=[pos_embedding_matrix],trainable=False)
    
    def get_position_encoding(self, seq_len, d, n=10000):
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        return P
    
    
    def call(self, inputs):
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices

### Single headed Self Attention

In [5]:
class DotProductAttention(Layer):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)
    
    def call(self,queries,keys,values,d_k,mask = None):
        
        # Scoring the queries against the keys after transposing the latter, and scaling
        scores = matmul(queries, keys, transpose_b=True) / math.sqrt(cast(d_k, float32))
        
        # Apply mask to the attention scores
        if mask is not None:
            scores += -1e9 * mask
        
        # Computing the weights by a softmax operation
        weights = softmax(scores)
        
        # Computing the attention by a weighted sum of the value vectors
        return matmul(weights, values)
        
    

### Multi - headed self attention

In [6]:
class MultiHeadAttention(Layer):
    def __init__(self, h, d_k, d_v, d_model, **kwargs):
        super().__init__(**kwargs)
        self.attention = DotProductAttention() # Scaled dot product attention
        self.heads = h # Number of attention heads to use
        self.d_k = d_k # Dimensionality of the linearly projected queries and keys
        self.d_v = d_v # Dimensionality of the linearly projected values
        self.d_model = d_model # Dimensionality of the model
        self.W_q = Dense(d_k) # Learned projection matrix for the queries
        self.W_k = Dense(d_k) # Learned projection matrix for the keys
        self.W_v = Dense(d_v) # Learned projection matrix for the values
        self.W_o = Dense(d_model) # Learned projection matrix for the multi-head output
    def reshape_tensor(self, x, heads, flag):
        if flag:
            # Tensor shape after reshaping and transposing:
            # (batch_size, heads, seq_length, -1)
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
            x = transpose(x, perm=(0, 2, 1, 3))
        else:
            # Reverting the reshaping and transposing operations:
            # (batch_size, seq_length, d_k)
            x = transpose(x, perm=(0, 2, 1, 3))
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_k))
        return x
    
    def call(self, queries, keys, values, mask=None):
        # Rearrange the queries to be able to compute all heads in parallel
        q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange the keys to be able to compute all heads in parallel
        k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange the values to be able to compute all heads in parallel
        v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Compute the multi-head attention output using the reshaped queries,
        # keys, and values
        o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange back the output into concatenated form
        output = self.reshape_tensor(o_reshaped, self.heads, False)
        # Resulting tensor shape: (batch_size, input_seq_length, d_v)
        # Apply one final linear projection to the output to generate the multi-head
        # attention. Resulting tensor shape: (batch_size, input_seq_length, d_model)
        return self.W_o(output)

### Normalization layer

In [7]:
# Implementing the Add & Norm Layer
class AddNormalization(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.layer_norm = LayerNormalization() # Layer normalization layer
    def call(self, x, sublayer_x):
        # The sublayer input and output need to be of the same shape to be summed
        add = x + sublayer_x
        # Apply layer normalization to the sum
        return self.layer_norm(add)


### Feed forward layer

In [8]:
# Implementing the Feed-Forward Layer
class FeedForward(Layer):
    def __init__(self, d_ff, d_model, **kwargs):
        super().__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff) # First fully connected layer
        self.fully_connected2 = Dense(d_model) # Second fully connected layer
        self.activation = ReLU() # ReLU activation layer
        
    def call(self, x):
        # The input is passed into the two fully-connected layers, with a ReLU in between
        x_fc1 = self.fully_connected1(x)
        return self.fully_connected2(self.activation(x_fc1))

### A single decoder layer

In [9]:
class DecoderLayer(Layer):
    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super().__init__(**kwargs)
        self.multihead_attention1 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.multihead_attention2 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout3 = Dropout(rate)
        self.add_norm3 = AddNormalization()
    
    
    def call(self, x, encoder_output, lookahead_mask, padding_mask, training):
        # Multi-head attention layer
        multihead_output1 = self.multihead_attention1(x, x, x, lookahead_mask)
         
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Add in a dropout layer
        multihead_output1 = self.dropout1(multihead_output1, training=training)
        # Followed by an Add & Norm layer
        addnorm_output1 = self.add_norm1(x, multihead_output1)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Followed by another multi-head attention layer
        multihead_output2 = self.multihead_attention2(addnorm_output1, encoder_output,encoder_output, padding_mask)
         
        # Add in another dropout layer
        multihead_output2 = self.dropout2(multihead_output2, training=training)
        
        # Followed by another Add & Norm layer
        addnorm_output2 = self.add_norm1(addnorm_output1, multihead_output2)
        
        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output2)
        # Expected output shape = (batch_size, sequence_length, d_model)
        
        # Add in another dropout layer
        feedforward_output = self.dropout3(feedforward_output, training=training)
        
        # Followed by another Add & Norm layer
        return self.add_norm3(addnorm_output2, feedforward_output)

### Decoder construct

In [38]:
class Decoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate,**kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size,d_model)
        self.dropout = Dropout(rate)
        self.decoder_layer = [DecoderLayer(h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]
        
    
    def call(self, output_target, encoder_output, lookahead_mask=None, padding_mask=None, training=False):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(output_target)
        # Expected output shape = (number of sentences, sequence_length, d_model)
        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)
        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.decoder_layer):
            x = layer(x, encoder_output, lookahead_mask=lookahead_mask, padding_mask=padding_mask, training=training)
        return x

In [24]:
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_ff = 2048 # Dimensionality of the inner fully connected layer
d_model = 512 # Dimensionality of the model sub-layers' outputs
n = 6 # Number of layers in the encoder stack
batch_size = 64 # Batch size from the training process
dropout_rate = 0.1 # Frequency of dropping the input units in the dropout layers

In [25]:
dec_vocab_size = 20 # Vocabulary size for the decoder
input_seq_length = 5 # Maximum length of the input sequence
input_seq = np.random.rand(batch_size, input_seq_length)
enc_output = np.random.rand(batch_size, input_seq_length, d_model)

In [26]:
input_seq

array([[0.85404104, 0.6590931 , 0.00801142, 0.15239133, 0.90819242],
       [0.57147685, 0.87189948, 0.46574859, 0.95172972, 0.16893189],
       [0.8370025 , 0.87666555, 0.83163383, 0.51620282, 0.21303305],
       [0.32213007, 0.48830044, 0.02404684, 0.44876341, 0.96598851],
       [0.14453682, 0.31114745, 0.13151907, 0.54885457, 0.62306043],
       [0.06882451, 0.95322895, 0.28961088, 0.36076364, 0.60142198],
       [0.03123791, 0.23143449, 0.60407628, 0.37560397, 0.5530125 ],
       [0.08261558, 0.95814627, 0.02462276, 0.1219724 , 0.17036007],
       [0.65737732, 0.8539066 , 0.32106262, 0.14379717, 0.75005263],
       [0.86897311, 0.02070043, 0.10383817, 0.00396981, 0.95984319],
       [0.17923014, 0.909579  , 0.38651565, 0.05013358, 0.8992825 ],
       [0.51917468, 0.70918715, 0.10842953, 0.36571321, 0.63359847],
       [0.63852509, 0.69150957, 0.75571145, 0.55296803, 0.23657143],
       [0.01434756, 0.06438124, 0.62329489, 0.92799422, 0.16680802],
       [0.56505058, 0.14176671, 0.

In [27]:
enc_output.shape

(64, 5, 512)

In [28]:
print("Input Sequence Shape:", input_seq.shape)
print("Encoder Output Shape:", enc_output.shape)

Input Sequence Shape: (64, 5)
Encoder Output Shape: (64, 5, 512)


In [39]:
decoder = Decoder(dec_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)
res = decoder(input_seq, enc_output, lookahead_mask=None, padding_mask=None, training=True)

ValueError: Exception encountered when calling MultiHeadAttention.call().

[1mOnly input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: 64 (of type <class 'int'>)[0m

Arguments received by MultiHeadAttention.call():
  • queries=tf.Tensor(shape=(64, 5, 512), dtype=float32)
  • keys=tf.Tensor(shape=(64, 5, 512), dtype=float32)
  • values=tf.Tensor(shape=(64, 5, 512), dtype=float32)
  • mask=None

In [17]:
res.shape

NameError: name 'res' is not defined

In [24]:
res

<tf.Tensor: shape=(64, 5, 512), dtype=float32, numpy=
array([[[-1.0012507 , -0.02040063,  0.00654149, ...,  0.00182297,
         -0.42907837, -0.6750074 ],
        [-0.9141136 , -0.07876357,  0.03269473, ...,  0.02028901,
         -0.41397125, -0.6715221 ],
        [-0.8649395 , -0.19943327,  0.00265713, ...,  0.04040005,
         -0.37729064, -0.67488515],
        [-0.9230531 , -0.2717329 , -0.06383975, ...,  0.04616989,
         -0.34594226, -0.6787786 ],
        [-1.0300083 , -0.24937628, -0.1109768 , ...,  0.03863088,
         -0.34787044, -0.66987866]],

       [[-1.134145  , -0.16023272, -0.10939384, ...,  0.03158534,
         -0.41963074, -0.65775114],
        [-1.0422376 , -0.21190177, -0.07916868, ...,  0.04733095,
         -0.41154853, -0.655486  ],
        [-1.0103201 , -0.32608616, -0.1004246 , ...,  0.04688325,
         -0.3615602 , -0.65161943],
        [-1.0573205 , -0.38737077, -0.1575223 , ...,  0.03170759,
         -0.31814742, -0.65505326],
        [-1.1520594 , -0.3