### Importing the libraries

In [7]:
import tensorflow as tf
import random
import numpy as np
from keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout,TextVectorization, Embedding
from keras.backend import softmax 
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32

In [8]:
import keras
keras.__version__

'2.15.0'

### The Word embedding and Positional encoding layer(uses sinusoidal encoding)

In [2]:
class PositionEmbeddingFixedWeights(Layer):
    def __init__(self, seq_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)
        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
        pos_embedding_matrix = self.get_position_encoding(seq_length, output_dim)
        self.word_embedding_layer = Embedding(input_dim=vocab_size, output_dim=output_dim,weights=[word_embedding_matrix],trainable=False)
        self.position_embedding_layer = Embedding(input_dim=seq_length, output_dim=output_dim,weights=[pos_embedding_matrix],trainable=False)
    
    def get_position_encoding(self, seq_len, d, n=10000):
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        return P
    
    
    def call(self, inputs):
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices

### Attention mechanism

#### Single head attention

In [3]:
class DotProductAttention(Layer):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)
    
    def call(self,queries,keys,values,d_k,mask = None):
        
        # Scoring the queries against the keys after transposing the latter, and scaling
        scores = matmul(queries, keys, transpose_b=True) / math.sqrt(cast(d_k, float32))
        
        # Apply mask to the attention scores
        if mask is not None:
            scores += -1e9 * mask
        
        # Computing the weights by a softmax operation
        weights = softmax(scores)
        
        # Computing the attention by a weighted sum of the value vectors
        return matmul(weights, values)
        
    

#### Multi - head Attention

In [4]:
class MultiHeadAttention(Layer):
    def __init__(self, h, d_k, d_v, d_model, **kwargs):
        super().__init__(**kwargs)
        self.attention = DotProductAttention() # Scaled dot product attention
        self.heads = h # Number of attention heads to use
        self.d_k = d_k # Dimensionality of the linearly projected queries and keys
        self.d_v = d_v # Dimensionality of the linearly projected values
        self.d_model = d_model # Dimensionality of the model
        self.W_q = Dense(d_k) # Learned projection matrix for the queries
        self.W_k = Dense(d_k) # Learned projection matrix for the keys
        self.W_v = Dense(d_v) # Learned projection matrix for the values
        self.W_o = Dense(d_model) # Learned projection matrix for the multi-head output
    def reshape_tensor(self, x, heads, flag):
        if flag:
            # Tensor shape after reshaping and transposing:
            # (batch_size, heads, seq_length, -1)
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
            x = transpose(x, perm=(0, 2, 1, 3))
        else:
            # Reverting the reshaping and transposing operations:
            # (batch_size, seq_length, d_k)
            x = transpose(x, perm=(0, 2, 1, 3))
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_k))
        return x
    
    def call(self, queries, keys, values, mask=None):
        # Rearrange the queries to be able to compute all heads in parallel
        q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange the keys to be able to compute all heads in parallel
        k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange the values to be able to compute all heads in parallel
        v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Compute the multi-head attention output using the reshaped queries,
        # keys, and values
        o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange back the output into concatenated form
        output = self.reshape_tensor(o_reshaped, self.heads, False)
        # Resulting tensor shape: (batch_size, input_seq_length, d_v)
        # Apply one final linear projection to the output to generate the multi-head
        # attention. Resulting tensor shape: (batch_size, input_seq_length, d_model)
        return self.W_o(output)

### Normalization 

In [None]:
# Implementing the Add & Norm Layer
class AddNormalization(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.layer_norm = LayerNormalization() # Layer normalization layer
    def call(self, x, sublayer_x):
        # The sublayer input and output need to be of the same shape to be summed
        add = x + sublayer_x
        # Apply layer normalization to the sum
        return self.layer_norm(add)

### Feed forward layer

In [5]:
# Implementing the Feed-Forward Layer
class FeedForward(Layer):
    def __init__(self, d_ff, d_model, **kwargs):
        super().__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff) # First fully connected layer
        self.fully_connected2 = Dense(d_model) # Second fully connected layer
        self.activation = ReLU() # ReLU activation layer
        
    def call(self, x):
        # The input is passed into the two fully-connected layers, with a ReLU in between
        x_fc1 = self.fully_connected1(x)
        return self.fully_connected2(self.activation(x_fc1))

### Encoder stack

#### A single Encoder layer

In [6]:
# Implementing the Encoder Layer
class EncoderLayer(Layer):
    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super().__init__(**kwargs)
        self.multihead_attention = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
    
    def call(self, x, padding_mask, training):
        # Multi-head attention layer
        multihead_output = self.multihead_attention(x, x, x, padding_mask)
        
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Add in a dropout layer
        multihead_output = self.dropout1(multihead_output, training=training)
        # Followed by an Add & Norm layer
        addnorm_output = self.add_norm1(x, multihead_output)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Add in another dropout layer
        feedforward_output = self.dropout2(feedforward_output, training=training)
        # Followed by another Add & Norm layer
        return self.add_norm2(addnorm_output, feedforward_output)

#### The stacked encoder

In [14]:
# Implementing the Encoder
class Encoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate,
    **kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size,
        d_model)
        self.dropout = Dropout(rate)
        self.encoder_layer = [EncoderLayer(h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]
    
    def call(self, input_sentence, padding_mask, training):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(input_sentence)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)
        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.encoder_layer):
            x = layer(x, padding_mask, training)
        return x

In [15]:
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_ff = 2048 # Dimensionality of the inner fully connected layer
d_model = 512 # Dimensionality of the model sub-layers' outputs
n = 6 # Number of layers in the encoder stack
batch_size = 64 # Batch size from the training process
dropout_rate = 0.1 # Frequency of dropping the input units in the dropout layers

In [19]:
enc_vocab_size = 20 # Vocabulary size for the encoder
input_seq_length = 5 # Maximum length of the input sequence
input_seq = np.random.rand(batch_size, input_seq_length)

In [20]:
input_seq

array([[9.15919889e-01, 4.91635944e-01, 4.40446882e-01, 6.29453773e-01,
        3.66986749e-01],
       [2.93714294e-01, 6.36222268e-01, 6.15577773e-01, 3.46301441e-01,
        9.73837110e-01],
       [3.83080686e-01, 2.36675677e-01, 7.23320078e-04, 3.84908616e-01,
        7.99711894e-01],
       [5.17823415e-01, 9.31756226e-01, 1.33404444e-01, 7.85592246e-01,
        4.95811953e-03],
       [1.10830221e-01, 8.96743341e-01, 7.87448103e-01, 2.73020536e-01,
        3.20464169e-01],
       [1.85161186e-01, 7.39244939e-01, 8.69971472e-01, 5.27118878e-01,
        9.23751793e-01],
       [7.89240884e-01, 5.39702157e-01, 7.91981697e-01, 6.88402317e-01,
        2.84742420e-01],
       [3.24290932e-01, 9.42433417e-01, 3.79823536e-01, 6.49576823e-01,
        9.53947588e-01],
       [4.38068550e-01, 6.06056482e-01, 9.21083213e-01, 2.35299504e-01,
        6.50360131e-01],
       [6.86218976e-01, 2.32017896e-01, 4.04086840e-01, 9.82082295e-01,
        6.57499766e-01],
       [4.53202830e-02, 4.8921

In [25]:
input_seq.shape

(64, 5)

In [22]:
encoder = Encoder(enc_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n,
dropout_rate)
res = encoder(input_seq, None, True)

Running this code produces an output of shape (batch size, sequence length, model
dimensionality). 

In [23]:
print(res)

tf.Tensor(
[[[ 1.2713559   0.839783   -0.61490166 ...  0.4920071  -0.5204461
   -0.82770973]
  [ 0.5069501  -0.10065574 -0.6114606  ...  0.8215117  -0.43704084
   -0.15034983]
  [ 0.5849364   1.1628308  -0.9076203  ...  1.4428228  -1.2228639
   -0.7857101 ]
  [ 1.2106156   1.0229694  -0.95936114 ...  0.5122324  -0.9782004
   -0.66699743]
  [ 1.0806003   0.9080391  -0.6692018  ...  0.34345323 -0.932028
   -0.35415187]]

 [[ 0.8201922   1.1076996  -0.7905818  ... -0.14174259 -1.0566348
   -0.66603184]
  [ 0.7477688   1.0753641  -1.002835   ...  0.5830383  -1.1875114
   -0.6166347 ]
  [ 1.1122088  -0.1251646  -0.6617058  ...  0.90386933 -1.1568978
   -0.3706853 ]
  [-0.15401462  0.5651616  -0.95096487 ...  1.1868485  -0.4684367
   -0.15401462]
  [ 1.164729    0.9010123  -0.40531746 ... -0.13277511 -1.6184456
   -0.03196442]]

 [[ 1.3069713   1.8405747  -0.5914834  ...  1.8612355  -0.5538415
   -0.5636145 ]
  [ 1.444673    1.6484761  -0.8156267  ...  1.0484823  -0.6094352
   -1.0142016 ]
 

In [24]:
res.shape

TensorShape([64, 5, 512])