In [1]:
import tensorflow as tf
import keras
import numpy as np

In [2]:
# Self attention mechanism steps : 1. Calculating the attention scores by dot product of the target word vector with each word vector present in the sentence
# 2. Calculating the sum of all the word vectors present in the sentence by their relevance score(attention scores). This will create our new target word vector, which will also contain 
# information about the surrounding of the target word

In [3]:
from keras.activations import softmax

In [4]:
# Self attention mechanism
# Steps performed : Step 1: Initialing the output array
# Step 2 : Iterating through the input sequence so each word can once be the target word
# Step 3 : "scores" is the array that will contain the attention scores for target word with each word vector present in the sequence - Initialized 
# Step 4 : If input sequence is [[1,2,3],[4,5,6],[7,8,9]] , then
#For the first token [1, 2, 3]: -- Target token
#Dot product with itself: 1*1 + 2*2 + 3*3 = 1 + 4 + 9 = 14
#Dot product with the second token [4, 5, 6]: 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
#Dot product with the third token [7, 8, 9]: 1*7 + 2*8 + 3*9 = 7 + 16 + 27 = 50
#So, the attention scores for the first token [1, 2, 3] are [14, 32, 50].
# Step 5 : Once we get the attention scores, we scale them using np.sqrt and normalize them using softmax activation
# Step 6 : We get the weighted vectors of the attention scores as the new vector representation
# Target token --> [1,2,3]
# Attention scores --> [14,32,50] --> sum = 14 + 32 + 50 = 96   
# Therefore, weighted vector_1 of 1st token --> [14/96,32/96,50/96] => [1/6, 1/3, 1/2]
# Similarly, weighted vector_2 of 1st token --> [8/24, 10/24, 12/24]
# Similarly, weighted vector_3 of 1st token --> [35/96, 40/96, 45/96]

# Therfore, the new pivot representation will be [1/6 + 8/24 + 35/96, 1/3 + 10/24 + 40/96, 1/2 + 12/24 + 45/96]  -> For 1st token

# Similarly, the new pivot representation will be calculated for the next word in the input sequence as the target token
# Input sequence is a sequuence with vector representation of the words
def self_attention(input_sequence):
    output = np.zeros(shape=input_sequence) # Initializing the output
    for i , pivot_vector in enumerate(input_sequence): # pivot_vector is each token in the sentence ( Each token in the sentence will be a target token once)
        scores = np.zeros(shape=(len(input_sequence),)) # Initializing the scores (Depends on the number of tokens in input_sequence)
        for j,vector in enumerate(input_sequence):
            scores[j] = np.dot(pivot_vector,vector.T) # Used for computing the attention scores as a dot product between the pivot vector and the remaining vectors (vector.T means transpose )
        scores /= np.sqrt(input_sequence.shape[1]) # Scaling 
        scores = softmax(scores) # Softmax activation function applied
        new_pivot_representation = np.zeros(shape=pivot_vector.shape)
        for j,vector in enumerate(input_sequence):
            new_pivot_representation += scores[j] * vector # This vector representation of each word will contain the information about the surrounding words as well
        output[i] = new_pivot_representation
    return output

### Calculation behind new pivot representation
For the first token [1, 2, 3]:

Weight for the first vector [1, 2, 3]:

    weight_1 = 14 / (14 + 32 + 50 + 68) = 14 / 164 ≈ 0.0854

Weight for the second vector [4, 5, 6]:

    weight_2 = 32 / (14 + 32 + 50 + 68) = 32 / 164 ≈ 0.1951

Weight for the third vector [7, 8, 9]:

    weight_3 = 50 / (14 + 32 + 50 + 68) = 50 / 164 ≈ 0.3049

Weight for the fourth vector [10, 11, 12]:

    weight_4 = 68 / (14 + 32 + 50 + 68) = 68 / 164 ≈ 0.4146

Weighted vector for the first vector [1, 2, 3]:

    weighted_vector_1 = [0.0854, 0.1708, 0.2562]

Weighted vector for the second vector [4, 5, 6]:

    weighted_vector_2 = [0.7804, 0.9756, 1.1707]

Weighted vector for the third vector [7, 8, 9]:

    weighted_vector_3 = [2.1348, 2.4393, 2.7439]

Weighted vector for the fourth vector [10, 11, 12]:

    weighted_vector_4 = [4.1557, 4.5713, 4.9868]


Sum of weighted vectors:

    [0.0854 + 0.7804 + 2.1348 + 4.1557, 
     0.1708 + 0.9756 + 2.4393 + 4.5713,
     0.2562 + 1.1707 + 2.7439 + 4.9868]

New pivot representation:

    [6.4563, 7.6974, 9.9377]

weight_2 = 32 / (14 + 32 + 50 + 68) = 32 / 164 ≈ 0.1951 

Vector_2 = [4 , 5 , 6]

Weighted vector for the second vector:
[4 * 0.1951, 5 * 0.1951, 6 * 0.1951]

In [5]:
# In keras, we use a vectorized implementation of the above since its faster, its present in MultiHeadAttention Layer

In [6]:
from keras.layers import MultiHeadAttention

In [7]:
#num_heads = 4
#embed_dim = 256

#mha_layer = MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
#outputs = mha_layer(inputs,inputs,inputs)

In [8]:
# A transformer was originally created to translate from source sequence to target sequence ( That's why transformers are called sequence to sequence models)

In [9]:
# Multi head attention mechanism : Similar to Seperable Convolutional layers, we split the input query,key and value into different parts, and after applying attention mechanism to them, group them together

In [10]:
# Transformer encoder : Consists of this Multi headed Attention mechanism , since our model is deep so we are required to add residual connections to prevent loss of information and Normalization to run gradient descent faster

In [11]:
# Building Transformer encoder from scratch

In [12]:
from keras.layers import LayerNormalization , Dense, Input, MultiHeadAttention
from keras import Sequential

In [13]:
# Subclass API for Transformer encoder
class TransformerEncoderClass(tf.keras.layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim # Size of input token vectors
        self.dense_dim = dense_dim # Size of dense layer
        self.num_heads = num_heads # Number of heads in multi head attention mechanism
        self.attention_layer = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) # Multi head attention layer
        # Define the Sequential layers with input_shape
        # Output of the multihead attention mechanism will be fed to Dense block
        self.dense_block = Sequential([ 
            Dense(units=dense_dim, activation="relu", input_shape=(None, embed_dim)),  # input_shape should match the output shape of attention_layer
            Dense(units=embed_dim)
        ])
        # Using Layer Normalization instead of Batch Normalization because batch normalization does not work properly with sequence data
        self.layer_norm1 = LayerNormalization()
        self.layer_norm2 = LayerNormalization()

    def call(self, inputs, mask=None): # Call is used to call the class
        if mask is not None:
            mask = mask[:, tf.newaxis, :] # Converting the 2D mask generated by the embedding layer to 3D or 4D as required by the attention layer

        # Calling the attention layer to display the outputs (the standard Transformer architecture primarily uses the input sequence twice for self-attention (once for queries and once for keys and values))
        attention_output = self.attention_layer(inputs, inputs, attention_mask=mask)

        proj_input = self.layer_norm1(inputs + attention_output) # Residual connection between the inputs and output of the attention mechanism layer

        proj_output = self.dense_block(proj_input) # Calling the dense block to display the outputs

        return self.layer_norm2(proj_input + proj_output) # Residual connection between the inputs and output of the Dense block layer

     # Implomenting serialization so that we can save the model(Always include when building custom layers)    
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config


In [None]:
# Functional API for Transformer encoder

def TransformerEncoder(embed_dim, dense_dim, num_heads, inputs, mask=None):
    # Multi-head attention layer
    attention_layer = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    
    # Dense block
    dense_block = Sequential([
        Dense(units=dense_dim, activation="relu"),
        Dense(units=embed_dim)
    ])
    
    # Layer normalization layers
    layer_norm1 = LayerNormalization()
    layer_norm2 = LayerNormalization()
    
    # Apply mask if provided
    if mask is not None:
        mask = mask[:, tf.newaxis, :]
    
    # Compute attention output
    attention_output = attention_layer(inputs, inputs, attention_mask=mask) 
    
    # Layer normalization and residual connection before dense block
    proj_input = layer_norm1(inputs + attention_output)
    
    # Compute output of the dense block
    proj_output = dense_block(proj_input)
    
    # Layer normalization and residual connection after dense block
    final_output = layer_norm2(proj_input + proj_output)
    
    return final_output


In [None]:
# Workflow of the encoder : Word Vector Embeddings get feeded to Multi headed attention mechanism , the output is then added with the original input as part of residual connection, normalization done,
# The normalized input is then passed to the Dense block(2 Dense layer) , the output of the Dense layers is then added to the normalized input, then the final result is again normalized and sent as output of the encoder

In [14]:
from keras.layers import TextVectorization, Embedding, GlobalMaxPooling1D, Dropout
from keras.models import Model
import pathlib
from keras.callbacks import ModelCheckpoint

In [15]:
vocab_size = 20000
embed_dim = 256
dense_dim = 32
num_heads = 2

input_layer = Input(shape=(None,), dtype="int64")
embedding_layer = Embedding(vocab_size, embed_dim)(input_layer)  # Embedding layer for token IDs
encoder_layer = TransformerEncoderClass(embed_dim,dense_dim,num_heads)(embedding_layer)
max_pool_layer = GlobalMaxPooling1D()(encoder_layer) # Since our encoder returns full sequences and we need to reduce each sequence to a single vector for classification 
dropout_layer = Dropout(0.5)(max_pool_layer)
output_layer = Dense(1, activation="sigmoid")(dropout_layer)

model1 = Model(input_layer, output_layer)
model1.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])


In [16]:
model1.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 256)         5120000   
                                                                 
 transformer_encoder_class (  (None, None, 256)        543776    
 TransformerEncoderClass)                                        
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 257   

In [17]:
IMDB_DATADIR = r'C:\Users\arany\.keras\datasets\aclimdb'

base_dir = pathlib.Path(IMDB_DATADIR)
val_dir = base_dir / "val"
train_dir = base_dir / "train"
batch_size = 32
test_dir = base_dir / "test"

train_dataset = keras.utils.text_dataset_from_directory(train_dir,batch_size=batch_size)
test_dataset = keras.utils.text_dataset_from_directory(test_dir,batch_size=batch_size)
validation_dataset = keras.utils.text_dataset_from_directory(val_dir,batch_size=batch_size)

text_vectorization = TextVectorization(max_tokens=vocab_size,output_mode="int",output_sequence_length=600) # Output mode set as integers, because sequential models use input sequnces as input(integer tokens representing words)

text_only_train_dataset = train_dataset.map(lambda x , y : x)
text_vectorization.adapt(text_only_train_dataset)

int_train_dataset = train_dataset.map(lambda x,y : (text_vectorization(x) , y) , num_parallel_calls=4)

int_test_dataset = test_dataset.map(lambda x,y : (text_vectorization(x), y), num_parallel_calls=4)

int_val_dataset = validation_dataset.map(lambda x,y : (text_vectorization(x), y), num_parallel_calls=4)

Found 20000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.


In [22]:
int_train_dataset

<ParallelMapDataset element_spec=(TensorSpec(shape=(None, 600), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [23]:
callbacks_list1 = [ModelCheckpoint(filepath="transformer_encoder_model",save_best_only=True)]

history1 = model1.fit(int_train_dataset,validation_data=int_val_dataset,epochs=20,callbacks=callbacks_list1)
history1

Epoch 1/20



INFO:tensorflow:Assets written to: transformer_encoder_model\assets


INFO:tensorflow:Assets written to: transformer_encoder_model\assets


Epoch 2/20



INFO:tensorflow:Assets written to: transformer_encoder_model\assets


INFO:tensorflow:Assets written to: transformer_encoder_model\assets


Epoch 3/20



INFO:tensorflow:Assets written to: transformer_encoder_model\assets


INFO:tensorflow:Assets written to: transformer_encoder_model\assets


Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1c702ff9e40>

In [24]:
# Since the Transformer encoder class is a custom layer, we are mentioning it here using the "name" that we got when we did model.summary()
transformer_encoder_model = keras.models.load_model("transformer_encoder_model",custom_objects={"transformer_encoder_model" : TransformerEncoderClass})

In [25]:
transformer_encoder_model.evaluate(int_test_dataset)



[0.277483731508255, 0.8895599842071533]

In [26]:
# The transformer model doesn't have word position information, its only taking set of words sequence information and manually using that ( information about surrounding words using attention scores)
# In order to implement this word position information in the sequence, we do positional encoding( It takes into factor the position of a word in the sequence and adds that to the vector embedding of the word)

In [27]:
# Embedding layer now converted to Positional Encoding layer (contains both vector embedding and position encoding) --> Both the encodings will be learned by the model during training

In [32]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self,sequence_length,input_dim,output_dim,**kwargs): # The sequence length needs to be known because we need to use that as input dimension for the Positional embedding
        super().__init__()
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_embeddings = Embedding(input_dim=input_dim,output_dim=output_dim)
        self.positional_embeddings = Embedding(input_dim=sequence_length,output_dim=output_dim)

    def call(self,inputs):
        length = tf.shape(inputs)[-1] # Retreiving the length of the sequence
        positions = tf.range(start=0,limit=length,delta=1) # List of number positions (1,2,3,4.....length of the sentence)
        embedded_tokens = self.token_embeddings(inputs) # Word embeddings
        embedded_positions = self.positional_embeddings(positions) # Position embeddings
        return embedded_tokens + embedded_positions # Adding word and position embeddings

    def compute_mask(self,inputs,mask=None): # Creating a mask to be able to ignore the zero paddings
        return tf.math.not_equal(inputs,0)

    def get_config(self): # Created so that we can use this custom class later as a layer
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
            "output_dim": self.output_dim,
        })
        return config   

In [38]:
# Now we wll be using this custom Positional Embedding layer and the Transformer Encoder layer classes together to build the Sequence model

vocab_size = 20000
sequence_length = 600
embed_dim = 256
dense_dim = 32
num_heads = 2

input_layer = Input(shape=(None,), dtype="int64")
positional_embedding_layer = PositionalEmbedding(sequence_length=sequence_length,input_dim=vocab_size,output_dim=embed_dim)(input_layer)
encoder_layer = TransformerEncoderClass(embed_dim,dense_dim,num_heads)(positional_embedding_layer)
max_pool_layer = GlobalMaxPooling1D()(encoder_layer) # Since our encoder returns full sequences and we need to reduce each sequence to a single vector for classification 
dropout_layer = Dropout(0.5)(max_pool_layer)
output_layer = Dense(1, activation="sigmoid")(dropout_layer)

model2 = Model(input_layer, output_layer)
model2.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
model2.summary()


Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, None)]            0         
                                                                 
 positional_embedding_2 (Pos  (None, None, 256)        5273600   
 itionalEmbedding)                                               
                                                                 
 transformer_encoder_class_2  (None, None, 256)        543776    
  (TransformerEncoderClass)                                      
                                                                 
 global_max_pooling1d_2 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                           

In [34]:
callbacks_list2 = [ModelCheckpoint(filepath="transformer_encoder_positional",save_best_only=True,monitor="val_loss")]
history2 = model2.fit(int_train_dataset,validation_data = int_val_dataset,epochs=20,callbacks=callbacks_list2)
history2

Epoch 1/20



INFO:tensorflow:Assets written to: transformer_encoder_positional\assets


INFO:tensorflow:Assets written to: transformer_encoder_positional\assets


Epoch 2/20
Epoch 3/20



INFO:tensorflow:Assets written to: transformer_encoder_positional\assets


INFO:tensorflow:Assets written to: transformer_encoder_positional\assets


Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1c70382a920>

In [41]:
# Since the Transformer encoder class is a custom layer, we are mentioning it here using the "name" that we got when we did model.summary()
transformer_encoder_positional_model = keras.models.load_model("transformer_encoder_model",custom_objects={"transformer_encoder_model" : TransformerEncoderClass, "positional_embedding" : PositionalEmbedding})

In [42]:
transformer_encoder_positional_model.evaluate(int_test_dataset)



[0.27748382091522217, 0.8895599842071533]