In [1]:
from keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Dense, Dropout
from keras.models import Model

In [2]:
# Hyperparameters for the encoder, gathered in one cell so they can be tuned in one place.

# Input dimensions
vocab_size = 10_000      # number of distinct token ids (Embedding input_dim)
sequence_length = 50     # number of positions per input sequence (Input shape)

# Encoder block dimensions
embed_dim = 128          # embedding width; also used as key_dim and as the FFN output size
num_heads = 4            # number of attention heads
ff_dim = 512             # units in the first (hidden) feed-forward Dense layer

# Regularization
dropout_rate = 0.1       # rate intended for Dropout layers

In [3]:
# Model entry point: a symbolic tensor with `sequence_length` values per sample
# (presumably integer token ids, given the Embedding layer that consumes it).
inputs = Input(name="Input", shape=(sequence_length,))

In [4]:
# Embedding lookup: maps each of the `vocab_size` possible ids to an `embed_dim`-wide vector.
x = Embedding(
    input_dim=vocab_size,
    output_dim=embed_dim,
    name="Embedding",
)(inputs)

In [6]:
# Self-attention sub-layer: the embedded sequence `x` is passed as both
# query and value, so the sequence attends over itself.
attention_output = MultiHeadAttention(
    num_heads=num_heads,
    key_dim=embed_dim,  # per-head query/key size is set to the full embed_dim here
    name="MultiHeadAttention",
)(x, x)

In [7]:
# Add & Norm: residual connection around the attention sub-layer, followed by
# layer normalization.
# Dropout is applied to the attention output before it is added back to the
# embeddings, so that the `dropout_rate` hyperparameter and the imported
# `Dropout` layer actually take effect (Dropout is only active during training).
# NOTE: this cell rebinds `attention_output`. Re-run the Multi-Head
# Self-Attention cell before re-running this one, otherwise the already
# normalized tensor is fed back in as the sub-layer output.
attention_dropped = Dropout(dropout_rate, name="Attention_Dropout")(attention_output)
attention_output = LayerNormalization(epsilon=1e-6, name="Attention_Add_Norm")(
    x + attention_dropped
)

In [8]:
# Feed-forward sub-layer: expand to `ff_dim` units with ReLU, then project
# back to `embed_dim` units so the result can be added to its input.
ffn_hidden = Dense(units=ff_dim, activation="relu", name="FFN_Dense1")(attention_output)
ffn = Dense(units=embed_dim, name="FFN_Dense2")(ffn_hidden)

In [9]:
# Add & Norm: residual connection around the feed-forward sub-layer, followed
# by layer normalization.
# Dropout is applied to the FFN output before the residual add so that the
# configured `dropout_rate` is actually used (Dropout is only active during
# training).
ffn_dropped = Dropout(dropout_rate, name="FFN_Dropout")(ffn)
encoder_output = LayerNormalization(epsilon=1e-6, name="FFN_Add_Norm")(
    attention_output + ffn_dropped
)

In [10]:
# Wrap the graph running from `inputs` to `encoder_output` in a Keras Model.
encoder_model = Model(
    inputs=inputs,
    outputs=encoder_output,
    name="Transformer_Encoder",
)

In [11]:
# Model summary: print the layers of `encoder_model` (call takes no arguments here,
# so Keras' default summary formatting is used).
encoder_model.summary()