# Transformers

Unlike traditional sequence models such as RNNs, transformers leverage self-attention mechanisms to process input data in parallel.

A transformer model consists of two main parts:
1. The encoder
2. The decoder


Both the encoder and the decoder are composed of layers that include self-attention mechanisms and feed-forward neural networks. Self-attention allows the model to weigh the importance of different words in a sentence when encoding a particular word. This is crucial for capturing dependencies between words that are far apart in the input sequence.

The feed-forward neural network layers further transform the data after the self-attention mechanism. Each layer in the encoder and decoder stacks multiple such sub-layers, enabling the model to learn complex representations.

# Self-attention

Self-attention is the core component of the transformer architecture.

It allows each word in the input to attend to every other word, making it possible to capture context and relationships more effectively.

In self-attention, each word is represented by three vectors:
1. Query
2. Key
3. Value

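The attention score is computed as the dot product of the query and key vectors, scaled by the square root of the model dimension and normalized with a softmax. The resulting weights are then used to weigh the value vectors: $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^\top}{\sqrt{d_{\text{model}}}}\right)V$. This process allows the model to focus on different parts of the input sequence when making predictions.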

# The Transformer Encoder

The encoder is composed of:
1. Multiple layers
2. A self-attention mechanism
3. A feed-forward neural network
4. Residual connections
5. Layer normalization
6. Input embeddings that are passed through a positional encoding to preserve the order of the words

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [2]:
class SelfAttention(Layer):
    """Single-head scaled dot-product self-attention.

    The same input tensor is projected into queries, keys and values by three
    separate ``Dense`` layers of width ``d_model``. The softmax of the scaled
    query-key products is then used to weight the values.
    """

    def __init__(self, d_model):
        """Create the three ``d_model``-wide projection layers.

        Args:
            d_model: Output width of the query, key and value projections;
                also used to scale the attention scores.
        """
        super().__init__()
        self.d_model = d_model
        self.query_dense = tf.keras.layers.Dense(d_model)
        self.key_dense = tf.keras.layers.Dense(d_model)
        self.value_dense = tf.keras.layers.Dense(d_model)

    def call(self, inputs):
        """Return the attention-weighted values computed from ``inputs``."""
        queries = self.query_dense(inputs)
        keys = self.key_dense(inputs)
        values = self.value_dense(inputs)

        # Scale the raw query-key products by sqrt(d_model), then normalize
        # over the last (key) axis so each query's weights sum to one.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        scores = tf.matmul(queries, keys, transpose_b=True) / scale
        attention_weights = tf.nn.softmax(scores, axis=-1)

        return tf.matmul(attention_weights, values)

In [3]:
# Smoke test: run the layer on one random batch and print the output shape.
inputs = tf.random.uniform(shape=(1, 60, 512))
self_attention = SelfAttention(d_model=512)
output = self_attention(inputs)
print(output.shape)

(1, 60, 512)


In [None]:
class TransformerEncoder(Layer):
    """One transformer encoder layer.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network. Each of the two sub-layers is followed by dropout, a residual
    connection and layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Build the attention, feed-forward, normalization and dropout sub-layers.

        Args:
            d_model: Output width of the feed-forward network; also passed as
                ``key_dim`` to the attention layer.
            num_heads: Number of attention heads.
            dff: Hidden width of the feed-forward network.
            rate: Dropout rate applied after each sub-layer.
        """
        super().__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model,
        )

        # Position-wise feed-forward network: expand to dff, project back to d_model.
        expand = tf.keras.layers.Dense(dff, activation='relu')
        project = tf.keras.layers.Dense(d_model)
        self.ffn = tf.keras.Sequential([expand, project])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Run the encoder layer on ``x``.

        Args:
            x: Input tensor; used as query, value and key of the attention layer.
            training: Forwarded to the dropout layers.
            mask: Forwarded to the attention layer as ``attention_mask``.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Sub-layer 1: self-attention, dropout, then residual + normalization.
        attended = self.mha(x, x, x, attention_mask=mask)
        attended = self.dropout1(attended, training=training)
        attention_block = self.layernorm1(x + attended)

        # Sub-layer 2: feed-forward, dropout, then residual + normalization.
        transformed = self.ffn(attention_block)
        transformed = self.dropout2(transformed, training=training)
        return self.layernorm2(attention_block + transformed)

In [None]:
# Smoke test: push one random batch through the encoder layer and print the
# output shape. The cell feeds its own `x` and `mask` so that it does not
# depend on variables left over from an earlier cell.
encoder = TransformerEncoder(d_model=512, num_heads=8, dff=2048)
x = tf.random.uniform((1, 60, 512))
mask = None  # no attention mask for this check
output = encoder(x, training=True, mask=mask)
print(output.shape)

# The Transformer Decoder

The decoder is similar to the encoder, but with an additional cross-attention mechanism that attends to the encoder's output.
This allows the decoder to generate sequences based on the context provided by the encoder.

The decoder takes the target sequence as input, applies self-attention and then cross-attention with the encoder output, and finally passes the result through a feed-forward neural network.

In [None]:
class TransformerDecoder(Layer):
    """One transformer decoder layer.

    Applies self-attention over the target sequence, cross-attention over the
    encoder output, and a two-layer feed-forward network. Each of the three
    sub-layers is followed by dropout, a residual connection and layer
    normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Build the attention, feed-forward, normalization and dropout sub-layers.

        Args:
            d_model: Output width of the feed-forward network; also passed as
                ``key_dim`` to both attention layers.
            num_heads: Number of attention heads in each attention layer.
            dff: Hidden width of the feed-forward network.
            rate: Dropout rate applied after each sub-layer.
        """
        super(TransformerDecoder, self).__init__()

        self.mha1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.mha2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Run the decoder layer.

        Args:
            x: Target-sequence tensor fed to the self-attention sub-layer.
            enc_output: Encoder output attended to by the cross-attention sub-layer.
            training: Forwarded to the dropout layers.
            look_ahead_mask: ``attention_mask`` for the self-attention sub-layer.
            padding_mask: ``attention_mask`` for the cross-attention sub-layer.
                NOTE(review): Keras expects this to broadcast to
                (batch, target_len, source_len) -- confirm against the caller.

        Returns:
            The layer-normalized output of the feed-forward sub-layer; it has
            the same sequence length as ``x``.
        """
        attn1 = self.mha1(x, x, x, attention_mask=look_ahead_mask)  # self attention
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)  # residual connection and normalization

        # Cross attention: the decoder state is the query, the encoder output
        # supplies both values and keys. Keras' positional order is
        # (query, value, key), so value/key are passed by keyword to make the
        # roles explicit. With the encoder output as the query, the result
        # would take the encoder's sequence length and the residual below
        # would fail whenever source and target lengths differ.
        attn2 = self.mha2(
            out1,
            value=enc_output,
            key=enc_output,
            attention_mask=padding_mask,
        )
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # residual connection and normalization

        ffn_output = self.ffn(out2)  # feed forward network
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # residual connection and normalization

        return out3