# Transformers


- Encoder & Decoder
- Self-attention mechanism
- Multi-Head Attention
- Layer Normalization
- Feedforward neural network
- Residual connection
- Positional Encoding (not covered)
- 

# 1 Self-attention 

Enables each input word to attend to every other word, capturing context and relationships more effectively.

Each word is represented by 3 vectors:
- Query
- Key
- Value

**Attention Score** = `Softmax(Query * Key^T / sqrt(d_model))`
- Dot product of query and key
- Used to weigh the value vectors (Attention_score * Value)
- allows the model to focus on different parts of the input sequence when making predictions.

In [1]:
# An example

import tensorflow as tf
from tensorflow.keras.layers import Layer

# Define the Self-Attention mechanism
class SelfAttention(Layer):
    """Single-head scaled dot-product self-attention.

    The input is projected into queries, keys and values by three separate
    Dense layers of width ``d_model``. Attention weights are the softmax
    (over the last axis) of ``Q @ K^T / sqrt(d_model)`` and are used to take
    a weighted sum of the values.

    Args:
        d_model: Output width of the Q/K/V projections; also the value whose
            square root scales the attention scores.
    """

    def __init__(self, d_model):
        super().__init__()
        self.d_model = d_model

        # One linear projection each for queries, keys and values.
        make_projection = tf.keras.layers.Dense
        self.query_dense = make_projection(d_model)
        self.key_dense = make_projection(d_model)
        self.value_dense = make_projection(d_model)

    def call(self, inputs):
        """Apply self-attention to ``inputs`` and return the attended values.

        NOTE(review): the example in this file feeds a
        (batch, sequence, embedding) tensor; other ranks are untested here.
        """
        queries = self.query_dense(inputs)
        keys = self.key_dense(inputs)
        values = self.value_dense(inputs)

        # Scale the raw scores by sqrt(d_model) before the softmax; the key
        # projection has width d_model, so this is the key dimension.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        scores = tf.matmul(queries, keys, transpose_b=True) / scale

        # Normalize over the last axis so each query's weights sum to 1.
        weights = tf.nn.softmax(scores, axis=-1)

        return tf.matmul(weights, values)

# Example usage: push one random batch through the layer and inspect the
# shape of the result.
# Input is (batch size 1, sequence length 60, embedding dimension 512).
inputs = tf.random.uniform(shape=(1, 60, 512))
self_attention_layer = SelfAttention(d_model=512)
output = self_attention_layer(inputs)
# Expected shape: (1, 60, 512)
print("Output shape:", output.shape)



Output shape: (1, 60, 512)


# 2 Encoder

In [None]:
# An example
import tensorflow as tf
from tensorflow.keras.layers import Layer

# Define a Transformer Encoder
class TransformerEncoder(Layer):
    """A single Transformer encoder block.

    The block applies multi-head self-attention followed by a position-wise
    feed-forward network. Each sub-layer is followed by dropout, a residual
    connection and layer normalization.

    Args:
        d_model: Model width. Used as ``key_dim`` of the attention layer and
            as the output width of the feed-forward network, so the last
            dimension of the input must equal ``d_model`` for the residual
            additions to be valid.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TransformerEncoder, self).__init__()

        # Multi-head self-attention: applied to the input sequence
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)

        # Feed-forward network: applied to the output of the self-attention layer
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),  # dff is the dimension of the feed-forward network
            tf.keras.layers.Dense(d_model),
        ])

        # Layer normalization: applied after the self-attention and feed-forward layers
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # Dropout: applied to each sub-layer's output before the residual addition
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training=None, mask=None):
        """Run the encoder block on ``x``.

        Args:
            x: Input sequence; used as query, key and value of the
                self-attention layer.
            training: Forwarded to the dropout layers to switch dropout
                on or off.
            mask: Optional mask forwarded as ``attention_mask`` to the
                attention layer.

        Returns:
            The output of the second Add & Norm step.
        """
        # Multi-head self-attention (optionally masked)
        attn_output = self.mha(x, x, x, attention_mask=mask)
        # Dropout 1
        attn_output = self.dropout1(attn_output, training=training)
        # Add & Norm 1: residual connection and layer normalization
        out1 = self.layernorm1(x + attn_output)

        # Feed-forward network
        ffn_output = self.ffn(out1)
        # Dropout 2
        ffn_output = self.dropout2(ffn_output, training=training)
        # Add & Norm 2: residual connection and layer normalization
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

# 3 Decoder


Similar to encoder, but
- additional cross-attention mechanism to attend to the encoder's output
  - generate sequences based on the context provided by the encoder

The decoder takes the **target sequence** as input, applies self-attention and then cross-attention with the **encoder's output**, and finally passes the result through a feed-forward neural network.

In [None]:
# An example

# Define a Transformer Decoder
class TransformerDecoder(Layer):
    """A single Transformer decoder block.

    The block applies, in order: masked self-attention over the target
    sequence, cross-attention between the result and the encoder output, and
    a position-wise feed-forward network. Each of the three sub-layers has
    its own dropout and layer normalization and is wrapped in a residual
    connection.

    Args:
        d_model: Model width. Used as ``key_dim`` of both attention layers
            and as the output width of the feed-forward network, so the last
            dimension of ``x`` must equal ``d_model`` for the residual
            additions to be valid.
        num_heads: Number of attention heads in each attention layer.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TransformerDecoder, self).__init__()

        # Multi-head attention:
        # mha1 is self-attention over the target sequence (input in this part),
        # mha2 is cross-attention between the target sequence and the encoder output
        self.mha1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.mha2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)

        # Feed-forward network
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        # Residual connection & Layer Normalization: one norm and one dropout
        # per sub-layer (self-attention, cross-attention, feed-forward)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x,
             enc_output,  # output of the encoder
             training,    # training or not
             look_ahead_mask,  # mask for the target sequence
             padding_mask):    # mask for the encoder output
        """Run the decoder block.

        Args:
            x: Target sequence; query, key and value of the self-attention.
            enc_output: Encoder output; key and value of the cross-attention.
            training: Forwarded to the dropout layers to switch dropout
                on or off.
            look_ahead_mask: Passed as ``attention_mask`` to the
                self-attention layer.
            padding_mask: Passed as ``attention_mask`` to the
                cross-attention layer.

        Returns:
            The output of the third Add & Norm step.
        """
        # Masked self attention - on Target Sequence
        attn1 = self.mha1(x, x, x, attention_mask=look_ahead_mask)
        # Dropout
        attn1 = self.dropout1(attn1, training=training)
        # Add & Norm: Residual connection & Layer Normalization
        out1 = self.layernorm1(x + attn1)

        # Cross attention - Encoder output & Target Sequence attention
        # (query comes from the decoder, key/value from the encoder)
        attn2 = self.mha2(out1, enc_output, enc_output, attention_mask=padding_mask)
        # Dropout
        attn2 = self.dropout2(attn2, training=training)
        # Add & Norm: Residual connection & Layer Normalization
        out2 = self.layernorm2(out1 + attn2)

        # Feed-forward network
        ffn_output = self.ffn(out2)
        # Dropout
        ffn_output = self.dropout3(ffn_output, training=training)
        # Add & Norm
        out3 = self.layernorm3(out2 + ffn_output)

        return out3
