# **Transformer Model**

# Importing Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing
import numpy as np

# Data Preparation and Positional Encoding

In [2]:
# Set parameters for the data.
max_features = 10000
maxlen = 200

# Load the IMDB dataset.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Preprocess the data to ensure all sequences have the same length.
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen, padding="post")
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen, padding="post")

print("Shape of training data:", x_train.shape)
print("Shape of test data:", x_test.shape)

class PositionalEncoding(Layer):
    """
    Adds positional information to the word embeddings.
    """
    def __init__(self, position, d_model, **kwargs):
        super().__init__(**kwargs)
        self.positional_encoding = self.get_positional_encoding(position, d_model)
        self.positional_encoding = tf.cast(self.positional_encoding, dtype=tf.float32)

    def get_positional_encoding(self, position, d_model):
        angle_rads = np.arange(position)[:, np.newaxis] / np.power(10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model))
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, inputs):
        return inputs + self.positional_encoding[:, :tf.shape(inputs)[1], :]


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Shape of training data: (25000, 200)
Shape of test data: (25000, 200)


# Build the Transformer Block

In [3]:
def create_transformer_block(embed_dim, num_heads, ff_dim, rate=0.1):
    # The Multi-Head Attention layer is the heart of the Transformer.
    # It allows the model to attend to different parts of the input sequence.
    attn_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

    # A simple feed-forward network to process the attention output.
    ff_layer = tf.keras.layers.Dense(ff_dim, activation='relu')

    # Layer normalization and dropout are used to stabilize training and prevent overfitting.
    layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    dropout1 = tf.keras.layers.Dropout(rate)
    dropout2 = tf.keras.layers.Dropout(rate)

    inputs = tf.keras.Input(shape=(maxlen, embed_dim))

    # Self-Attention block
    attn_output = attn_layer(inputs, inputs)
    attn_output = dropout1(attn_output)
    norm1_output = layernorm1(inputs + attn_output)

    # Feed-forward block
    ff_output = ff_layer(norm1_output)
    ff_output = dropout2(ff_output)
    outputs = layernorm2(norm1_output + ff_output)

    return Model(inputs=inputs, outputs=outputs)

# Build the Full Transformer Model

In [4]:
def create_transformer_model():
    # Model parameters.
    embed_dim = 32  # Dimension of the word embeddings.
    num_heads = 2   # Number of attention heads.
    ff_dim = 32     # Dimension of the feed-forward layer.

    inputs = tf.keras.Input(shape=(maxlen,))

    # Input embedding and positional encoding.
    embedding_layer = Embedding(max_features, embed_dim)
    x = embedding_layer(inputs)
    x = PositionalEncoding(maxlen, embed_dim)(x)

    # Transformer block. You can stack multiple blocks for a deeper model.
    transformer_block = create_transformer_block(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)

    # Flatten and add the final dense layer for classification.
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = Dense(20, activation="relu")(x)
    outputs = Dense(1, activation="sigmoid")(x)

    return Model(inputs=inputs, outputs=outputs)

# Compile and Train the Model

In [5]:
# Create an instance of the Transformer model.
transformer_model = create_transformer_model()

# Compile the model with an optimizer, a loss function, and metrics to monitor.
transformer_model.compile(optimizer='adam',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

# Train the model on the training data.
history = transformer_model.fit(x_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 239ms/step - accuracy: 0.5976 - loss: 0.6386 - val_accuracy: 0.8596 - val_loss: 0.3260
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 221ms/step - accuracy: 0.8795 - loss: 0.2863 - val_accuracy: 0.8738 - val_loss: 0.2951
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 236ms/step - accuracy: 0.9231 - loss: 0.2007 - val_accuracy: 0.8682 - val_loss: 0.3322
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 217ms/step - accuracy: 0.9526 - loss: 0.1384 - val_accuracy: 0.8674 - val_loss: 0.3611
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 217ms/step - accuracy: 0.9727 - loss: 0.0903 - val_accuracy: 0.8588 - val_loss: 0.4677


# Evaluate the Model

In [6]:
# Evaluate the model on the test dataset to get its final loss and accuracy.
loss, accuracy = transformer_model.evaluate(x_test, y_test)

print(f"\nTest loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 36ms/step - accuracy: 0.8490 - loss: 0.4831

Test loss: 0.4937
Test accuracy: 0.8480
