In [6]:
#pip install tensorflow_datasets

#### Import the libraries

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from keras.layers import *
from keras.models import *
from keras.datasets import imdb
from keras.utils import pad_sequences

### Define the transformer block

In [8]:
class TransofmerBlock(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
       # embed_dim: This parameter specifies the dimentionality of the input and output embeddings
       # num_heads: This parameter controls the number of attention heads in the
       # ff_dim: This parameter specifies the dimensionlity of the feedforward network
       # rate: This parameter specifies the dropout rate
        super().__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"),Dense(embed_dim),]
        )
        # self.ffn: This creates a feedforward network, often used for additional normalization
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        # self.layernorm1 and self.layernorm2: These create LayerNormalization layers
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        # self.dropout1 and self.dropout2: These create Dropout layers, randomly setting
    def call(self, inputs, training):

      attn_output = self.att(inputs, inputs)
      # Applies multi-head attention to the inputs sequences, allowing different

      attn_output = self.dropout1(attn_output, training=training)
      # Applies dropout to the attention output:

      out1 = self.layernorm1(inputs + attn_output)
      # Adds the attention output to the original input and applies layer of Normalization

      ffn_output = self.ffn(out1)
      # Passes the normalized output through the feedforward network

      ffn_output = self.dropout2(ffn_output, training=training)
      # Applies dropout to the feedforward output.

      return self.layernorm2(out1 + ffn_output)
      # Adds the feedforward output to the previous layers output and applies final layer

In [9]:
class TokenAndPositionEmbedding(Layer):
  def __init__(self, maxlen, vocab_size, embed_dim):
    # maxlen : The maximum length of the input sequences the model will handle.
    # vocab_size : The total number of unique tokens (words) in the vocabulary
    # embed_dim : The dimesionality of the embeddings
        # (how each token and its position will be represented as vector)
    super().__init__()
    self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
    # An Embedding layer that maps each token in the input sequence to a dense vector of size embed_size
    self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)
    # An embedding layer that maps each position in the sequence from 0 to maxlen-1 to a dense vector of size embed_dim

  def call(self, x):
    maxlen = tf.shape(x)[-1]
    # extracts the actual length of the current input sequences
    positions = tf.range(start=0, limit=maxlen, delta=1)
    # creates a tensor of positions from 0 to maxlen-1
    positions = self.pos_emb(positions)
    # Looks up the position embeddings for each position in the sequence
    x = self.token_emb(x)
    # Looks up the token embeddings for each token in the input sequence
    return x + positions
    # Adds the token embeddigns and position embeddings
    # resulting in a combined representation that captures both word
    # meaning and position information

In [10]:
vocab_size = 20000 # Only consider the top 20k words
maxlen = 150 # Only consider the first 150 words of each movie review

In [11]:
(x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=vocab_size)
print(len(x_train), 'Training  sequences')
print(len(x_val), 'Validation sequences')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 Training  sequences
25000 Validation sequences


In [12]:
x_train = pad_sequences(x_train, maxlen=maxlen)
x_val = pad_sequences(x_val, maxlen=maxlen)

In [13]:
x_train.shape

(25000, 150)

In [14]:
x_train[4000]

array([ 2143,    48,    13,    69,     6, 12928,    13,    62,    28,
        2564,    12,     8,    98,   634,   908,    10,    10,  2047,
        3423,     9, 14790,    17,     2,     6,    87,  1465,    48,
          25,   377,    27,   478,   157,    11,     2, 18497,    29,
        2010,     4,  2915,     7,  5712, 12710,    83,     6,  3207,
           2,     7,   107,    42,   289,   715,   257,     5,    95,
        9727,     4, 13331,    11,    17, 10846,     5, 13869,  1377,
          17,   614,    11,    14,   365,  1652,     2,     2,   373,
          10,    10,     4,   167,  6184,     2,   287,    64,    35,
           2,  3470,     7,  1489,     4,   370,   121,    12,    80,
         123,   178,    51,    75,   181,     8,    67,     4,   636,
       10227,     9,  3735,  3316,   190,    50,     9,   486,    54,
          11,     6,   303,   548,  6548,   684,  8135,     2,   208,
          11,     4,     2,     2,    95,  5115,     4,  4154,  5425,
         190,   122,

In [15]:
embed_dim = 32
num_heads = 2
ff_dim = 32

inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransofmerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(2, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

In [16]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 150)]             0         
                                                                 
 token_and_position_embeddi  (None, 150, 32)           644800    
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 transofmer_block (Transofm  (None, 150, 32)           10656     
 erBlock)                                                        
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_2 (Dropout)         (None, 32)                0     

In [17]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

history = model.fit(x_train, y_train, batch_size=32, epochs=100, validation_data=(x_val, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78