In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [11]:
!pip install torch torchvision




In [12]:
# Implement multi head attention layer
class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values by three Dense layers
    of width ``embeddim``. Each projection is split into ``numheads`` heads of
    width ``embeddim // numheads``, attention is computed independently per
    head, and the heads are concatenated and passed through a final Dense
    layer of width ``embeddim``.

    Args:
        embeddim: Width of the query/key/value/output projections. Must be
            divisible by ``numheads``.
        numheads: Number of attention heads (default 8).
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``) so
            the layer can be named and rebuilt from its config.

    Raises:
        ValueError: If ``embeddim`` is not divisible by ``numheads``.
    """

    def __init__(self, embeddim, numheads=8, **kwargs):
        super().__init__(**kwargs)
        self.embeddim = embeddim  # embedded dimensions
        self.numheads = numheads  # number of heads
        if embeddim % numheads != 0:
            raise ValueError(
                "embedding dimension should be divisible by no of heads"
            )
        self.projectiondim = embeddim // numheads  # per-head width
        self.querydense = layers.Dense(embeddim)   # query projection
        self.keydense = layers.Dense(embeddim)     # key projection
        self.valuedense = layers.Dense(embeddim)   # value projection
        self.combineheads = layers.Dense(embeddim) # mixes the concatenated heads

    def attention(self, query, key, value):
        """Scaled dot-product attention.

        Args:
            query, key, value: Tensors whose last two axes are
                (sequence, depth); ``call`` passes tensors shaped
                (batch, heads, sequence, projectiondim).

        Returns:
            Tuple ``(output, weights)``: the attention-weighted values and the
            softmax-normalized attention weights.
        """
        score = tf.matmul(query, key, transpose_b=True)
        # Cast the scale to the score dtype rather than a fixed float32 so the
        # division also works when the layer computes in float16/bfloat16.
        dimkey = tf.cast(tf.shape(key)[-1], score.dtype)
        scaledscore = score / tf.math.sqrt(dimkey)
        weights = tf.nn.softmax(scaledscore, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separateheads(self, x, batchsize):
        """Split the last axis into heads.

        Reshapes ``x`` to (batchsize, -1, numheads, projectiondim) and moves
        the head axis in front of the sequence axis, giving
        (batchsize, numheads, -1, projectiondim).
        """
        x = tf.reshape(x, (batchsize, -1, self.numheads, self.projectiondim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs``.

        Args:
            inputs: Tensor with the batch on axis 0; in this notebook it is
                the (batch, sequence, embeddim) output of the embedding layer.

        Returns:
            Tensor reshaped to (batch, -1, embeddim) and passed through the
            output projection. The attention weights are computed but not
            returned.
        """
        batchsize = tf.shape(inputs)[0]
        query = self.querydense(inputs)
        key = self.keydense(inputs)
        value = self.valuedense(inputs)
        # (batch, seq, embeddim) -> (batch, heads, seq, projectiondim)
        query = self.separateheads(query, batchsize)
        key = self.separateheads(key, batchsize)
        value = self.separateheads(value, batchsize)
        attention, weights = self.attention(query, key, value)
        # Put the sequence axis back in front of the head axis so the heads
        # of each position are contiguous before merging them.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concatattention = tf.reshape(
            attention, (batchsize, -1, self.embeddim)
        )
        output = self.combineheads(concatattention)
        return output

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"embeddim": self.embeddim, "numheads": self.numheads})
        return config

In [13]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embeddim: Width of the block's input and output (last axis).
        numheads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer (default 0.1).
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, embeddim, numheads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embeddim = embeddim
        self.numheads = numheads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = MultiHeadSelfAttention(embeddim, numheads)  # attention layer
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embeddim),]
        )                                                     # feed forward layer
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6) # post-attention norm
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6) # post-feed-forward norm
        self.dropout1 = layers.Dropout(rate)                  # dropout on attention output
        self.dropout2 = layers.Dropout(rate)                  # dropout on feed-forward output

    def call(self, inputs, training=None):
        """Run the block.

        Args:
            inputs: Tensor whose last axis has width ``embeddim`` (required by
                the residual additions below).
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the block can be called as ``block(x)``; Keras then decides
                the mode itself.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        attnoutput = self.att(inputs)
        attnoutput = self.dropout1(attnoutput, training=training)
        out1 = self.layernorm1(inputs + attnoutput)   # residual + norm
        ffnoutput = self.ffn(out1)
        ffnoutput = self.dropout2(ffnoutput, training=training)
        return self.layernorm2(out1 + ffnoutput)      # residual + norm

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embeddim": self.embeddim,
                "numheads": self.numheads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [14]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table, i.e. the
            number of distinct positions that can be embedded.
        vocabsize: Number of rows in the token-embedding table.
        embeddim: Width of both embeddings.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, maxlen, vocabsize, embeddim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocabsize = vocabsize
        self.embeddim = embeddim
        self.token_emb = layers.Embedding(input_dim=vocabsize, output_dim=embeddim) # token embedding
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embeddim)      # position embedding

    def call(self, x):
        """Embed token ids and add position embeddings.

        Args:
            x: Integer tensor of token ids; its last axis is the sequence.
                The sequence length should not exceed ``maxlen``, since
                positions beyond it are outside the position table.

        Returns:
            Token embeddings plus position embeddings; the position term is
            broadcast across the leading axes.
        """
        # Runtime sequence length of this batch (not the constructor's maxlen).
        seqlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seqlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocabsize": self.vocabsize,
                "embeddim": self.embeddim,
            }
        )
        return config

In [15]:
vocabsize = 10000  # keep only the 10K most frequent words
maxlen = 250       # every review is padded/truncated to 250 tokens

# IMDB reviews as lists of word indices, with integer labels.
(xtrain, ytrain), (xval, yval) = keras.datasets.imdb.load_data(
    num_words=vocabsize
)
print(len(xtrain), "training")
print(len(xval), "validation")

# Bring both splits to a fixed length of `maxlen`.
xtrain, xval = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (xtrain, xval)
)

25000 training
25000 validation


In [16]:
embeddim = 32  # size of each token embedding vector
numheads = 2   # number of attention heads
ff_dim = 32    # hidden layer size of the transformer feed-forward network

# Token ids -> embeddings -> one transformer block -> pooled classifier head.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(
    maxlen=maxlen, vocabsize=vocabsize, embeddim=embeddim
)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(
    embeddim=embeddim, numheads=numheads, ff_dim=ff_dim
)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)  # collapse the sequence axis
x = layers.Dropout(rate=0.1)(x)
x = layers.Dense(units=20, activation="relu")(x)
x = layers.Dropout(rate=0.1)(x)
outputs = layers.Dense(units=2, activation="softmax")(x)  # two-class probabilities

model = keras.Model(inputs=inputs, outputs=outputs)

In [17]:
# Configure the optimizer/loss, then train for two epochs while evaluating
# on the validation split after each epoch.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

history = model.fit(
    x=xtrain,
    y=ytrain,
    batch_size=32,
    epochs=2,
    validation_data=(xval, yval),
)


Epoch 1/2
Epoch 2/2
