## Necessary Imports

In [37]:
import math
import tensorflow as tf

## Tokenizing

In [38]:
def tokenizer(corpus):
  """Build a word-to-id vocabulary from a list of sentences.

  The sentences are joined with spaces, lower-cased and split on whitespace.
  Ids are handed out in order of first appearance, so the mapping is the same
  on every run. (Enumerating a ``set`` of strings instead would tie the ids to
  string hashing, which Python randomizes per process.)

  Args:
    corpus: iterable of strings.

  Returns:
    dict mapping each distinct lower-cased token to a unique int in
    ``range(number_of_distinct_tokens)``.
  """
  words = ' '.join(corpus).lower().split()
  # dict.fromkeys de-duplicates while keeping insertion order.
  return {word: index for index, word in enumerate(dict.fromkeys(words))}

def tokenize(vocab, text):
  """Convert a sentence into the list of vocabulary ids of its words.

  The text is lower-cased and split on whitespace; every word is looked up
  with ``vocab[word]``, so a word that is not a key of ``vocab`` raises
  ``KeyError``.
  """
  return [vocab[word] for word in text.lower().split()]

## Self-Attention

In [39]:
def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d)) @ v``.

    Args:
        q, k, v: tensors multiplied as matrices over their last two axes
            (assumed ``(..., seq_len, depth)`` — confirm against callers).
            ``d`` is the size of the last axis of ``q``.
        mask: optional. Positions where ``mask == 0`` are excluded from the
            softmax. Must be broadcastable to the shape of the score matrix.

    Returns:
        The attention-weighted combination of ``v``.
    """
    scores = tf.matmul(q, k, transpose_b=True)
    # tf.math.sqrt keeps the scaling a tensor op; math.sqrt needs a concrete
    # Python float and therefore only works on eager tensors.
    depth = tf.cast(tf.shape(q)[-1], scores.dtype)
    scores = scores / tf.math.sqrt(depth)
    if mask is not None:
        # Use the most negative finite value rather than -inf: a row that is
        # masked everywhere then softmaxes to a uniform distribution, not NaN.
        masked_value = tf.constant(scores.dtype.min, dtype=scores.dtype)
        scores = tf.where(tf.equal(mask, 0), masked_value, scores)
    attention = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(attention, v)

class SelfAttention(tf.keras.layers.Layer):
    """One attention head: query/key/value Dense projections + scaled dot-product attention."""

    def __init__(self, config):
        super().__init__()
        width = config.head_dim
        # All three projections share the same output width (config.head_dim).
        self.q = tf.keras.layers.Dense(units=width)
        self.k = tf.keras.layers.Dense(units=width)
        self.v = tf.keras.layers.Dense(units=width)

    def call(self, x, mask=None):
        """Project ``x`` into query, key and value and attend over them.

        ``mask`` is forwarded to ``scaled_dot_product`` unchanged.
        """
        return scaled_dot_product(self.q(x), self.k(x), self.v(x), mask)

## Multi-Head Attention

In [40]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Runs several ``SelfAttention`` heads on the same input and mixes their outputs.

    The number of heads is ``config.emb_dim // config.head_dim`` (floor
    division). The head outputs are concatenated along the last axis and
    passed through a Dense layer with ``config.emb_dim`` units.
    """

    def __init__(self, config):
        super().__init__()
        head_count = config.emb_dim // config.head_dim
        self.heads = head_count
        self.attention_heads = [SelfAttention(config) for _ in range(head_count)]
        self.concat_output = tf.keras.layers.Concatenate(axis=-1)
        self.output_linear = tf.keras.layers.Dense(units=config.emb_dim)

    def call(self, inputs, mask=None):
        """Apply every head to ``inputs`` (with the same ``mask``) and project the concatenation."""
        per_head = []
        for head in self.attention_heads:
            per_head.append(head(inputs, mask))
        return self.output_linear(self.concat_output(per_head))

## Feed Forward Layer

In [41]:
class FeedForward(tf.keras.layers.Layer):
    """Feed-forward block: Dense(4 * emb_dim) -> GELU -> Dense(emb_dim) -> Dropout(0.3)."""

    def __init__(self, config):
        super().__init__()
        hidden_units = 4 * config.emb_dim
        # Both Dense layers are linear; the non-linearity is the separate GELU layer.
        self.feed_forward1 = tf.keras.layers.Dense(hidden_units, activation="linear")
        self.feed_forward2 = tf.keras.layers.Dense(config.emb_dim, activation="linear")
        self.gelu = tf.keras.layers.Activation("gelu")
        self.dropout = tf.keras.layers.Dropout(0.3)

    def call(self, x):
        """Expand, apply GELU, project back to ``emb_dim`` and apply dropout."""
        expanded = self.gelu(self.feed_forward1(x))
        return self.dropout(self.feed_forward2(expanded))

## Transformer Encoder Layer

In [42]:
class TransformerEncoderLayer(tf.keras.layers.Layer):
    """One pre-layer-norm encoder block: attention and feed-forward, each with a skip connection.

    Each sub-layer sees a layer-normalized copy of its input, while the skip
    connection carries the *un-normalized* input around it:

        x   = inputs + attention(layer_norm1(inputs))
        out = x      + feed_forward(layer_norm2(x))
    """

    def __init__(self, config):
        super().__init__()
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def call(self, inputs, mask=None):
        """Apply the block to ``inputs``; ``mask`` is forwarded to the attention sub-layer."""
        # The residual must start from `inputs`, not from the normalized
        # tensor, otherwise the identity path through the block is lost.
        normed_inputs = self.layer_norm1(inputs)
        attention_output = inputs + self.attention(normed_inputs, mask)
        normed_attention = self.layer_norm2(attention_output)
        return attention_output + self.feed_forward(normed_attention)

## Embeddings

In [43]:
class Embeddings(tf.keras.layers.Layer):
    """Token embeddings plus learned position embeddings, then LayerNorm and Dropout(0.3).

    Args:
        vocab_size: number of rows of the token embedding table.
        config: object providing ``emb_dim`` (embedding width). If it also
            defines ``max_position_embeddings``, the position table gets that
            many rows; otherwise it falls back to ``emb_dim`` rows, which is
            the size this layer always used before.
    """

    def __init__(self, vocab_size, config):
        super().__init__()
        self.token_embeddings = tf.keras.layers.Embedding(vocab_size, config.emb_dim)
        # The first Embedding argument is the number of distinct indices, i.e.
        # the longest sequence that can be encoded; it is independent of the
        # embedding width, so let the config override it.
        max_positions = getattr(config, "max_position_embeddings", config.emb_dim)
        self.positional_embeddings = tf.keras.layers.Embedding(max_positions, config.emb_dim)
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout = tf.keras.layers.Dropout(0.3)

    def call(self, x):
        """Embed token ids ``x`` and add one position embedding per index along axis 1.

        Assumes ``x`` is ``(batch, seq_len)`` integer ids — axis 1 is read as
        the sequence length; confirm against callers.
        """
        seq_len = tf.shape(x)[1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.positional_embeddings(positions)
        token_embeddings = self.token_embeddings(x)
        # Position embeddings have no batch axis and broadcast over it.
        embeddings = self.layer_norm(token_embeddings + position_embeddings)
        return self.dropout(embeddings)

## Encoder

In [44]:
class TransformerEncoder(tf.keras.layers.Layer):
    """Embedding layer followed by a stack of ``config.no_of_encoders`` encoder layers.

    Args:
        vocab_size: forwarded to ``Embeddings``.
        config: hyper-parameter object shared with every sub-layer.
    """

    def __init__(self, vocab_size, config):
        super().__init__()
        self.embeddings = Embeddings(vocab_size, config)
        self.encoder_layers = [TransformerEncoderLayer(config) for _ in range(config.no_of_encoders)]

    def call(self, x, mask=None):
        """Embed ``x`` and run it through every encoder layer in order.

        The same ``mask`` is handed to each encoder layer.
        """
        hidden = self.embeddings(x)
        for encoder_layer in self.encoder_layers:
            hidden = encoder_layer(hidden, mask)
        return hidden

## Driver Code

In [45]:
class Config:
  """Hyper-parameters shared by the layers of the encoder.

  Args:
    emb_dim: width of the embeddings and of each encoder layer's output.
    head_dim: output width of a single attention head; the number of heads
      is ``emb_dim // head_dim``.
    no_of_encoders: number of stacked encoder layers.

  The defaults reproduce the values that used to be hard-coded, so
  ``Config()`` behaves exactly as before.
  """

  def __init__(self, emb_dim=32, head_dim=8, no_of_encoders=2):
    self.emb_dim = emb_dim
    self.head_dim = head_dim
    self.no_of_encoders = no_of_encoders

config = Config()

In [46]:
corpus = [
    'Time flies like an arrow',
    'fruit flies like a banana',
    'my name is faizan'
]

vocab = tokenizer(corpus)
tokens = tokenize(vocab, corpus[1])

encoder = TransformerEncoder(len(vocab), config)
tokens_tensor = tf.constant([tokens])  # shape (1, seq_len): a batch of one sentence

# The mask must be a tensor that broadcasts against the (batch, seq_len, seq_len)
# attention scores, with 1 = attend and 0 = ignore. A plain int such as
# len(tokens) compares unequal to 0 as a whole and therefore masks nothing.
# There is no padding here, so every position is attended to.
attention_mask = tf.ones_like(tokens_tensor)[:, tf.newaxis, :]

encoder(tokens_tensor, attention_mask).shape

TensorShape([1, 5, 32])