In [262]:
import numpy as np
import tensorflow as tf

In [263]:
# sentence_embedding              - done
# positional encoding             - done
# batch tokenize                  - done
# scaled dot product              - done
# LayerNormalization              - done
# Position wise feed forward      - done
# create masks                    - done
# multi head attention            - done
# multi head cross attention      - done
# encoder layer                   - done
# sequential encoder              - done
# encoder                         - done
# decoder layer                   - done
# sequential decoder              - done
# decoder                         - done
# **Transformer**                 

In [264]:
data = [["The sun rises in the east every morning.",
"She enjoys listening to classical music while studying.",
"He traveled to Japan last summer and explored Kyoto.",
"Cooking is his passion, and he experiments with new recipes regularly.",
"The company announced a new product line to expand its market reach."],

["సూర్యోదయం ప్రతి ఉదయం తూర్పులో జరుగుతుంది.",
"ఆమె పాఠాన్ని చదివితే క్లాసికల్ సంగీతాన్ని వింటారు.",
"అతను గత వర్షం జపాన్‌కు ప్రయాణించాడు మరియు క్యోటోను అన్వేషించాడు.",
"వంటకాలను అతని ఆసక్తిగా చేసుకుంటాడు, మరియు కొత్త రెసిపీలను నియమించడం సాధారణం.",
"కంపెనీ తన మార్కెట్ విస్తరణను పెంచడానికి కొత్త ఉత్పత్తి వారిస్తుంది."]]

class DatasetClass(tf.keras.utils.Sequence):
    def __init__(self, english_sentences, telugu_sentences, batch_size):
        super(DatasetClass, self).__init__()
        self.english_sentences = english_sentences
        self.telugu_sentences = telugu_sentences
        self.batch_size = batch_size

    def __len__(self):
        return (len(self.english_sentences) + self.batch_size - 1) //  self.batch_size

    def __getitem__(self, idx):
        start_idx = idx * self.batch_size
        end_idx = start_idx + self.batch_size
        eng_batch = self.english_sentences[start_idx:end_idx]
        tel_batch = self.telugu_sentences[start_idx:end_idx]
        return eng_batch, tel_batch

In [265]:
dc = DatasetClass(data[0], data[1], 5)

In [267]:
dc[0]

(['The sun rises in the east every morning.',
  'She enjoys listening to classical music while studying.',
  'He traveled to Japan last summer and explored Kyoto.',
  'Cooking is his passion, and he experiments with new recipes regularly.',
  'The company announced a new product line to expand its market reach.'],
 ['సూర్యోదయం ప్రతి ఉదయం తూర్పులో జరుగుతుంది.',
  'ఆమె పాఠాన్ని చదివితే క్లాసికల్ సంగీతాన్ని వింటారు.',
  'అతను గత వర్షం జపాన్\u200cకు ప్రయాణించాడు మరియు క్యోటోను అన్వేషించాడు.',
  'వంటకాలను అతని ఆసక్తిగా చేసుకుంటాడు, మరియు కొత్త రెసిపీలను నియమించడం సాధారణం.',
  'కంపెనీ తన మార్కెట్ విస్తరణను పెంచడానికి కొత్త ఉత్పత్తి వారిస్తుంది.'])

In [268]:
START_TOKEN = ''
END_TOKEN = ''
PADDING_TOKEN = ''

In [269]:
eng_vocab = set()
for sen in data[0]:
    eng_vocab.update(list(sen.lower()))

tel_vocab = set()
for sen in data[1]:
    tel_vocab.update(list(sen.lower()))

eng_vocab = list(eng_vocab) + [START_TOKEN, PADDING_TOKEN, END_TOKEN] 
tel_vocab = list(tel_vocab) + [START_TOKEN, PADDING_TOKEN, END_TOKEN] 

In [270]:
eng_to_ix = {ch:i for i, ch in enumerate(eng_vocab)}
ix_to_eng = {i:ch for i, ch in enumerate(eng_vocab)}
tel_to_ix = {ch:i for i, ch in enumerate(tel_vocab)}
ix_to_tel = {i:ch for i, ch in enumerate(tel_vocab)}

In [271]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def call(self, x):
        sequence_length = x.shape[1]
        d_model = x.shape[2]
        even_i = tf.range(0, d_model, 2, dtype=tf.float32)
        denominator = tf.pow(10000, even_i / d_model)
        position = tf.cast(tf.reshape(tf.range(sequence_length), (sequence_length, 1)), dtype=tf.float32)
        even_PE = tf.sin(position / denominator)
        odd_PE = tf.cos(position / denominator)
        stacked = tf.stack([even_PE, odd_PE], axis=2)
        PE = tf.reshape(stacked, shape=(sequence_length, -1))
        return PE

In [272]:
class SentenceEmbedding(tf.keras.layers.Layer):
    def __init__(self,max_sequence_length, d_model, drop_prob, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super(SentenceEmbedding, self).__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.language_to_index = language_to_index
        self.embedding = tf.keras.layers.Embedding(300, d_model)
        self.positional_encoder = PositionalEncoding()#max_sequence_length, d_model)
        self.dropout = tf.keras.layers.Dropout(drop_prob)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token, end_token):

        def tokenize(sentence, start_token, end_token):
            sentence_char_indices = [self.language_to_index[ch] for ch in list(sentence.lower())]
            if start_token:
                sentence_char_indices.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_char_indices.append(self.language_to_index[self.END_TOKEN])
            for _ in range(len(sentence_char_indices), self.max_sequence_length):
                sentence_char_indices.append(self.language_to_index[self.PADDING_TOKEN])
            return sentence_char_indices

        tokenized = []
        for sentence in batch:
            tokenized.append(tokenize(sentence, start_token, end_token))
        return tf.constant(tokenized)

    def call(self, x, start_token, end_token):
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.positional_encoder(x)
        out = self.dropout(x)
        return x

In [273]:
se = SentenceEmbedding(200, 512, 0.1, eng_to_ix, START_TOKEN, END_TOKEN, PADDING_TOKEN)

In [275]:
a = se(data[0], start_token = False, end_token = False)
a

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[ 0.00934873,  0.0312071 ,  0.00187446, ..., -0.00197504,
         -0.03114487,  0.04096032],
        [ 0.03229113, -0.01250665, -0.02166044, ..., -0.04236313,
         -0.01148643,  0.01966378],
        [ 0.00653084,  0.00442668,  0.03949089, ..., -0.02999088,
          0.02366151, -0.02640178],
        ...,
        [ 0.01483108, -0.02003411, -0.03107834, ...,  0.02375862,
          0.01203339,  0.03533855],
        [ 0.01483108, -0.02003411, -0.03107834, ...,  0.02375862,
          0.01203339,  0.03533855],
        [ 0.01483108, -0.02003411, -0.03107834, ...,  0.02375862,
          0.01203339,  0.03533855]],

       [[ 0.00165099, -0.0282317 , -0.03810468, ..., -0.00775183,
          0.03940971, -0.02272587],
        [ 0.03229113, -0.01250665, -0.02166044, ..., -0.04236313,
         -0.01148643,  0.01966378],
        [ 0.00653084,  0.00442668,  0.03949089, ..., -0.02999088,
          0.02366151, -0.02640178],
        ...,

In [276]:
def scaled_dot_product(q, k, v, mask = None):
    d_k  = q.shape[-1]
    scaled = tf.matmul(q, v, transpose_b=True) / tf.sqrt(tf.cast(d_k, tf.float32))
    if mask is not None:
        scaled = tf.transpose(scaled, perm=[1,0,2,3]) + mask
        scaled = tf.transpose(scaled, perm=[1,0,2,3])
    attention = tf.keras.activations.softmax(scaled, axis=-1)
    values = tf.matmul(attention, v)
    return attention, values

In [277]:
q = tf.random.normal((5, 200, 512))
k = tf.random.normal((5, 200, 512))
v = tf.random.normal((5, 200, 512))

scaled_dot_product(q, k, v)

(<tf.Tensor: shape=(5, 200, 200), dtype=float32, numpy=
 array([[[0.00106442, 0.00244878, 0.00123245, ..., 0.01170628,
          0.00623719, 0.00998638],
         [0.00214754, 0.00035849, 0.004891  , ..., 0.00316554,
          0.00155623, 0.00059329],
         [0.00207958, 0.00537052, 0.00107772, ..., 0.00127546,
          0.00196642, 0.00310425],
         ...,
         [0.00216896, 0.0012453 , 0.00042396, ..., 0.00095403,
          0.00260908, 0.00114504],
         [0.00317748, 0.02739937, 0.00475054, ..., 0.00626936,
          0.00125677, 0.00299646],
         [0.00195552, 0.0082712 , 0.00252123, ..., 0.01649719,
          0.00331436, 0.00402784]],
 
        [[0.00170772, 0.00453184, 0.0029558 , ..., 0.00632331,
          0.00790246, 0.00050766],
         [0.00398971, 0.00578735, 0.00165002, ..., 0.01104287,
          0.01559439, 0.00288609],
         [0.01874736, 0.00236063, 0.0345882 , ..., 0.00086318,
          0.00142355, 0.00286088],
         ...,
         [0.00758179, 0.0011739

In [282]:
class MulitheadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MulitheadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = tf.keras.layers.Dense(3 * d_model)
        self.linear_layer = tf.keras.layers.Dense(d_model)

    def call(self, x, mask):
        batch_size, seq_len, d_model = x.shape
        qkv = self.qkv_layer(x)
        qkv = tf.reshape(qkv, [batch_size, seq_len, self.num_heads, 3 * self.head_dim])
        qkv = tf.transpose(qkv, perm=[0,2,1,3])
        q, k, v = tf.split(qkv, num_or_size_splits=3, axis=-1)
        attention, values = scaled_dot_product(q, k, v, mask)
        values = tf.transpose(values, perm=[0,2,1,3])
        values = tf.reshape(values, [batch_size, seq_len, d_model])
        out = self.linear_layer(values)
        return out

In [283]:
mha = MulitheadAttention(512, 8)

In [284]:
x = tf.random.normal((5, 200, 512))

In [285]:
mha(x, p)

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[ 0.01504253,  0.08207852, -0.18806857, ...,  0.03649726,
          0.16674569, -0.01192651],
        [-0.08213367, -0.03040322, -0.25050715, ..., -0.02492133,
          0.05354537, -0.13416961],
        [-0.00167383, -0.05655375, -0.17083986, ..., -0.1420301 ,
          0.05907283, -0.03343423],
        ...,
        [-0.01221397,  0.03831248, -0.08597168, ..., -0.01828451,
         -0.02090643, -0.00048306],
        [-0.01221399,  0.03831249, -0.08597165, ..., -0.0182845 ,
         -0.02090642, -0.00048306],
        [-0.01221399,  0.03831249, -0.08597165, ..., -0.0182845 ,
         -0.02090642, -0.00048306]],

       [[ 0.07013018,  0.07852001, -0.05133325, ..., -0.04014938,
          0.05742659,  0.13638796],
        [ 0.14489925,  0.04363689, -0.04770201, ..., -0.17569709,
         -0.1315613 ,  0.17929906],
        [ 0.09695606, -0.05710326, -0.10270253, ..., -0.1872873 ,
         -0.02298065,  0.11826328],
        ...,

In [286]:
class LayerNormalization(tf.keras.layers.Layer):
    def __init__(self,param_shape, eps = 1e-5):
        super(LayerNormalization, self).__init__()
        self.param_shape = param_shape
        self.eps = eps
        self.gamma = tf.Variable(tf.ones(param_shape))
        self.beta = tf.Variable(tf.zeros(param_shape))

    def call(self, inputs):
        dims = [-(i+1) for i in range(len(self.param_shape))]
        mean = tf.reduce_mean(inputs, axis= dims, keepdims=True)
        var = tf.reduce_mean((inputs - mean)**2, axis=dims, keepdims=True)
        std = tf.sqrt(var + self.eps)
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out

In [287]:
ln = LayerNormalization([512])

In [288]:
ln(x)

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[-1.0984427 , -1.0887604 , -0.48727182, ..., -0.2596247 ,
         -1.2129011 ,  0.3249844 ],
        [-0.51372004, -0.3527528 , -0.40824893, ..., -0.75974   ,
         -1.1174742 , -1.9838665 ],
        [-2.2375162 , -0.8920584 ,  1.4020891 , ...,  1.1354482 ,
          1.168833  , -0.9863533 ],
        ...,
        [-1.414808  ,  2.183807  , -1.3419497 , ..., -0.21862304,
         -0.09736116, -0.9202662 ],
        [-1.5136845 ,  0.00618266,  0.28880802, ...,  0.60409117,
          0.1662155 , -0.46629637],
        [ 1.2291785 ,  1.277692  ,  1.176331  , ...,  0.03791079,
         -0.09864099, -0.15184437]],

       [[ 1.7060914 ,  0.24484003, -0.40184137, ...,  0.88344365,
         -0.13359421, -0.61691415],
        [ 1.6664728 , -0.3703839 , -0.21923965, ..., -1.0797304 ,
          1.0451243 ,  1.3544583 ],
        [ 1.4749291 , -0.6726234 ,  0.6566001 , ...,  0.7243677 ,
          1.0507255 ,  1.1308199 ],
        ...,

In [289]:
class PositionwiseFeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, ffn_hidden, drop_prob):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = tf.keras.layers.Dense(ffn_hidden, activation='relu')
        self.linear2 = tf.keras.layers.Dense(d_model)
        self.dropout = tf.keras.layers.Dropout(drop_prob)

    def call(self, x):
        x = self.linear1(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [290]:
pff = PositionwiseFeedForward(512, 200, 0.1)

In [291]:
pff(a)

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[-0.01482221, -0.02873298,  0.01518679, ...,  0.0256278 ,
          0.01134379, -0.02773355],
        [ 0.01112475, -0.0311682 ,  0.02047699, ...,  0.00776466,
         -0.01845117, -0.00757068],
        [-0.00546887, -0.02239474,  0.00546525, ...,  0.00492487,
         -0.06505503, -0.01396921],
        ...,
        [ 0.00486233, -0.01954756,  0.01769177, ...,  0.02493312,
         -0.01489168, -0.01056006],
        [ 0.00486233, -0.01954756,  0.01769177, ...,  0.02493312,
         -0.01489168, -0.01056006],
        [ 0.00486233, -0.01954756,  0.01769177, ...,  0.02493312,
         -0.01489168, -0.01056006]],

       [[-0.00408603, -0.00270201,  0.0088633 , ...,  0.01328484,
         -0.01802973, -0.00631885],
        [ 0.01112475, -0.0311682 ,  0.02047699, ...,  0.00776466,
         -0.01845117, -0.00757068],
        [-0.00546887, -0.02239474,  0.00546525, ...,  0.00492487,
         -0.06505503, -0.01396921],
        ...,

In [292]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, ffn_hidden, drop_prob):
        super().__init__()
        self.mha = MulitheadAttention(d_model, num_heads)
        self.dropout1 = tf.keras.layers.Dropout(drop_prob)
        self.norm1 = LayerNormalization([d_model])
        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.dropout2 = tf.keras.layers.Dropout(drop_prob)
        self.norm2 = LayerNormalization([d_model])

    def call(self, x, mask):
        _x = tf.identity(x)
        x = self.mha(x, mask)
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        _x = tf.identity(x)
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + _x)

        return x

In [293]:
el = EncoderLayer(512, 8, 200, 0.1)

In [294]:
b= el(a, p)
b

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[-0.17927198,  0.7596176 ,  0.01053356, ..., -0.7415999 ,
         -0.71357924,  0.00491984],
        [ 1.2492323 , -0.08246657, -0.0847224 , ..., -0.62817055,
         -0.7300947 , -0.23200978],
        [ 1.0437924 ,  0.01342754,  0.95792365, ..., -1.2196497 ,
          0.00640951, -1.1127784 ],
        ...,
        [ 1.2442676 ,  1.2350265 ,  0.31848356, ...,  0.3513386 ,
         -1.1462915 ,  0.7202206 ],
        [ 1.2442672 ,  1.2350272 ,  0.31848386, ...,  0.35133874,
         -1.1462917 ,  0.72021997],
        [ 1.2442672 ,  1.2350272 ,  0.31848386, ...,  0.35133874,
         -1.1462917 ,  0.72021997]],

       [[ 0.01257358, -0.794236  , -1.5063595 , ..., -0.4992905 ,
          0.8486899 , -1.0875169 ],
        [ 1.2264314 , -0.13908793, -0.09473242, ..., -0.6895309 ,
         -0.7083393 , -0.17044045],
        [ 1.0665542 , -0.03729541,  0.9675194 , ..., -1.2364334 ,
          0.11238205, -0.94847226],
        ...,

In [295]:
class SequentialEncoder(tf.keras.Sequential):
    def __init__(self, layers_):
        super(SequentialEncoder, self).__init__()
        self.layers_ = layers_

        # self.trainable = True
        # for layer in self.layers_.layers:
        #     layer.trainable = True

    def call(self, x, mask):
        for layer in self.layers_:
            x = layer(x, mask)
        return x

In [319]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, ffn_hidden, drop_prob, num_layers, max_sequence_length, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super(Encoder, self).__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, drop_prob, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers_ = SequentialEncoder([EncoderLayer(d_model, num_heads, ffn_hidden, drop_prob) for _ in range(num_layers)])

    def call(self, x, mask, start_token, end_token):
        x = self.sentence_embedding(x, start_token, end_token)
        x = self.layers_(x, mask)
        return x

In [320]:
en = Encoder(512, 8, 2048, 0.2, 5, 200, eng_to_ix, '', '', '')

In [321]:
b = en(data[0], p, True, False)
b

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[-0.11947751, -0.808126  , -0.37508708, ...,  0.3813504 ,
         -1.1551932 , -0.02323645],
        [-0.6871771 , -0.580417  , -0.98202103, ..., -1.046966  ,
         -0.6515697 ,  0.96369845],
        [-0.51994026, -0.37923577, -0.7862403 , ..., -0.7736754 ,
          0.18406254,  0.53586996],
        ...,
        [-0.210077  , -0.04256653, -0.8587586 , ...,  0.3016158 ,
         -0.56877416,  0.4258047 ],
        [-0.21007696, -0.04256668, -0.85875934, ...,  0.30161625,
         -0.56877416,  0.42580423],
        [-0.21007696, -0.04256668, -0.85875934, ...,  0.30161625,
         -0.56877416,  0.42580423]],

       [[ 0.3461749 , -1.2330649 , -0.43432003, ...,  0.28887582,
         -1.5846373 , -0.25842524],
        [-0.70319355,  0.19848642, -0.3563104 , ..., -0.80630845,
         -1.6366704 ,  0.51279056],
        [ 0.0824535 , -0.7759392 , -1.1144212 , ..., -1.040468  ,
         -0.4352395 ,  0.08145463],
        ...,

In [299]:
en.get_weights()

[array([[-0.03877078,  0.01752673, -0.03749885, ..., -0.03636763,
          0.00538752, -0.03441108],
        [ 0.02708724, -0.02530009, -0.03529684, ..., -0.02469801,
          0.04000466,  0.03528659],
        [ 0.00535965,  0.00655144,  0.0389531 , ..., -0.04072517,
          0.03006163,  0.02398596],
        ...,
        [-0.02541303, -0.00191172, -0.02625985, ..., -0.03390269,
         -0.04010241, -0.01685697],
        [ 0.04339908, -0.00104833, -0.00323932, ...,  0.04196853,
          0.02887061, -0.00032103],
        [ 0.03803526,  0.00234511, -0.01281928, ..., -0.02392826,
          0.01051598,  0.03420719]], dtype=float32)]

In [300]:
en.load_weights('encoder weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2d8aa571c40>

In [228]:
en.get_weights()

[array([[ 0.00554125, -0.01158991,  0.01657984, ...,  0.04695997,
         -0.04948799,  0.03087907],
        [ 0.00930234,  0.04654985, -0.01283615, ...,  0.00472629,
          0.0097146 ,  0.04627255],
        [-0.04187052,  0.03774012,  0.0383137 , ..., -0.02420974,
          0.01643573, -0.03619011],
        ...,
        [-0.02653729, -0.02238812, -0.00166317, ...,  0.03908445,
         -0.02432957,  0.00939595],
        [ 0.03333708, -0.04846844, -0.02972633, ..., -0.03997279,
         -0.02390128, -0.03751271],
        [-0.017908  ,  0.03431005, -0.02884624, ..., -0.04752916,
         -0.03448067, -0.03953455]], dtype=float32)]

In [301]:
NEG_INFTY = -1e9

def create_mask(eng_batch, tel_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = np.full((200,200), True)
    look_ahead_mask = np.triu(look_ahead_mask, k=1)
    encoder_padding_mask = np.full((num_sentences, 200, 200), False)
    decoder_padding_mask_self_attention = np.full((num_sentences, 200, 200), False)
    decoder_padding_mask_cross_attention = np.full((num_sentences, 200, 200), False)

    for idx in range(num_sentences):
        eng_sen_len, tel_sen_len = len(eng_batch[idx]), len(tel_batch[idx])
        eng_chars_to_padding_mask = np.arange(eng_sen_len + 1, 200)
        tel_chars_to_padding_mask = np.arange(tel_sen_len + 1, 200)
        encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
        encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
        decoder_padding_mask_self_attention[idx, :, tel_chars_to_padding_mask] = True
        decoder_padding_mask_self_attention[idx, tel_chars_to_padding_mask, :] = True
        decoder_padding_mask_cross_attention[idx, :, tel_chars_to_padding_mask] = True
        decoder_padding_mask_cross_attention[idx, tel_chars_to_padding_mask, :] = True

    encoder_padding_mask = np.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_padding_mask_self_attention = np.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_padding_mask_cross_attention = np.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)

    return encoder_padding_mask, decoder_padding_mask_self_attention, decoder_padding_mask_cross_attention

In [302]:
p, q, r = create_mask(data[0], data[1])

In [303]:
p.shape, q.shape, r.shape

((5, 200, 200), (5, 200, 200), (5, 200, 200))

In [304]:
class MultiheadCrossAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiheadCrossAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.q_layer = tf.keras.layers.Dense(d_model)
        self.kv_layer = tf.keras.layers.Dense(2*d_model)
        self.linear_layer = tf.keras.layers.Dense(d_model)

    def call(self, x, y, cross_attention_mask):
        batch_size, seq_len, d_model = y.shape
        q = self.q_layer(y)
        q = tf.reshape(q, [batch_size, seq_len, self.num_heads, self.head_dim ])
        q = tf.transpose(q, perm=[0, 2, 1, 3])
        kv = self.kv_layer(x)
        kv = tf.reshape(kv, [batch_size, seq_len, self.num_heads, 2*self.head_dim])
        kv = tf.transpose(kv, perm=[0,2,1,3])
        k, v = tf.split(kv, num_or_size_splits=2, axis = -1)
        attention, values = scaled_dot_product(q, k, v, cross_attention_mask)
        values = tf.transpose(values, perm=[0,2,1,3])
        values = tf.reshape(values, [batch_size, seq_len, d_model])
        out = self.linear_layer(values)
        return out

In [305]:
mhca = MultiheadCrossAttention(512, 8)

In [306]:
se = SentenceEmbedding(200, 512, 0.1, tel_to_ix, START_TOKEN, END_TOKEN, PADDING_TOKEN)
y = se(data[1], True, True)

In [307]:
mhca(b, y, r)

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[-0.1505366 ,  0.8309417 ,  0.25985107, ..., -0.50802106,
          0.4641287 ,  0.21321014],
        [-0.14984694,  0.832798  ,  0.2611691 , ..., -0.5069573 ,
          0.4631387 ,  0.2140295 ],
        [-0.15438172,  0.83487344,  0.26057932, ..., -0.5049591 ,
          0.46634397,  0.2159032 ],
        ...,
        [-0.34946436,  0.2746668 ,  0.11851574, ..., -0.4005754 ,
         -0.28684056,  0.77087814],
        [-0.34946358,  0.27466747,  0.11851589, ..., -0.40057617,
         -0.28684032,  0.7708779 ],
        [-0.34946358,  0.27466747,  0.11851589, ..., -0.40057617,
         -0.28684032,  0.7708779 ]],

       [[-0.13328582,  0.7836279 ,  0.25602642, ..., -0.59550786,
          0.5007312 ,  0.19610427],
        [-0.1305708 ,  0.78646827,  0.2565706 , ..., -0.5935536 ,
          0.4984933 ,  0.19644198],
        [-0.13450918,  0.7867577 ,  0.25653604, ..., -0.59437597,
          0.50344497,  0.19686885],
        ...,

In [308]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, ffn_hidden, drop_prob):
        super(DecoderLayer, self).__init__()
        self.mha = MulitheadAttention(d_model, num_heads)
        self.drop1 = tf.keras.layers.Dropout(drop_prob)
        self.norm1 = LayerNormalization([d_model])

        self.mhca = MultiheadCrossAttention(d_model, num_heads)
        self.drop2 = tf.keras.layers.Dropout(drop_prob)
        self.norm2 = LayerNormalization([d_model])

        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.drop3 = tf.keras.layers.Dropout(drop_prob)
        self.norm3 = LayerNormalization([d_model])

    def call(self, x, y, self_attention_mask, cross_attention_mask):
        _y = tf.identity(y)
        y = self.mha(y, self_attention_mask)
        y = self.drop1(y)
        y = self.norm1(y + _y)

        _y = tf.identity(y)
        y = self.mhca(x, y, cross_attention_mask)
        y = self.drop2(y)
        y = self.norm3(y + _y)

        _y = tf.identity(y)
        y = self.ffn(y)
        y = self.drop3(y)
        y = self.norm3(y + _y)

        return y

In [309]:
dl = DecoderLayer(512, 8, 2048, 0.2)

In [310]:
dl(b, y, q, r)

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[ 1.63469255e-01, -1.54640210e+00, -6.78564847e-01, ...,
          7.06591070e-01,  4.18409169e-01, -6.37723863e-01],
        [ 3.07701588e-01, -1.74094069e+00, -9.89145637e-01, ...,
          3.81571352e-02,  1.73485756e-01,  4.53577518e-01],
        [-1.93369538e-01, -2.05447719e-01, -1.82240725e+00, ...,
         -8.43080133e-03,  1.98066163e+00, -1.53634119e+00],
        ...,
        [ 5.82890166e-03, -6.87762678e-01, -9.51853156e-01, ...,
          2.95102566e-01,  2.82390237e-01, -7.72865653e-01],
        [ 5.82832191e-03, -6.87763214e-01, -9.51852441e-01, ...,
          2.95102358e-01,  2.82390803e-01, -7.72864938e-01],
        [ 5.82832191e-03, -6.87763214e-01, -9.51852441e-01, ...,
          2.95102358e-01,  2.82390803e-01, -7.72864938e-01]],

       [[ 2.09775865e-01, -1.40397072e+00, -6.89413607e-01, ...,
          7.36814976e-01,  7.44833529e-01, -4.75005329e-01],
        [-1.85229287e-01, -4.45348144e-01, -1.43

In [311]:
class SequentialDecoder(tf.keras.Sequential):
    def __init__(self, layers_):
        super(SequentialDecoder, self).__init__()
        self.layers_ = layers_

    def call(self, x, y, self_attention_mask, cross_attention_mask):
        for layer in self.layers_:
            y = layer(x, y, self_attention_mask, cross_attention_mask)
        return y

In [312]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, ffn_hidden, drop_prob, num_layers, max_seq_len, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super(Decoder, self).__init__()
        self.sentence_embedding = SentenceEmbedding(max_seq_len, d_model, drop_prob, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialDecoder([DecoderLayer(d_model, num_heads, ffn_hidden, drop_prob) for _ in range(num_layers)])

    def call(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token):
        y = self.sentence_embedding(y, start_token, end_token)
        y = self.layers(x, y, self_attention_mask, cross_attention_mask)
        return y

In [313]:
de = Decoder(512, 8, 2048, 0.2, 5, 200, tel_to_ix, '', '', '')

In [314]:
de(b, data[1], q,r, True, True)

<tf.Tensor: shape=(5, 200, 512), dtype=float32, numpy=
array([[[ 1.37031800e-03,  5.76758385e-01,  6.35467947e-01, ...,
          2.45629236e-01,  1.55599749e+00,  4.30920333e-01],
        [-1.66369393e-01,  1.08282542e+00,  5.88491499e-01, ...,
          1.99777484e-01,  1.04868782e+00,  8.92071545e-01],
        [ 1.97014257e-01,  9.41888034e-01,  8.96312118e-01, ...,
          2.18954146e-01,  9.49097574e-01,  6.27372324e-01],
        ...,
        [-4.64140832e-01,  7.06178963e-01,  1.25263083e+00, ...,
         -4.89266694e-01,  1.59530556e+00,  1.00970042e+00],
        [-4.64141279e-01,  7.06180871e-01,  1.25263035e+00, ...,
         -4.89266902e-01,  1.59530449e+00,  1.00970101e+00],
        [-4.64141279e-01,  7.06180871e-01,  1.25263035e+00, ...,
         -4.89266902e-01,  1.59530449e+00,  1.00970101e+00]],

       [[ 3.04728329e-01,  4.17604119e-01,  1.00551105e+00, ...,
          4.94076833e-02,  1.47495937e+00,  3.51048797e-01],
        [-2.52073467e-01,  5.17423868e-01,  1.39

In [315]:
class Transformer(tf.keras.Model):
    def __init__(self, d_model, num_heads, ffn_hidden, drop_prob, num_layers, max_seq_len, tel_vocab_size, eng_to_ix, tel_to_ix, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super(Transformer, self).__init__()
        self.encoder = Encoder(d_model, num_heads, ffn_hidden, drop_prob,num_layers, max_seq_len, eng_to_ix, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.decoder = Decoder(d_model, num_heads, ffn_hidden, drop_prob,num_layers, max_seq_len, tel_to_ix, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.linear = tf.keras.layers.Dense(tel_vocab_size)

    def call(self, x, y, encoder_self_attention_mask = None, decoder_self_attention_mask= None,decoder_cross_attention_mask= None, 
             enc_start_token = False, enc_end_token = False, dec_start_token=False, dec_end_token=False):
        x = self.encoder(x, encoder_self_attention_mask, enc_start_token, enc_end_token)
        out = self.decoder(x, y, decoder_self_attention_mask, decoder_cross_attention_mask, dec_start_token, dec_end_token)
        out = self.linear(out)
        return out

In [316]:
t = Transformer(512, 8, 2048, 0.2, 5, 200,len(tel_to_ix), eng_to_ix, tel_to_ix, '', '', '')

In [317]:
t(dc[0][0], dc[0][1], p, q, r, False, False, True, False)

<tf.Tensor: shape=(5, 200, 39), dtype=float32, numpy=
array([[[ 0.52833587,  1.1580445 ,  1.3764699 , ...,  1.8021618 ,
         -0.2479335 ,  1.4071469 ],
        [ 0.19280773,  1.6708237 ,  1.5258871 , ...,  2.3118284 ,
         -0.20316708,  1.0133984 ],
        [ 0.48645544,  0.8473632 ,  1.0746875 , ...,  2.1770132 ,
         -0.12393856,  1.5643172 ],
        ...,
        [ 0.5194371 ,  0.09409517,  1.7981002 , ...,  1.2983698 ,
         -1.5503206 ,  2.2919197 ],
        [ 0.5194357 ,  0.09409422,  1.7981021 , ...,  1.2983694 ,
         -1.5503199 ,  2.2919197 ],
        [ 0.5194357 ,  0.09409422,  1.7981021 , ...,  1.2983694 ,
         -1.5503199 ,  2.2919197 ]],

       [[ 0.1211826 ,  0.9825478 ,  1.6685649 , ...,  2.7253478 ,
         -0.4401403 ,  2.1896224 ],
        [-0.12330988,  1.3969742 ,  1.7346131 , ...,  2.6804142 ,
         -0.72620296,  1.3807548 ],
        [ 0.40650982,  0.47800332,  2.010179  , ...,  2.9622436 ,
         -0.85474694,  1.0549891 ],
        ...,


In [318]:
t(data[0], data[1], p, q, r, False, False, True, False)

<tf.Tensor: shape=(5, 200, 39), dtype=float32, numpy=
array([[[ 0.52833587,  1.1580445 ,  1.3764699 , ...,  1.8021618 ,
         -0.2479335 ,  1.4071469 ],
        [ 0.19280773,  1.6708237 ,  1.5258871 , ...,  2.3118284 ,
         -0.20316708,  1.0133984 ],
        [ 0.48645544,  0.8473632 ,  1.0746875 , ...,  2.1770132 ,
         -0.12393856,  1.5643172 ],
        ...,
        [ 0.5194371 ,  0.09409517,  1.7981002 , ...,  1.2983698 ,
         -1.5503206 ,  2.2919197 ],
        [ 0.5194357 ,  0.09409422,  1.7981021 , ...,  1.2983694 ,
         -1.5503199 ,  2.2919197 ],
        [ 0.5194357 ,  0.09409422,  1.7981021 , ...,  1.2983694 ,
         -1.5503199 ,  2.2919197 ]],

       [[ 0.1211826 ,  0.9825478 ,  1.6685649 , ...,  2.7253478 ,
         -0.4401403 ,  2.1896224 ],
        [-0.12330988,  1.3969742 ,  1.7346131 , ...,  2.6804142 ,
         -0.72620296,  1.3807548 ],
        [ 0.40650982,  0.47800332,  2.010179  , ...,  2.9622436 ,
         -0.85474694,  1.0549891 ],
        ...,
