In [1]:
import tensorflow as tf
import keras
import numpy as np

In [2]:
# Machine translation use case

In [None]:
# Dataset download
#
# Fetch the English-Spanish sentence-pair archive and unpack it. The archive is
# requested over HTTPS (it was plain HTTP before) because the file is extracted
# locally right after download and should not be open to tampering in transit.
dataset = tf.keras.utils.get_file(
    fname="spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)

In [2]:
import pathlib

# Location of the extracted corpus inside the Keras download cache. It is built
# from the current user's home directory instead of a hard-coded
# "C:\Users\<name>\..." path so the notebook runs on other machines as well.
DATASET_DIR = pathlib.Path.home() / ".keras" / "datasets" / "spa-eng"
base_path = pathlib.Path(DATASET_DIR)

text_filepath = base_path / "spa.txt"
# The Spanish side contains accented characters, so decode the file as UTF-8
# explicitly rather than relying on the platform default encoding.
with open(text_filepath, encoding="utf-8") as f:
    # Skip empty lines (normally just the one after the final newline) so the
    # last real pair is kept even when the file has no trailing newline.
    lines = [line for line in f.read().split("\n") if line]

text_pairs = []

for line in lines:
    # Each line holds the English sentence and its Spanish translation,
    # separated by a tab.
    english, spanish = line.split("\t")
    # Wrap the Spanish sentence in explicit start / end markers; the decoder
    # uses them as the first input token and as the stop signal.
    spanish = "[start] " + spanish + " [end]"
    text_pairs.append((english, spanish))

In [4]:
import random
# Spot-check the parsing: print one randomly chosen (english, spanish) pair.
print(random.choice(text_pairs))

("I don't know who wrote it.", '[start] No sé quién lo escribió. [end]')


In [5]:
# Train, Test and Validation split

# Shuffle in place so that the three contiguous slices below are random samples.
random.shuffle(text_pairs)

num_val_samples = int(0.15 * len(text_pairs))  # 15% of the pairs for validation
# Training gets what remains after reserving two validation-sized portions
# (one for validation, one for test).
num_train_samples = len(text_pairs) - 2 * num_val_samples
# The test split is everything after the train and validation slices.
# (Previously computed as train - val, which is not the size of test_pairs.)
num_test_samples = len(text_pairs) - num_train_samples - num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

In [6]:
# We need two separate text vectorization layers, one per language, because the punctuation differs between them. The default standardization would also strip the square brackets from the [start] and [end] markers in the Spanish text, and we need to keep those.

In [6]:
import string
import re

In [7]:
# Characters deleted from the Spanish text: ASCII punctuation plus "¿", minus
# the square brackets, which must survive so that "[start]" and "[end]" stay
# intact.
strip_chars = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")


def custom_standardization(input_string):
    """Lower-case ``input_string`` and remove every character in ``strip_chars``.

    Used as the ``standardize`` callable of the Spanish ``TextVectorization``
    layer.
    """
    lowered = tf.strings.lower(input_string)
    # Regex character class matching any single character to strip.
    strip_pattern = "[" + re.escape(strip_chars) + "]"
    return tf.strings.regex_replace(lowered, strip_pattern, "")

In [8]:
from keras.layers import TextVectorization

In [9]:
vocab_size = 15000     # vocabulary cap passed to both vectorizers as max_tokens
sequence_length = 20   # number of tokens per vectorized English sentence

# English (source) vectorizer; no `standardize` argument is given, so the
# layer's default standardization applies.
source_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Spanish (target) vectorizer. It emits one token more than the source because
# the sequence is later split into two views that are offset by one step: the
# decoder input (all but the last token) and the prediction target (all but
# the first token). Both views then have `sequence_length` tokens.
# The custom standardization keeps the brackets of "[start]" / "[end]".
target_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

# Each vocabulary is learned from the training split only, one per language.
train_english_texts = [english for english, _ in train_pairs]
train_spanish_texts = [spanish for _, spanish in train_pairs]

source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)


In [11]:
# Our data pipeline should return the tuple (inputs, targets),
# where inputs = {"english": the vectorized English sentence, "spanish": the vectorized Spanish sentence without its last token}
# and targets is the same vectorized Spanish sentence offset one step ahead (i.e. without its first token)

In [10]:
# Building the dataset 

batch_size = 64  # sentence pairs per batch


def format_dataset(eng, spa):
    """Vectorize a batch of sentence pairs into ``(inputs, targets)``.

    Args:
        eng: batch of raw English sentences.
        spa: batch of raw Spanish sentences (with "[start]" / "[end]" markers).

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` is a dict with keys
        ``"english"`` (source token ids) and ``"spanish"`` (target token ids
        without the last position), and ``targets`` is the target token ids
        without the first position, i.e. the decoder input shifted one step
        ahead. Both Spanish views have the same length.
    """
    source_tokens = source_vectorization(eng)
    target_tokens = target_vectorization(spa)
    decoder_inputs = target_tokens[:, :-1]   # drop the last token
    decoder_targets = target_tokens[:, 1:]   # drop the first token
    model_inputs = {"english": source_tokens, "spanish": decoder_inputs}
    return model_inputs, decoder_targets

def make_dataset(pairs):
    """Build a batched, vectorized ``tf.data.Dataset`` from sentence pairs.

    Args:
        pairs: non-empty sequence of ``(english, spanish)`` string tuples.

    Returns:
        A dataset yielding ``(inputs, targets)`` batches as produced by
        ``format_dataset``.

    Raises:
        ValueError: if ``pairs`` is empty.
    """
    if not pairs:
        raise ValueError("make_dataset() needs at least one (english, spanish) pair")
    # Unzip the pairs into parallel lists of English and Spanish sentences.
    eng_texts, spa_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(spa_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Order matters here:
    #   1. cache()    - store the vectorized batches so the text is tokenized once;
    #   2. shuffle()  - placed after the cache so the batch order is re-drawn on
    #                   every epoch (caching after the shuffle replayed one
    #                   fixed order for all epochs);
    #   3. prefetch() - last, so it overlaps input preparation with training.
    return dataset.cache().shuffle(2048).prefetch(16)

train_dataset = make_dataset(train_pairs)

val_dataset = make_dataset(val_pairs)

In [11]:
# Look at a single batch and print the shape of each tensor the pipeline yields:
# English input, Spanish decoder input, then the targets.
for features, labels in train_dataset.take(1):
    for tensor in (features["english"], features["spanish"], labels):
        print(tensor.shape)

(64, 20)
(64, 20)
(64, 20)


In [18]:
# RNN Model : Need two RNN components(encoder and decoder) --> encoder will turn the entire sequence into a single or set of vectors ---> This single or set of vectors will be used as initial state
# for the decoder, which will look at elements 0 to N in target sequence and try to predict the N+1 token

In [19]:
# Using GRU instead of LSTM because it keeps things simpler: a GRU carries a single state vector, whereas an LSTM carries two (a hidden state and a cell state)

In [23]:
from keras.layers import GRU, Dense, Input, Embedding, Bidirectional, Dropout
from keras.models import Model

In [26]:
embed_dim = 256     # size of each token embedding vector
latent_dim = 1024   # number of units in the encoder and decoder GRUs


# ---- Encoder -------------------------------------------------------------
# English token ids -> embeddings -> bidirectional GRU. mask_zero=True marks
# id 0 as padding so that downstream layers can skip padded positions.
source = Input(shape=(None,), dtype="int64", name="english")
embedding_layer1 = Embedding(
    input_dim=vocab_size,
    output_dim=embed_dim,
    mask_zero=True,
)(source)
# merge_mode="sum" adds the forward and backward outputs, so the encoded
# sentence is one vector of size latent_dim.
encoded_source = Bidirectional(
    GRU(latent_dim),
    merge_mode="sum",
)(embedding_layer1)

# ---- Decoder -------------------------------------------------------------
# Spanish token ids seen so far -> embeddings -> GRU that returns an output at
# every time step (return_sequences=True), because a next token is predicted
# for each position rather than only for the last one.
past_target = Input(shape=(None,), dtype="int64", name="spanish")
embedding_layer2 = Embedding(
    input_dim=vocab_size,
    output_dim=embed_dim,
    mask_zero=True,
)(past_target)
decoded_gru = GRU(latent_dim, return_sequences=True)

# The encoder's sentence vector is the decoder GRU's initial state: this is
# how the source sentence conditions the generated translation.
decoded_gru_initialized = decoded_gru(
    embedding_layer2,
    initial_state=encoded_source,
)
dropout_layer = Dropout(0.5)(decoded_gru_initialized)
# Softmax over the vocabulary at every target position.
target_next_token_layer = Dense(
    vocab_size,
    activation="softmax",
)(dropout_layer)

# Inputs: English source + Spanish prefix. Output: next-token distribution.
seq2seq_rnn = Model(
    [source, past_target],
    target_next_token_layer,
)

seq2seq_rnn.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 spanish (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 256)    3840000     ['english[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, None, 256)    3840000     ['spanish[0][0]']                
                                                                                            

In [27]:
# Targets are integer token ids, hence the sparse categorical cross-entropy.
# `metrics` is given as a list, the documented form; a bare string is not
# accepted by every Keras version.
seq2seq_rnn.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
seq2seq_rnn.fit(train_dataset, epochs=15, validation_data=val_dataset)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x173f5c50490>

In [28]:
spa_vocab = target_vectorization.get_vocabulary()  # Spanish vocabulary, indexed by token id
spa_index_lookup = dict(enumerate(spa_vocab))      # token id -> word
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    """Greedily translate one English sentence with the RNN seq2seq model.

    Starting from "[start]", the model is asked for the most likely next token
    given the source sentence and the translation produced so far. The token is
    appended and the step repeats until "[end]" is produced or
    ``max_decoded_sentence_length`` tokens have been generated.

    Args:
        input_sentence: raw English sentence (str).

    Returns:
        The decoded Spanish sentence as a string, beginning with "[start]".
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Re-vectorize everything generated so far; it becomes the decoder input.
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        # Call the model directly instead of using predict(): predict() is
        # built for large batched inputs and adds per-call overhead (and a
        # progress bar) on every one of these single-sentence steps.
        next_token_prediction = seq2seq_rnn(
            [tokenized_input_sentence, tokenized_target_sentence],
            training=False,
        )
        # Position i holds the distribution for the token that follows the
        # i+1 tokens currently in decoded_sentence.
        sampled_token_index = int(np.argmax(next_token_prediction[0, i, :]))
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":  # stop as soon as the end marker is produced
            break

    return decoded_sentence

# Translate 20 randomly drawn English test sentences and print each one
# together with the model's output.
test_eng_texts = [english for english, _ in test_pairs]
sampled_sentences = [random.choice(test_eng_texts) for _ in range(20)]
for input_sequence in sampled_sentences:
    print("-------------")
    print(input_sequence)
    print(decode_sequence(input_sequence))

-------------
Good words are worth a lot, but cost almost nothing.
[start] las [UNK] mucho que las mujeres [UNK] mucho pero no son las cosas más tarde [end]
-------------
We all like cycling.
[start] a todos nos gusta el extranjero [end]
-------------
It's dangerous to ignore the signal at a railroad crossing.
[start] es peligroso que [UNK] a la [UNK] del menos me [UNK] [end]
-------------
Tom ate the whole pizza by himself.
[start] tom se comió la solo para ti [end]
-------------
Tom helps Mary because he wants to, not because he has to.
[start] tom le pidió a mary que no le [UNK] porque no lo puedo [end]
-------------
Tell me who you gave your old toolbox to.
[start] dime que le [UNK] que tu otro [UNK] [end]
-------------
I ate a hot dog for lunch.
[start] me tomó un perro que [UNK] [end]
-------------
Stop saying that!
[start] deja de decir eso [end]
-------------
Is your dog mean?
[start] tu perro es tu [end]
-------------
The recent advances in medicine are remarkable.
[start] la 

In [13]:
# Transformer model (More preferred than RNNs since RNNs are less efficient when treating long sentences, but transformers prove to be efficient in them, thus leading to long document analysis possibility)

In [34]:
from keras.layers import MultiHeadAttention,Dense,LayerNormalization,Input,Embedding,Dropout
from keras import Sequential
from keras.models import Model

In [56]:
class TransformerDecoderClass(tf.keras.layers.Layer):
    """Single Transformer decoder block.

    The block applies, in order: masked self-attention over the target
    sequence, cross-attention from the target onto the encoder outputs, and a
    two-layer feed-forward projection. Each sub-layer is followed by a residual
    connection and layer normalization.

    Args:
        embed_dim: size of the token vectors; also used as ``key_dim`` of both
            attention layers and as the output size of the feed-forward block.
        dense_dim: hidden size of the feed-forward block.
        num_heads: number of attention heads in both attention layers.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        # Stored because get_config() reads it; it was missing before, which
        # made get_config() raise AttributeError.
        self.num_heads = num_heads
        self.attention_layer1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_layer2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_block = Sequential([
            Dense(units=dense_dim, activation="relu"),
            Dense(units=embed_dim)
        ])
        self.layer_norm1 = LayerNormalization()
        self.layer_norm2 = LayerNormalization()
        self.layer_norm3 = LayerNormalization()
        self.supports_masking = True  # propagate the input mask to the layer's output

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build an int32 lower-triangular mask of shape (batch, seq_len, seq_len).

        Entry (b, i, j) is 1 when j <= i and 0 otherwise, so target position i
        may attend to itself and to earlier positions but never to later ones.
        This keeps the decoder from copying the token it is asked to predict.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]  # column vector of query positions
        j = tf.range(sequence_length)                 # row vector of key positions
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Tile multiples [batch_size, 1, 1]: repeat the mask once per example
        # and leave the two sequence axes untouched.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype="int32")], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: embedded target sequence fed to the decoder.
            encoder_outputs: encoder representation of the source sequence.
            mask: optional mask for ``inputs`` with shape (batch, target_len);
                assumed to be the padding mask Keras propagates from the
                embedding layer.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        causal_mask = self.get_causal_attention_mask(inputs)

        # Both the causal mask and the padding mask describe *target*
        # positions, so they belong to the target self-attention. Where a
        # padding mask is available, the element-wise minimum combines them:
        # a key position is visible only if it is not in the future and not
        # padding.
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            self_attention_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # No padding information: fall back to the causal mask alone.
            # (Previously this path reached an undefined `padding_mask`.)
            self_attention_mask = causal_mask

        # Self-attention: query, key and value are all the target sequence.
        attention_output_1 = self.attention_layer1(
            query=inputs, key=inputs, value=inputs,
            attention_mask=self_attention_mask)
        attention_output_1 = self.layer_norm1(inputs + attention_output_1)  # residual + norm

        # Cross-attention: the target queries the encoder outputs. The keys
        # here are *source* positions, so the target-derived
        # (target_len, target_len) mask must not be applied: it would hide
        # source tokens according to the target's structure, and it only has a
        # compatible shape when both sequences happen to be equally long.
        # A source padding mask is not available through this signature.
        attention_output_2 = self.attention_layer2(
            query=attention_output_1, key=encoder_outputs, value=encoder_outputs)
        attention_output_2 = self.layer_norm2(attention_output_1 + attention_output_2)  # residual + norm

        proj_output = self.dense_block(attention_output_2)  # position-wise feed-forward
        return self.layer_norm3(attention_output_2 + proj_output)  # residual + norm


#### The mask matrix looks like this :

Sequence Length = 5

   0 1 2 3 4  <-- Token index (j)
  +----------
0 | 1 0 0 0 0
1 | 1 1 0 0 0
2 | 1 1 1 0 0
3 | 1 1 1 1 0
4 | 1 1 1 1 1
^
|
Token index (i)

At (0, 0), the value is 1 because the token at index 0 can attend to itself.
At (1, 0), the value is 1 because the token at index 1 can attend to the token at index 0.
At (2, 3), the value is 0 because the token at index 2 cannot attend to the token at index 3 since 2 < 3.

Original Mask Tensor (Shape: (1, 5, 5)):

[[1 0 0 0 0]
 [1 1 0 0 0]
 [1 1 1 0 0]
 [1 1 1 1 0]
 [1 1 1 1 1]]


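Mult: if batch_size = 3, tf.expand_dims(batch_size, -1) turns the scalar 3 into the 1-D tensor [3].
tf.constant([1, 1], dtype="int32") is the 1-D tensor [1, 1].
tf.concat([...], axis=0) joins these two 1-D tensors end to end, so the two pieces are:
[3]   and   [1, 1]
After concatenation,
mult -->

[3, 1, 1]

(a 1-D tensor holding one repeat count per dimension of the mask: 3 copies along the batch dimension, 1 along each sequence dimension)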

Tiling Process:
We'll tile the mask tensor along the batch dimension according to the tiling multiplier tensor:

The first dimension of the multiplier tensor specifies how many times to repeat the mask tensor along the batch dimension. In this case, it's 3, so we'll have three copies of the mask tensor.
The other dimensions of the multiplier tensor specify how many times to repeat the mask tensor along other dimensions. In this case, it's [1, 1], so the mask tensor will not be tiled along any other dimensions.
Resultant Tiled Mask Tensor (Shape: (3, 5, 5)):
Each example in the batch gets the same mask tensor replicated according to the multiplier tensor:

Batch 1:
[[1 0 0 0 0]
 [1 1 0 0 0]
 [1 1 1 0 0]
 [1 1 1 1 0]
 [1 1 1 1 1]]

Batch 2:
[[1 0 0 0 0]
 [1 1 0 0 0]
 [1 1 1 0 0]
 [1 1 1 1 0]
 [1 1 1 1 1]]

Batch 3:
[[1 0 0 0 0]
 [1 1 0 0 0]
 [1 1 1 0 0]
 [1 1 1 1 0]
 [1 1 1 1 1]]

Each example in the batch receives its own copy of the same mask tensor, replicated according to the tiling multiplier, so the same causal mask is applied to every example independently.


In [22]:
# Functional API for the transformer model

def get_causal_attention_mask(inputs):
    """Return an int32 lower-triangular mask of shape (batch, seq_len, seq_len).

    Entry (b, i, j) is 1 when j <= i and 0 otherwise, so position i can only
    attend to itself and to earlier positions. The batch size and sequence
    length are read from the first two dimensions of ``inputs``.
    """
    dims = tf.shape(inputs)
    num_examples, num_steps = dims[0], dims[1]
    query_positions = tf.range(num_steps)[:, tf.newaxis]
    key_positions = tf.range(num_steps)
    lower_triangle = tf.cast(query_positions >= key_positions, dtype="int32")
    lower_triangle = tf.reshape(lower_triangle, (1, dims[1], dims[1]))
    # Tile multiples [num_examples, 1, 1]: one copy of the mask per example.
    repeats = tf.concat(
        [tf.expand_dims(num_examples, -1), tf.constant([1, 1], dtype="int32")],
        axis=0,
    )
    return tf.tile(lower_triangle, repeats)

def transformer_decoder(num_heads,embed_dim,dense_dim,inputs,encoder_outputs,mask=None):
    """Build a single Transformer decoder block with the functional API.

    Args:
        num_heads: number of attention heads in both attention layers.
        embed_dim: size of the token vectors (also used as ``key_dim``).
        dense_dim: hidden size of the feed-forward block.
        inputs: Keras input tensor holding the embedded target sequence.
        encoder_outputs: Keras input tensor holding the encoder outputs.
        mask: optional Keras input tensor of shape (batch, target_len) marking
            the non-padding target positions.

    Returns:
        A ``Model`` mapping ``[inputs, encoder_outputs]`` (plus ``mask`` when
        one is given) to the decoder block output. Earlier versions built the
        model but returned nothing.
    """
    causal_mask = get_causal_attention_mask(inputs)

    # The causal and padding masks both describe target positions, so they are
    # combined for the target self-attention. Without a padding mask only the
    # causal mask is used (this path used to hit an undefined `padding_mask`).
    if mask is not None:
        padding_mask = tf.cast(mask[:,tf.newaxis,:],dtype="int32")
        self_attention_mask = tf.minimum(padding_mask,causal_mask)
    else:
        self_attention_mask = causal_mask

    # Masked self-attention over the target sequence, then residual + norm.
    attention_layer_1 = MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
    attention_output_1 = attention_layer_1(query=inputs,key=inputs,value=inputs,attention_mask=self_attention_mask)
    attention_output_1 = LayerNormalization()(attention_output_1 + inputs)

    # Cross-attention onto the encoder outputs. Its keys are source positions,
    # so the target-derived mask is intentionally not applied here.
    attention_layer_2 = MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
    attention_output_2 = attention_layer_2(query=attention_output_1,key=encoder_outputs,value=encoder_outputs)
    attention_output_2 = LayerNormalization()(attention_output_1 + attention_output_2)

    # Position-wise feed-forward block, then residual + norm.
    dense_block = Sequential([
        Dense(dense_dim,activation="relu"),
        Dense(embed_dim)
    ])

    proj_output = dense_block(attention_output_2)

    output_layer = LayerNormalization()(proj_output + attention_output_2)

    # Only real tensors may be model inputs; leave `mask` out when it is None.
    model_inputs = [inputs,encoder_outputs]
    if mask is not None:
        model_inputs.append(mask)

    model_functional = Model(inputs = model_inputs,outputs=output_layer)
    return model_functional


In [23]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding plus a learned position embedding.

    Args:
        sequence_length: number of distinct positions the position embedding
            table holds (its ``input_dim``).
        input_dim: vocabulary size of the token embedding.
        output_dim: size of both embedding vectors.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self,sequence_length,input_dim,output_dim,**kwargs):
        # Forward **kwargs to the base class; they used to be dropped, so
        # arguments such as `name` (which get_config() emits and which is
        # passed back on re-creation) were silently ignored.
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_embeddings = Embedding(input_dim=input_dim,output_dim=output_dim)
        self.positional_embeddings = Embedding(input_dim=sequence_length,output_dim=output_dim)

    def call(self,inputs):
        """Return token embeddings with the position embeddings added."""
        length = tf.shape(inputs)[-1]  # length of the incoming sequences
        positions = tf.range(start=0,limit=length,delta=1)  # position ids 0, 1, ..., length - 1
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.positional_embeddings(positions)
        # The position embeddings carry no batch axis; the addition broadcasts
        # them over the batch.
        return embedded_tokens + embedded_positions

    def compute_mask(self,inputs,mask=None):
        """Mark every position whose token id is not 0 (0 being the padding id)."""
        return tf.math.not_equal(inputs,0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
            "output_dim": self.output_dim,
        })
        return config

In [24]:
# Subclass API for Transformer encoder
class TransformerEncoderClass(tf.keras.layers.Layer):
    """Single Transformer encoder block.

    Self-attention over the input sequence followed by a two-layer feed-forward
    projection; each sub-layer is wrapped in a residual connection and layer
    normalization (layer rather than batch normalization, since the data is
    sequential).

    Args:
        embed_dim: size of the input token vectors (also used as ``key_dim``).
        dense_dim: hidden size of the feed-forward block.
        num_heads: number of attention heads.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Kept on the instance so get_config() can report them.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_layer = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward block applied to the attention output; input_shape is
        # declared to match the (sequence, embed_dim) shape of that output.
        feed_forward_layers = [
            Dense(units=dense_dim, activation="relu", input_shape=(None, embed_dim)),
            Dense(units=embed_dim),
        ]
        self.dense_block = Sequential(feed_forward_layers)
        self.layer_norm1 = LayerNormalization()
        self.layer_norm2 = LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply the encoder block to ``inputs``.

        ``mask``, when given, is a 2-D (batch, sequence) mask; an axis is
        inserted so that it can be used as the attention mask.
        """
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]

        # Self-attention: the sequence serves as both query and value (the key
        # defaults to the value).
        attended = self.attention_layer(query=inputs, value=inputs, attention_mask=attention_mask)

        # Residual connection around the attention, then normalization.
        normalized_attention = self.layer_norm1(inputs + attended)

        projected = self.dense_block(normalized_attention)

        # Residual connection around the feed-forward block, then normalization.
        return self.layer_norm2(normalized_attention + projected)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        base_config = super().get_config()
        return {
            **base_config,
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        }


In [57]:
# Now building the full end to end model using the Transformer Subclass API we build earlier

embed_dim = 256    # size of the token / position embedding vectors
dense_dim = 2048   # hidden size of the feed-forward blocks
num_heads = 8      # attention heads per attention layer


# ---- Encoder: English token ids -> positional embedding -> encoder block ----
encoder_inputs = Input(shape=(None,), dtype="int64", name="english")
positional_embedding_layer1 = PositionalEmbedding(
    sequence_length=sequence_length,
    input_dim=vocab_size,
    output_dim=embed_dim,
)(encoder_inputs)
encoder_outputs = TransformerEncoderClass(
    embed_dim=embed_dim,
    dense_dim=dense_dim,
    num_heads=num_heads,
)(positional_embedding_layer1)

# ---- Decoder: Spanish prefix -> positional embedding -> decoder block -------
decoder_inputs = Input(shape=(None,), dtype="int64", name="spanish")
positional_embedding_layer2 = PositionalEmbedding(
    sequence_length=sequence_length,
    input_dim=vocab_size,
    output_dim=embed_dim,
)(decoder_inputs)
# The decoder block receives the embedded target together with the encoder
# outputs.
decoder_outputs = TransformerDecoderClass(
    embed_dim=embed_dim,
    dense_dim=dense_dim,
    num_heads=num_heads,
)(positional_embedding_layer2, encoder_outputs)
dropout_layer = Dropout(0.5)(decoder_outputs)

# Softmax over the vocabulary: one word distribution per output position.
dense_layer = Dense(vocab_size, activation="softmax")(dropout_layer)

transformer_encoder_decoder_model = Model(
    [encoder_inputs, decoder_inputs],
    dense_layer,
)
transformer_encoder_decoder_model.summary()


Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 spanish (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_21 (Posit  (None, None, 256)   3845120     ['english[0][0]']                
 ionalEmbedding)                                                                                  
                                                                                                  
 positional_embedding_22 (Posit  (None, None, 256)   3845120     ['spanish[0][0]']          

In [36]:
from keras.callbacks import ModelCheckpoint

In [58]:
# Targets are integer token ids, hence the sparse categorical cross-entropy.
# `metrics` is given as a list, the documented form; a bare string is not
# accepted by every Keras version.
transformer_encoder_decoder_model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Keep the best checkpoint (lowest validation loss, the callback's default
# monitor). Only the weights are saved, so checkpointing does not depend on the
# custom layers being serializable through get_config().
callbacks_list = [
    ModelCheckpoint(
        filepath="transformer_encoder_decoder",
        save_best_only=True,
        save_weights_only=True,
    )
]

# The callbacks list is now actually passed to fit(); before, it was created
# but never used, so no checkpoint was ever written.
transformer_encoder_decoder_model.fit(
    train_dataset,
    epochs=15,
    validation_data=val_dataset,
    callbacks=callbacks_list,
)

Epoch 1/15


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1f11a654e20>

In [59]:
# Testing our model

spa_vocab = target_vectorization.get_vocabulary()  # Spanish vocabulary, indexed by token id
spa_index_lookup = dict(enumerate(spa_vocab))      # token id -> word
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    """Greedily translate one English sentence with the Transformer model.

    Generation starts from "[start]"; at each step the most probable next token
    is appended, until "[end]" is produced or ``max_decoded_sentence_length``
    tokens have been generated.

    Args:
        input_sentence: raw English sentence (str).

    Returns:
        The decoded Spanish sentence as a string, beginning with "[start]".
    """
    source_tokens = source_vectorization([input_sentence])
    words = ["[start]"]
    for step in range(max_decoded_sentence_length):
        # Vectorize the translation so far and drop the last position, the
        # same slicing the training pipeline applies to the decoder input.
        target_tokens = target_vectorization([" ".join(words)])[:, :-1]
        probabilities = transformer_encoder_decoder_model([source_tokens, target_tokens])
        # Position `step` holds the distribution for the next token.
        best_index = np.argmax(probabilities[0, step, :])
        next_word = spa_index_lookup[best_index]
        words.append(next_word)
        if next_word == "[end]":
            break
    return " ".join(words)


# Translate 20 randomly drawn English test sentences and print each one
# together with the model's output.
test_eng_texts = [english for english, _ in test_pairs]
sampled_sentences = [random.choice(test_eng_texts) for _ in range(20)]
for input_sentence in sampled_sentences:
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
I must get this work done by the day after tomorrow.
[start] debo llegar a este trabajo mañana [end]
-
Lunch is ready.
[start] el almuerzo está listo [end]
-
The people are so friendly.
[start] las personas son tan [UNK] [end]
-
I'm very excited.
[start] estoy muy el [UNK] [end]
-
You know about that, don't you?
[start] sabes acerca de eso no [end]
-
He was standing at the gate.
[start] Él estaba de vuelta a la puerta [end]
-
I want you to meet him in order to hear his opinion.
[start] quiero que lo [UNK] en el que no se haga tu opinión [end]
-
Justice will prevail.
[start] la [UNK] se [UNK] [end]
-
Have your friends deserted you?
[start] tus amigos te gustan tus amigos [end]
-
Tom kept the secret to himself.
[start] tom se fue un secreto para sí mismo [end]
-
There is more money than is needed.
[start] hay más dinero de lo que necesito [end]
-
He's jealous.
[start] está la [UNK] [end]
-
I haven't seen one of these in years.
[start] no he visto uno de estos años en tokio [end]
-
I pr