In [1]:
import tensorflow as tf
import keras
import numpy as np

In [2]:
# Machine translation use case

In [None]:
# Dataset download
# Fetch the English-Spanish sentence-pair archive and unpack it (extract=True).
# The archive is requested over HTTPS so it cannot be tampered with in transit.
# `dataset` receives whatever get_file returns (a local filesystem path).
dataset = tf.keras.utils.get_file(
    fname="spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)

In [3]:
import pathlib

# Directory holding the extracted dataset inside the per-user Keras cache
# (~/.keras/datasets/spa-eng). It is derived from the home directory rather than
# a hardcoded, user-specific absolute path so the notebook runs on other machines.
DATASET_DIR = str(pathlib.Path.home() / ".keras" / "datasets" / "spa-eng")
base_path = pathlib.Path(DATASET_DIR)

text_filepath = base_path / "spa.txt"
# The file holds accented Spanish text, so decode it explicitly as UTF-8 instead
# of relying on the platform default encoding (e.g. cp1252 on Windows).
with open(text_filepath, encoding="utf-8") as f:
    # Drop empty lines (such as the one produced by a trailing newline) without
    # discarding the final sentence when the file has no trailing newline.
    lines = [line for line in f.read().split("\n") if line]

# List of (english, spanish) tuples; each Spanish sentence is wrapped in
# "[start]" / "[end]" marker tokens, which the decoder relies on.
text_pairs = []

for line in lines:
    # Each line is "<english>\t<spanish>"
    english, spanish = line.split("\t")
    spanish = "[start] " + spanish + " [end]"
    text_pairs.append((english, spanish))

In [4]:
import random

# Sanity check: display one randomly chosen (english, spanish) pair
sample_pair = random.choice(text_pairs)
print(sample_pair)

('Turn off the gas.', '[start] Corta el gas. [end]')


In [5]:
# Train, validation and test split (roughly 70% / 15% / 15%)

# NOTE: the shuffle is in place and unseeded, so the split differs between runs.
random.shuffle(text_pairs)

num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
# The test split is everything that is neither training nor validation data.
num_test_samples = len(text_pairs) - num_train_samples - num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

# The three slices must partition the data and match the computed sizes.
assert len(test_pairs) == num_test_samples
assert len(train_pairs) + len(val_pairs) + len(test_pairs) == len(text_pairs)

In [6]:
# We need two separate text vectorization layers, one per language, because the punctuation to strip differs between them. Standardization could also strip the square brackets from the [start] and [end] markers in the Spanish text, and we need to keep those.

In [7]:
import string
import re

In [8]:
# Characters stripped from Spanish text: every ASCII punctuation mark plus "¿",
# except the square brackets, which must survive so that the "[start]" and
# "[end]" markers stay intact.
strip_chars = "".join(
    char for char in string.punctuation + "¿" if char not in ("[", "]")
)


def custom_standardization(input_string):
    """Lower-case the input and delete every character listed in strip_chars.

    Used as the `standardize` callable of the Spanish TextVectorization layer.
    """
    lowered = tf.strings.lower(input_string)
    # Character class matching any single character to be removed
    punctuation_pattern = f"[{re.escape(strip_chars)}]"
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")

In [9]:
from keras.layers import TextVectorization

In [10]:
# Settings shared by both vectorizers
vocab_size = 15000  # keep at most 15,000 tokens in each vocabulary
sequence_length = 20  # length (in tokens) of every vectorized English sentence

# English (source) vectorizer
source_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Spanish (target) vectorizer. It produces one extra token per sentence because
# the sequence is later split into decoder inputs (all but the last token) and
# targets (all but the first token); both then have length `sequence_length`.
# The custom standardization keeps the brackets of the [start]/[end] markers.
target_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

# Each vocabulary is learned from the training split only, one language at a time
train_english_texts = [english_text for english_text, _ in train_pairs]
train_spanish_texts = [spanish_text for _, spanish_text in train_pairs]

source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)


In [11]:
# Our data pipeline should return tuples of the form (inputs, target)
# where inputs = {"english": the vectorized English sentence, "spanish": the vectorized Spanish sentence without its last token}
# and target is the same vectorized Spanish sentence offset one step ahead (i.e. without its first token)

In [12]:
# Building the dataset

batch_size = 64


def format_dataset(eng, spa):
    """Vectorize a batch of sentence pairs into (inputs, targets).

    Args:
        eng: batch of raw English sentences.
        spa: batch of raw Spanish sentences (already wrapped in [start]/[end]).

    Returns:
        A tuple (inputs, targets): inputs is a dict with keys "english" (the
        vectorized English batch) and "spanish" (the vectorized Spanish batch
        without its last token); targets is the vectorized Spanish batch without
        its first token, i.e. shifted one step ahead. Both Spanish slices have
        the same length.
    """
    source_tokens = source_vectorization(eng)
    target_tokens = target_vectorization(spa)
    decoder_inputs = target_tokens[:, :-1]
    next_token_targets = target_tokens[:, 1:]
    features = {"english": source_tokens, "spanish": decoder_inputs}
    return features, next_token_targets

def make_dataset(pairs):
    """Build a batched, vectorized tf.data pipeline from (english, spanish) pairs.

    Args:
        pairs: non-empty sequence of (english, spanish) string tuples.

    Returns:
        A tf.data.Dataset yielding the (inputs, targets) tuples produced by
        format_dataset, in batches of `batch_size` sentences.
    """
    eng_texts, spa_texts = zip(*pairs)  # split the pairs into two parallel sequences
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Order matters here:
    #  - cache() comes first so the vectorized batches are computed once and reused;
    #    caching *after* shuffle() would replay the first epoch's order forever.
    #  - shuffle() follows batch(), so whole batches are reordered on each pass.
    #  - prefetch() is last so upcoming batches are prepared while the model trains.
    return dataset.cache().shuffle(2048).prefetch(16)

# Batched pipelines for the training and validation splits
train_dataset = make_dataset(pairs=train_pairs)
val_dataset = make_dataset(pairs=val_pairs)

In [16]:
# Pull a single batch and print the shape of each tensor it contains:
# the English inputs, the Spanish decoder inputs and the targets, in that order.
for inputs, targets in train_dataset.take(1):
    for batch_tensor in (inputs["english"], inputs["spanish"], targets):
        print(batch_tensor.shape)

(64, 20)
(64, 20)
(64, 20)


In [18]:
# RNN model: we need two RNN components (an encoder and a decoder). The encoder turns the entire source sequence into a single vector (or a set of vectors), which is then used as the initial state
# of the decoder. The decoder looks at tokens 0 to N of the target sequence and tries to predict token N+1.

In [19]:
# Using a GRU instead of an LSTM because it keeps things simpler: a GRU has a single state vector, whereas an LSTM has several

In [23]:
from keras.layers import GRU, Dense, Input, Embedding, Bidirectional, Dropout
from keras.models import Model

In [26]:
embed_dim = 256    # size of each token embedding vector
latent_dim = 1024  # number of units in every GRU layer


# ---- Encoder ----
# Variable-length batch of English token ids
source = Input(shape=(None,), dtype="int64", name="english")
# mask_zero=True so that the padding added to shorter sentences is masked out
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim, mask_zero=True)
embedding_layer1 = encoder_embedding(source)
# merge_mode="sum": the forward and backward GRU outputs are added together,
# giving a single encoded vector per sentence
encoder_rnn = Bidirectional(GRU(latent_dim), merge_mode="sum")
encoded_source = encoder_rnn(embedding_layer1)

# ---- Decoder ----
# Variable-length batch of Spanish token ids (the target sentence so far)
past_target = Input(shape=(None,), dtype="int64", name="spanish")
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim, mask_zero=True)
embedding_layer2 = decoder_embedding(past_target)
# return_sequences=True: an output is needed at every time step, not only the last
decoded_gru = GRU(latent_dim, return_sequences=True)

# The decoder GRU starts from the encoder output, so each prediction is
# conditioned on the encoded source sentence as well as on the target tokens so far
decoded_gru_initialized = decoded_gru(embedding_layer2, initial_state=encoded_source)
dropout_layer = Dropout(0.5)(decoded_gru_initialized)
# Softmax over the vocabulary at every time step: the next-token distribution
target_next_token_layer = Dense(vocab_size, activation="softmax")(dropout_layer)

seq2seq_rnn = Model(inputs=[source, past_target], outputs=target_next_token_layer)

seq2seq_rnn.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 spanish (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 256)    3840000     ['english[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, None, 256)    3840000     ['spanish[0][0]']                
                                                                                            

In [27]:
# `metrics` is passed as a list, the form documented for Model.compile; a bare
# string is rejected by newer Keras versions.
seq2seq_rnn.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Keep the History object so the per-epoch metrics can be inspected afterwards
# instead of leaking it as the cell's repr output.
history = seq2seq_rnn.fit(train_dataset, epochs=15, validation_data=val_dataset)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x173f5c50490>

In [28]:
# Spanish vocabulary learned by the target vectorizer
spa_vocab = target_vectorization.get_vocabulary()
# Maps each token index to the word at that position in the vocabulary
spa_index_lookup = dict(enumerate(spa_vocab))
# Upper bound on the number of tokens generated per translation
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    """Greedily translate one English sentence with the trained seq2seq model.

    Starting from "[start]", the model is asked for the most probable next token
    given the source sentence and the tokens generated so far. Generation stops
    at "[end]" or after `max_decoded_sentence_length` tokens.

    Args:
        input_sentence: raw English sentence (a string).

    Returns:
        The decoded sentence as a string, including the leading "[start]" and,
        when it was produced, the trailing "[end]".
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Re-vectorize everything generated so far to form the decoder input
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        # verbose=0: predict() runs once per generated token, so the default
        # progress bar would flood the cell output.
        next_token_prediction = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence],
            verbose=0,
        )
        # Position i holds the distribution for the token following the i-th input token
        sampled_token_index = np.argmax(next_token_prediction[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break

    return decoded_sentence

# English side of every held-out test pair
test_eng_texts = [english_text for english_text, _ in test_pairs]

# Translate 20 randomly drawn test sentences (drawn with replacement)
num_demo_sentences = 20
for _ in range(num_demo_sentences):
    input_sequence = random.choice(test_eng_texts)
    print("-------------")
    print(input_sequence)
    translation = decode_sequence(input_sequence)
    print(translation)

-------------
Good words are worth a lot, but cost almost nothing.
[start] las [UNK] mucho que las mujeres [UNK] mucho pero no son las cosas más tarde [end]
-------------
We all like cycling.
[start] a todos nos gusta el extranjero [end]
-------------
It's dangerous to ignore the signal at a railroad crossing.
[start] es peligroso que [UNK] a la [UNK] del menos me [UNK] [end]
-------------
Tom ate the whole pizza by himself.
[start] tom se comió la solo para ti [end]
-------------
Tom helps Mary because he wants to, not because he has to.
[start] tom le pidió a mary que no le [UNK] porque no lo puedo [end]
-------------
Tell me who you gave your old toolbox to.
[start] dime que le [UNK] que tu otro [UNK] [end]
-------------
I ate a hot dog for lunch.
[start] me tomó un perro que [UNK] [end]
-------------
Stop saying that!
[start] deja de decir eso [end]
-------------
Is your dog mean?
[start] tu perro es tu [end]
-------------
The recent advances in medicine are remarkable.
[start] la 