### A machine translation example

In [28]:
import random
import tensorflow as tf
import string
import re
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, TextVectorization, Bidirectional, GRU, Dropout, Dense
import numpy as np

In [2]:
# Relative path to the corpus file; each line is later split on a tab into an
# English sentence and its Spanish translation.
text_file = "spa-eng/spa.txt"

In [3]:
# Accumulates (english, spanish) tuples, one per corpus line.
text_pairs = []

with open(text_file, encoding="utf-8") as f:
    # Splitting on "\n" leaves a trailing element after the final newline;
    # the [:-1] slice drops it.
    lines = f.read().split("\n")[:-1]

for line in lines:
    english, spanish = line.split("\t")
    # Wrap the target sentence in explicit start/end marker tokens.
    spanish = f"[start] {spanish} [end]"
    text_pairs.append((english, spanish))

In [4]:
# Spot-check one randomly chosen (english, spanish) pair.
print(random.choice(text_pairs))

('Can you recommend a good play?', '[start] ¿Puedes recomendar una buena obra de teatro? [end]')


In [5]:
# Shuffle the pairs in place, then size the splits: 15% of the pairs for
# validation, and the training share is whatever remains after setting aside
# two validation-sized portions.
random.shuffle(text_pairs)
num_val_samples = int(len(text_pairs) * 0.15)
num_train_samples = len(text_pairs) - (num_val_samples + num_val_samples)

In [17]:
# Carve the shuffled list into three contiguous, non-overlapping slices:
# training first, then validation, then everything left over for testing.
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]
# Display the first two training pairs as a sanity check.
train_pairs[:2]

[('I have two brothers and one sister.',
  '[start] Tengo dos hermanos y una hermana. [end]'),
 ('When do you usually get off work?',
  '[start] ¿A qué hora sale comúnmente del trabajo? [end]')]

In [7]:
# Characters removed from the Spanish text during standardization: ASCII
# punctuation plus the inverted question mark. Square brackets are excluded so
# the "[start]" and "[end]" markers are left intact.
strip_chars = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")

In [11]:
def standardization(input_string):
    """Lowercase the input and delete every character listed in ``strip_chars``.

    Args:
        input_string: String tensor to normalize.

    Returns:
        The lowercased string tensor with the stripped characters removed.
    """
    # Build a regex character class from the escaped strip list.
    strip_pattern = f"[{re.escape(strip_chars)}]"
    return tf.strings.regex_replace(tf.strings.lower(input_string), strip_pattern, "")

In [9]:
# Passed as `max_tokens` to both TextVectorization layers and used as the
# size of the embedding and output layers.
vocab_size = 15000
# Output length of the source vectorizer; the target vectorizer uses
# sequence_length + 1 so it can be split into shifted inputs and targets.
sequence_length = 20
# Number of sentence pairs per batch in the tf.data pipelines.
batch_size = 64

**Vectorizing the English and Spanish text pairs**

In [10]:
# English-side vectorizer: no custom `standardize` callable is supplied, so
# the layer falls back to its built-in default.
source_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [12]:
# Spanish-side vectorizer: uses the custom `standardization` callable and emits
# one extra token so each sequence can be split into inputs and targets that
# are offset by one step.
target_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=standardization,
)

In [13]:
# Separate the training pairs into parallel lists of source and target text.
train_english_texts = [english_text for english_text, _ in train_pairs]
train_spanish_texts = [spanish_text for _, spanish_text in train_pairs]
# Show the first pair, one sentence per line.
print(train_english_texts[0], train_spanish_texts[0], sep="\n")

I have two brothers and one sister.
[start] Tengo dos hermanos y una hermana. [end]


In [18]:
# Fit each vectorizer on the training texts only; validation and test
# sentences are not passed to `adapt`.
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

**Preparing datasets for the translation task**

In [19]:
def format_dataset(eng, spa):
    """Vectorize a batch of sentence pairs into model inputs and targets.

    Args:
        eng: Batch of English strings.
        spa: Batch of Spanish strings.

    Returns:
        A ``(features, targets)`` tuple. ``features`` is a dict with keys
        ``"english"`` and ``"spanish"``; ``targets`` is the Spanish token
        sequence shifted one step ahead of ``features["spanish"]``.
    """
    eng_tokens = source_vectorization(eng)
    spa_tokens = target_vectorization(spa)

    # Decoder input drops the last token; the target drops the first, so
    # position t of the input lines up with token t + 1 of the target.
    decoder_inputs = spa_tokens[:, :-1]
    decoder_targets = spa_tokens[:, 1:]

    features = {
        "english": eng_tokens,
        "spanish": decoder_inputs,
    }
    return (features, decoder_targets)

In [20]:
def make_dataset(pairs):
    """Build a batched, vectorized ``tf.data`` pipeline from sentence pairs.

    Args:
        pairs: Sequence of ``(english, spanish)`` string tuples.

    Returns:
        A dataset yielding the ``(features, targets)`` batches produced by
        ``format_dataset``, with ``batch_size`` pairs per batch.
    """
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)

    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)

    # Cache the vectorized batches first, then shuffle and prefetch downstream
    # of the cache. Caching after the shuffle would store the first epoch's
    # shuffled order and replay it unchanged on every later epoch.
    return dataset.cache().shuffle(2048).prefetch(16)

In [21]:
# Build the training and validation pipelines from their respective splits.
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [22]:
# Pull a single batch and print the shapes of both inputs and of the targets
# as a sanity check on the pipeline.
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['spanish'].shape: {inputs['spanish'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (64, 20)
inputs['spanish'].shape: (64, 20)
targets.shape: (64, 20)


In [23]:
# Output dimension of the token embedding layers.
embed_dim = 256
# Number of units in the encoder and decoder GRU layers.
latent_dim = 1024

**GRU-based encoder**

In [24]:
# Encoder input: variable-length sequences of integer token ids. The name
# "english" matches the feature key produced by format_dataset.
source = Input(shape=(None,), dtype="int64", name="english")
# mask_zero=True asks the embedding to treat index 0 as padding.
x = Embedding(vocab_size, embed_dim, mask_zero=True)(source)
# Bidirectional GRU whose forward and backward outputs are summed
# (merge_mode="sum"), so the result keeps latent_dim units.
encoded_source = Bidirectional(GRU(latent_dim), merge_mode="sum")(x)

**GRU-based decoder and the end-to-end model**

In [26]:
# Decoder input: the Spanish tokens seen so far. The name "spanish" matches
# the feature key produced by format_dataset.
past_target = Input(shape=(None,), dtype="int64", name="spanish")
x = Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
# return_sequences=True yields an output at every time step.
Decoder = GRU(latent_dim, return_sequences=True)
# The encoder's output is used as the decoder GRU's initial state.
x = Decoder(x, initial_state=encoded_source)
x = Dropout(0.5)(x)
# Softmax over the vocabulary at each step.
target_next_step = Dense(vocab_size, activation="softmax")(x)
# End-to-end model: (source sentence, target so far) -> next-token distribution.
model = Model([source, past_target], target_next_step)

**Training our recurrent sequence-to-sequence model**

In [27]:
# The targets are integer token ids (the shifted output of the target
# vectorizer), hence the sparse variant of categorical cross-entropy.
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
# Train for 15 epochs, evaluating on the validation pipeline after each epoch.
model.fit(train_ds, epochs=15, validation_data=val_ds)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x18535cc3a30>

**Translating new sentences with our RNN encoder and decoder**

In [29]:
# Vocabulary learned by the target vectorizer, in index order.
spa_vocab = target_vectorization.get_vocabulary()
# Map each integer token index back to its vocabulary string.
spa_index_lookup = dict(enumerate(spa_vocab))
# Upper bound on the number of tokens generated per translation.
max_decoded_sentence_length = 20

In [30]:
def decode_sequence(input_sentence):
    """Greedily translate one English sentence, one token at a time.

    Starting from "[start]", the partial translation is re-vectorized and fed
    back to the model at every step; the highest-probability token at the
    current position is appended. Generation stops when "[end]" is produced
    or after ``max_decoded_sentence_length`` tokens.

    Args:
        input_sentence: English sentence as a plain string.

    Returns:
        The decoded string, beginning with "[start]" and ending with "[end]"
        when the model emitted it within the length limit.
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"

    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        # verbose=0: predict runs once per generated token, so a per-call
        # progress bar would swamp the translated output.
        next_token_predictions = model.predict(
            [tokenized_input_sentence, tokenized_target_sentence], verbose=0
        )
        # Position i holds the prediction for the token that follows the
        # i + 1 tokens decoded so far (including "[start]").
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break

    return decoded_sentence

In [31]:
# English side of the held-out test pairs.
test_eng_texts = [english_text for english_text, _ in test_pairs]

# Translate 20 randomly drawn test sentences, printing a separator, the
# source sentence and the decoded translation for each.
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    translation = decode_sequence(input_sentence)
    print(translation)

-
How many hours did you spend with Tom?
[start] cuántas horas [UNK] con tom [end]
-
What's your favorite musical instrument?
[start] cuál es tu [UNK] favorito [end]
-
Is she healthy?
[start] ella está es [end]
-
Let's not pretend otherwise.
[start] no [UNK] lo que [UNK] [end]
-
This is the most interesting story that I have ever read.
[start] esta es la historia más interesante que he visto de leer [end]
-
I've heard of you.
[start] he oído de ti [end]
-
I know it wasn't easy.
[start] sé que no fue fácil [end]
-
We went to the park to play.
[start] fuimos al parque a jugar [end]
-
Can you see what's wrong with this picture?
[start] puedes ver qué está algo esta [UNK] [end]
-
I need to buy stamps.
[start] necesito comprar una de tengo [end]
-
Tom's life was changed forever.
[start] la vida de tom se ha ido por ellos [end]
-
Our team is winning.
[start] nuestro equipo está en qué le va a dar una nosotros [end]
-
I usually get up at 6:00.
[start] yo lo en cómo se me [UNK] a los [UNK] [en

[start] deberías ir a la escuela [end]
-
Don't make any noise.
[start] no hagas ruido [end]
