In [2]:
#!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
#!unzip -q spa-eng.zip

In [7]:
text_file = 'spa-eng/spa.txt'
with open(text_file) as f:
    lines = f.read().split('\n')[:-1]
text_pairs = []
for line in lines:
    english, spanish = line.split('\t')
    spanish = '[start]' + spanish + '[end]'
    text_pairs.append((english, spanish))

import random
len(text_pairs), random.choice(text_pairs)

(118964,
 ("Tom doesn't know if he will be able to visit us next Monday.",
  '[start]Tom no sabe si nos podrá visitar el próximo lunes.[end]'))

In [10]:
random.shuffle(text_pairs)
n_val_samples = int(0.15 * len(text_pairs))
n_train_samples = len(text_pairs) - 2 * n_val_samples
train_pairs = text_pairs[:n_train_samples]
val_pairs = text_pairs[n_train_samples:(n_train_samples + n_val_samples)]
test_pairs = text_pairs[-n_val_samples:]

len(train_pairs), len(val_pairs), len(test_pairs)

(83276, 17844, 17844)

In [11]:
import tensorflow as tf
import string
import re

2023-09-26 12:27:10.250431: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-26 12:27:10.271873: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-26 12:27:10.272297: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
strip_chars = string.punctuation + '¿'
strip_chars = strip_chars.replace('[', '').replace(']', '')

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, f'[{re.escape(strip_chars)}]', '')

custom_standardization("hello? I don't think")

<tf.Tensor: shape=(), dtype=string, numpy=b'hello i dont think'>

In [32]:
from keras import layers, activations, optimizers, losses, metrics, callbacks

vocab_size = 15000
seq_len = 20

eng_vectorization = layers.TextVectorization(
    max_tokens=vocab_size, output_mode='int', output_sequence_length=seq_len)
spa_vectorization = layers.TextVectorization(
    max_tokens=vocab_size, output_mode='int', output_sequence_length=seq_len + 1, standardize=custom_standardization)

train_eng_text = [pair[0] for pair in train_pairs]
train_spa_text = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_text)
spa_vectorization.adapt(train_spa_text)

spa_vectorization('[start]'), spa_vectorization('[end]')

(<tf.Tensor: shape=(21,), dtype=int64, numpy=array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(21,), dtype=int64, numpy=
 array([6649,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0])>)

In [34]:
def format_dataset(eng, spa):
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return ({
        'eng': eng,
        'spa': spa[:,:-1]
    },
    spa[:,1:])

format_dataset([train_pairs[0][0]], [train_pairs[0][1]])

({'eng': <tf.Tensor: shape=(1, 20), dtype=int64, numpy=
  array([[   3, 3093,    2, 1289,   11,  521,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0]])>,
  'spa': <tf.Tensor: shape=(1, 20), dtype=int64, numpy=
  array([[   1,    5, 1090,   40, 5951,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0]])>},
 <tf.Tensor: shape=(1, 20), dtype=int64, numpy=
 array([[   5, 1090,   40, 5951,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0]])>)

In [79]:
batch_size = 64

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)

    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=16)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

for x, y in train_ds.take(1):
    print(x['eng'].shape)
    print(x['spa'].shape)
    print(y.shape)


(64, 20)
(64, 20)
(64, 20)


2023-09-26 14:48:47.422625: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [81]:
import keras

embed_dim = 256
latent_dim = 1024

source = layers.Input(shape=(None,), dtype=tf.int64, name='eng')
x = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, mask_zero=True)(source)
encoded_source = layers.Bidirectional(layers.GRU(units=latent_dim), merge_mode='sum')(x)

past_target = layers.Input(shape=(None,), dtype=tf.int64, name='spa')
x = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(units=latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoded_source)
x = layers.Dropout(0.5)(x)
target_next_step = layers.Dense(vocab_size, activation=activations.softmax)(x)
seq2seq_rnn = keras.Model(inputs=[source, past_target], outputs=target_next_step)


In [84]:
seq2seq_rnn.compile(optimizer=optimizers.RMSprop(), loss=losses.SparseCategoricalCrossentropy(), metrics=[metrics.SparseCategoricalAccuracy()])
callback_list = [
    callbacks.ModelCheckpoint('seq2seq_rnn', save_best_only=True)
]
seq2seq_rnn.fit(train_ds, validation_data=val_ds, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7fcb1426ded0>

In [122]:
import numpy as np

spa_vocab = spa_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

print(spa_vectorization('[start]'))

def decode_seq(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = '[start]'
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = spa_vectorization([decoded_sentence])
        next_token_predict = seq2seq_rnn([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(next_token_predict[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += ' ' + sampled_token
        if sampled_token == '[end]':
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(1):
    input_sentence = random.choice(test_eng_texts)
    print('-')
    print(input_sentence)
    print(decode_seq(input_sentence))

tf.Tensor([1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(21,), dtype=int64)
-
Were I in your position, I would oppose that plan.
[start] en su lugar de que [UNK] su plan[end]            


In [105]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation=activations.relu),
            layers.Dense(embed_dim)
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'dense_dim': self.dense_dim,
            'num_heads': self.num_heads
        })
        return config
    
    def get_casual_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        i = tf.range(seq_len)[:, tf.newaxis]
        j = tf.range(seq_len)
        mask = tf.cast(i >= j, dtype=tf.int32)
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat([
            tf.expand_dims(batch_size, -1),
            tf.constant([1, 1], dtype=tf.int32)
        ], axis=0)
        return tf.tile(mask, mult)
    
    def call(self, inputs, encoder_outputs, mask=None):
        casual_mask = self.get_casual_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            padding_mask = tf.minimum(padding_mask, casual_mask)
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=casual_mask
        )
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,key=encoder_outputs,
            attention_mask=padding_mask
        )
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [104]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super().get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

In [106]:
embed_dim = 256
dense_dim = 2048
num_heads = 8

encoder_inputs = layers.Input(shape=(None,), dtype=tf.int64, name='eng')
x = PositionalEmbedding(seq_len, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype=tf.int64, name='spa')
x = PositionalEmbedding(seq_len, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation=activations.softmax)(x)
transformer = keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

In [107]:
transformer.compile(optimizer=optimizers.RMSprop(), loss=losses.SparseCategoricalCrossentropy(), metrics=[metrics.SparseCategoricalAccuracy()])
callback_list = [
    callbacks.ModelCheckpoint('full_transformer_decoder', save_best_only=True)
]
transformer.fit(train_ds, epochs=30, validation_data=val_ds, callbacks=callback_list)

Epoch 1/30




INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 2/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 3/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 4/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 5/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 6/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 7/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 8/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 9/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 10/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 11/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 12/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 13/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 14/30


INFO:tensorflow:Assets written to: full_transformer_decoder/assets


Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7fca8f147610>

In [129]:
spa_vocab = spa_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

print(spa_vectorization(['[start]']))

def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = '[start]'
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = spa_vectorization([decoded_sentence])
        tokenized_target_sentence = tokenized_target_sentence[:, :-1] # length of sequence must be 20 (with padding) to match input
        predictions = transformer(inputs=[tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += ' ' + sampled_token
        if sampled_token == '[end]':
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(1):
    input_sentence = random.choice(test_eng_texts)
    print('-')
    print(input_sentence)
    print(decode_sequence(input_sentence))
    print(decode_seq(input_sentence))

spa_vocab.index('[UNK]')

tf.Tensor([[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 21), dtype=int64)
-
There's no water coming out of the shower.
[start] no hay agua de la ducha[end]              
[start] no hay agua de la ducha[end]              


1