In [1]:
%tensorflow_version 2.x
import tensorflow as tf
import tensorflow_datasets as tfds

TensorFlow 2.x selected.


In [0]:
def download_dataset(tfds_address, disable_progress_bar=False):
    """Load a TFDS dataset and hand back its train and validation splits.

    Args:
        tfds_address: Dataset identifier passed unchanged to ``tfds.load``.
        disable_progress_bar: When true, ``tfds.disable_progress_bar()`` is
            called before loading.

    Returns:
        A ``(train, validation)`` tuple taken from the ``'train'`` and
        ``'validation'`` entries of the loaded splits.
    """
    if disable_progress_bar:
        tfds.disable_progress_bar()
    # with_info=True makes the loader return a (splits, info) pair; only the
    # splits are needed here, so the info object is discarded.
    splits, _info = tfds.load(tfds_address, with_info=True, as_supervised=True)
    return splits['train'], splits['validation']

In [0]:
def get_tokenizers(dataset, approx_vocab_size=2 ** 13):
    """Build one subword tokenizer for each side of a paired text dataset.

    The dataset is iterated twice: once collecting the first element of every
    pair, once collecting the second. Each element must expose ``.numpy()``.

    Args:
        dataset: Iterable of ``(source, target)`` pairs.
        approx_vocab_size: Forwarded as ``target_vocab_size`` to
            ``SubwordTextEncoder.build_from_corpus`` for both tokenizers.

    Returns:
        ``(source_tokenizer, target_tokenizer)``.
    """
    # NOTE(review): newer TFDS releases appear to expose this encoder under
    # `tfds.deprecated.text` instead - confirm against the installed version.
    build = tfds.features.text.SubwordTextEncoder.build_from_corpus
    source_tokenizer = build(
        (source.numpy() for source, _ in dataset),
        target_vocab_size=approx_vocab_size,
    )
    target_tokenizer = build(
        (target.numpy() for _, target in dataset),
        target_vocab_size=approx_vocab_size,
    )
    return source_tokenizer, target_tokenizer

In [0]:
class DataLoader:
    """Turn a dataset of (source, target) text pairs into padded batches of token ids.

    Each sentence is encoded with its own language's tokenizer and wrapped
    with two extra ids: ``vocab_size`` in front and ``vocab_size + 1`` behind.
    NOTE(review): these presumably act as start/end-of-sequence markers, which
    would mean downstream vocabularies need ``vocab_size + 2`` entries - confirm.
    """

    def __init__(self, source_tokenizer, target_tokenize, max_limit=40):
        """Store the tokenizers and the length cut-off.

        Args:
            source_tokenizer: Encoder for the first element of each pair; must
                provide ``vocab_size`` and ``encode``.
            target_tokenize: Encoder for the second element of each pair, with
                the same interface. The parameter keeps its original spelling
                so existing keyword callers continue to work.
            max_limit: Pairs in which either encoded sequence (including the
                two wrapper ids) is longer than this are dropped by
                ``filter_max_length``.
        """
        self.source_tokenizer = source_tokenizer
        # Use the argument that was actually passed in. Reading a module-level
        # `target_tokenizer` here would ignore the caller's value and fail with
        # NameError whenever no such global exists.
        self.target_tokenizer = target_tokenize
        self.max_limit = max_limit

    @staticmethod
    def _encode_wrapped(tokenizer, text):
        """Encode ``text.numpy()`` and surround it with the two wrapper ids."""
        first_extra_id = tokenizer.vocab_size
        return [first_extra_id] + tokenizer.encode(text.numpy()) + [first_extra_id + 1]

    def preprocess(self, language_1, language_2):
        """Encode one (source, target) pair into two lists of token ids.

        Both inputs must expose ``.numpy()``; ``map_function`` invokes this
        through ``tf.py_function``.

        Returns:
            ``(source_ids, target_ids)``, each of the form
            ``[vocab_size] + encoded + [vocab_size + 1]`` for its tokenizer.
        """
        language_1 = self._encode_wrapped(self.source_tokenizer, language_1)
        language_2 = self._encode_wrapped(self.target_tokenizer, language_2)
        return language_1, language_2

    def map_function(self, language_1, language_2):
        """Wrap ``preprocess`` in ``tf.py_function`` so it can be used in ``Dataset.map``.

        Returns:
            Two ``tf.int64`` tensors declared as rank-1 with unknown length.
        """
        language_1, language_2 = tf.py_function(
            self.preprocess,
            [language_1, language_2],
            [tf.int64, tf.int64]
        )
        # Declare the static shape explicitly: one dimension, length unknown.
        language_1.set_shape([None])
        language_2.set_shape([None])
        return language_1, language_2

    def filter_max_length(self, x, y):
        """Return a boolean tensor: true when both sequences have at most ``max_limit`` ids."""
        return tf.logical_and(
            tf.size(x) <= self.max_limit,
            tf.size(y) <= self.max_limit
        )

    def get_dataset(self, dataset, buffer_size, batch_size):
        """Build the tokenize -> filter -> cache -> shuffle -> pad/batch -> prefetch pipeline.

        Args:
            dataset: ``tf.data`` dataset of (source, target) text pairs.
            buffer_size: Passed to ``Dataset.shuffle``.
            batch_size: Passed to ``Dataset.padded_batch``.

        Returns:
            A dataset of ``(source_batch, target_batch)`` pairs.
        """
        return (
            dataset
            .map(self.map_function)
            .filter(self.filter_max_length)
            # cache() sits after map/filter and before shuffle, so the cached
            # elements are the already tokenized and filtered pairs.
            .cache()
            .shuffle(buffer_size)
            # [None] lets each component be padded along its single dimension.
            .padded_batch(batch_size, padded_shapes=([None], [None]))
            .prefetch(tf.data.experimental.AUTOTUNE)
        )

In [6]:
# Fetch the train and validation splits of the pt_to_en TED talks dataset,
# with the TFDS progress bars switched off.
train_data, val_data = download_dataset('ted_hrlr_translate/pt_to_en', disable_progress_bar=True)

[1mDownloading and preparing dataset ted_hrlr_translate (124.94 MiB) to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0...[0m
Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteJPOJHN/ted_hrlr_translate-train.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteJPOJHN/ted_hrlr_translate-validation.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteJPOJHN/ted_hrlr_translate-test.tfrecord
[1mDataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0. Subsequent calls will reuse this data.[0m


In [0]:
# Build both subword tokenizers from the training split only.
source_tokenizer, target_tokenizer = get_tokenizers(train_data)

In [8]:
# Sanity check: encode an English sentence with the target-side tokenizer
# and print the resulting subword ids.
sample_string = 'Downy feathers kiss your face and flutter everywhere'
tokenized_string = target_tokenizer.encode(sample_string)
print('Sample string in English: {}'.format(sample_string))
print('Sample string tokenized: {}'.format(tokenized_string))

Sample string in English: Downy feathers kiss your face and flutter everywhere
Sample string tokenized: [7899, 1383, 113, 7206, 388, 9, 1519, 1117, 76, 646, 4, 1220, 3165, 7863, 1392]


In [9]:
# Same sanity check for the source side: encode a Portuguese sentence with
# the source tokenizer. This rebinds sample_string and tokenized_string.
sample_string = 'Penas felpudas beijam seu rosto e flutuam por toda parte'
tokenized_string = source_tokenizer.encode(sample_string)
print('Sample string in Portuguese: {}'.format(sample_string))
print('Sample string tokenized: {}'.format(tokenized_string))

Sample string in Portuguese: Penas felpudas beijam seu rosto e flutuam por toda parte
Sample string tokenized: [8038, 2641, 17, 5242, 1174, 43, 3999, 1625, 7990, 109, 5376, 6, 2839, 50, 23, 166, 962]


In [14]:
# Shuffle-buffer size and batch size shared by both pipelines below, named
# once so the train and validation settings cannot drift apart.
BUFFER_SIZE = 20000
BATCH_SIZE = 64

dataloader = DataLoader(source_tokenizer, target_tokenizer)
train_dataset = dataloader.get_dataset(train_data, BUFFER_SIZE, BATCH_SIZE)
val_dataset = dataloader.get_dataset(val_data, BUFFER_SIZE, BATCH_SIZE)
# Printing a dataset shows its element shapes and dtypes without iterating it.
print(train_dataset)
print(val_dataset)

<DatasetV1Adapter shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>
<DatasetV1Adapter shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>


In [19]:
# Pull a single batch from the training pipeline; the trailing expression
# displays the shapes of the source and target batches as the cell output.
source_language_batch, target_language_batch = next(iter(train_dataset))
source_language_batch.shape, target_language_batch.shape

(TensorShape([64, 40]), TensorShape([64, 40]))