In [None]:
# Reinstall TensorFlow and TensorFlow Text at their latest released versions
# (the regular `tensorflow` package, not `tf-nightly`) to use the improved
# masking support for `tf.keras.layers.MultiHeadAttention`.
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install -q tensorflow_datasets
!pip install -q -U tensorflow-text tensorflow

In [None]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf
import tensorflow_text

## Dataset handling

In [None]:
# Fetch the Portuguese->English TED talks dataset. `with_info=True` makes the
# call return the dataset metadata as a second value, and `as_supervised=True`
# yields each example as a (portuguese, english) pair.
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)

In [None]:
# Pick the two splits used in the rest of the notebook.
train_examples = examples['train']
val_examples = examples['validation']

In [None]:
# Preview one batch of three sentence pairs: Portuguese first, then English.
# `pt_examples` and `en_examples` stay bound after the loop and are reused by
# the tokenizer cells further down.
for pt_examples, en_examples in train_examples.batch(3).take(1):
  print('> Examples portuguese')
  for pt_bytes in pt_examples.numpy():
    # Each element is a raw byte string; decode it for display.
    print(str(pt_bytes, 'utf-8'))
  print()
  for en_bytes in en_examples.numpy():
    print(str(en_bytes, 'utf-8'))

## Tokenization

In [None]:
model_name = 'ted_hrlr_translate_pt_en_converter'
archive_name = f'{model_name}.zip'
archive_url = f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip'

# Download the zipped tokenizer model into the current directory and unpack it.
# NOTE(review): where the archive is extracted may depend on the installed
# Keras version - confirm that the directory loaded in the next cell exists.
tf.keras.utils.get_file(
    archive_name,
    archive_url,
    cache_dir='.',
    cache_subdir='',
    extract=True,
)

In [None]:
# Load the downloaded SavedModel; the notebook uses its `.pt` and `.en`
# attributes as the Portuguese and English tokenizers.
tokenizers = tf.saved_model.load(model_name)

In [None]:
# List the public (non-underscore) attributes exposed by the English tokenizer.
[attr_name for attr_name in dir(tokenizers.en) if attr_name[:1] != '_']

In [None]:
# Display the raw English strings still bound from the preview loop above.
en_examples

In [None]:
# Tokenize the English example batch (one variable-length row per sentence).
encoded = tokenizers.en.tokenize(en_examples)

In [None]:
# Display the tokenized English batch.
encoded

In [None]:
# Display the raw Portuguese strings still bound from the preview loop above.
pt_examples

In [None]:
# Tokenize the Portuguese example batch with the Portuguese tokenizer.
encoded_pt = tokenizers.pt.tokenize(pt_examples)

In [None]:
# Display the tokenized Portuguese batch.
encoded_pt

In [None]:
# Round-trip check: convert the Portuguese tokens back with the same tokenizer.
decoded_pt = tokenizers.pt.detokenize(encoded_pt)

In [None]:
# Display the detokenized result for comparison with `pt_examples`.
decoded_pt

In [None]:
# Tokens-per-sentence for the whole training split, gathered one batch of
# 1024 sentence pairs at a time. Each list entry holds the row lengths of one
# tokenized batch.
pt_lengths, en_lengths = [], []

for pt_examples, en_examples in train_examples.batch(1024):
  pt_tokens = tokenizers.pt.tokenize(pt_examples)
  en_tokens = tokenizers.en.tokenize(en_examples)

  pt_lengths.append(pt_tokens.row_lengths())
  en_lengths.append(en_tokens.row_lengths())

In [None]:
# Distribution of Portuguese sentence lengths, measured in tokens.
all_lengths = np.concatenate(pt_lengths)
max_length = all_lengths.max()

# 100 equal-width bins covering 0..500 tokens.
plt.hist(all_lengths, np.linspace(0, 500, 101))
# Pin the y-limits to their current values so the marker line drawn next does
# not rescale the axis.
plt.ylim(plt.ylim())
# Vertical line at the longest example.
plt.plot([max_length] * 2, plt.ylim())
plt.title(f'Maximum tokens per example: {max_length}')

In [None]:
# Distribution of English sentence lengths, measured in tokens.
# Note: this rebinds `all_lengths` / `max_length` from the Portuguese cell.
all_lengths = np.concatenate(en_lengths)
max_length = all_lengths.max()

# 100 equal-width bins covering 0..500 tokens.
plt.hist(all_lengths, np.linspace(0, 500, 101))
# Pin the y-limits to their current values so the marker line drawn next does
# not rescale the axis.
plt.ylim(plt.ylim())
# Vertical line at the longest example.
plt.plot([max_length] * 2, plt.ylim())
plt.title(f'Maximum tokens per example: {max_length}')

## Set up a data pipeline with tf.data

In [None]:
# Upper bound on the number of tokens kept per sentence.
MAX_TOKENS = 128
def prepare_batch(pt, en):
  """Turn a batch of raw sentence pairs into model inputs and labels.

  Args:
    pt: Batch of Portuguese strings.
    en: Batch of English strings.

  Returns:
    `((pt_tokens, en_inputs), en_labels)` where all three are 0-padded dense
    tensors. `en_inputs` and `en_labels` are the same English token rows
    shifted by one position relative to each other.
  """
  # Tokenize and truncate while the data is still ragged.
  pt_ragged = tokenizers.pt.tokenize(pt)[:, :MAX_TOKENS]
  # English keeps one extra token so that, after the one-step shift below,
  # inputs and labels each hold at most MAX_TOKENS tokens.
  en_ragged = tokenizers.en.tokenize(en)[:, :(MAX_TOKENS+1)]

  pt_dense = pt_ragged.to_tensor() # `RaggedTensor` -> 0-padded dense Tensor
  en_inputs = en_ragged[:, :-1].to_tensor() # Everything but the last token ([END])
  en_labels = en_ragged[:, 1:].to_tensor() # Everything but the first token ([START])

  return (pt_dense, en_inputs), en_labels

In [None]:
# Shuffle-buffer size passed to `Dataset.shuffle` in `make_batches`.
BUFFER_SIZE = 20000
# Number of sentence pairs per batch passed to `Dataset.batch` in `make_batches`.
BATCH_SIZE = 64

In [None]:
def make_batches(ds):
  """Build the input pipeline: shuffle, batch, tokenize, prefetch.

  Args:
    ds: Dataset of raw (portuguese, english) string pairs.

  Returns:
    A dataset of `((pt, en_inputs), en_labels)` batches as produced by
    `prepare_batch`.
  """
  shuffled = ds.shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE)
  # Tokenize after batching so `prepare_batch` sees whole batches of strings;
  # the second positional argument is the parallelism setting for `map`.
  tokenized = batched.map(prepare_batch, tf.data.AUTOTUNE)
  return tokenized.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Build the training and validation pipelines. Both go through the same
# `make_batches` steps, so the validation split is shuffled as well.
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)

In [None]:
# Pull a single training batch and print the shapes of the two model inputs.
# `pt`, `en` and `en_labels` stay bound after the loop; later cells reuse
# `pt` and `en` as sample inputs.
for (pt, en), en_labels in train_batches.take(1):
  print(pt.shape)
  print(en.shape)

## Positional Encoding

In [None]:
def positional_encoding(length, depth):
  """Build the sinusoidal position-encoding table.

  Args:
    length: Number of positions (rows) to generate.
    depth: Width of each encoding vector (columns).

  Returns:
    A float32 tensor of shape `(length, depth)`. The columns are laid out as
    a block of sines followed by a block of cosines (not interleaved).
  """
  width = int(depth)  # Requested number of output columns.
  depth = depth / 2   # Number of frequencies; fractional when `width` is odd.

  positions = np.arange(length)[:, np.newaxis]     # (length, 1)
  # `np.arange` rounds a fractional stop up, so this has ceil(width / 2) entries.
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, ceil(width / 2))

  angle_rates = 1 / (10000**depths)
  angle_rads = positions * angle_rates             # (length, ceil(width / 2))

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1
  )
  # For an odd width the concatenation is one column too wide; drop the last
  # cosine column so the result always matches the requested width. This is a
  # no-op for even widths.
  pos_encoding = pos_encoding[:, :width]

  return tf.cast(pos_encoding, dtype=tf.float32)

In [None]:
# Sample table: 2048 positions, 512 columns each.
pos_encoding = positional_encoding(length=2048, depth=512)

# Sanity-check the table shape.
print(pos_encoding.shape)

# Heat map of the table, transposed so position runs along the x-axis and
# the encoding column along the y-axis.
plt.pcolormesh(pos_encoding.numpy().T, cmap='RdBu')
plt.xlabel('Position')
plt.ylabel('Depth')
plt.colorbar()
plt.show()

In [None]:
# Scale every row to unit L2 norm so the dot products below lie in [-1, 1].
# Note: this rebinds `pos_encoding` to the normalized table.
pos_encoding/=tf.norm(pos_encoding, axis=1, keepdims=True)
# Reference vector: the encoding of position 1000.
p = pos_encoding[1000]
# Dot product of every position's encoding with the reference vector.
dots = tf.einsum('pd,d -> p', pos_encoding, p)
# Top panel: similarity over all positions.
plt.subplot(2,1,1)
plt.plot(dots)
plt.ylim([0,1])
# Two vertical black lines at 950 and 1050 mark the zoom window; the NaN entry
# breaks the polyline so the two lines are not joined.
plt.plot([950, 950, float('nan'), 1050, 1050],
         [0,1,float('nan'),0,1], color='k', label='Zoom')
plt.legend()
# Bottom panel: the same curve restricted to positions 950..1050.
plt.subplot(2,1,2)
plt.plot(dots)
plt.xlim([950, 1050])
plt.ylim([0,1])

## Positional Embedding

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
  """Token embedding combined with a fixed sinusoidal position signal.

  The position table is precomputed for 2048 positions, so inputs are
  assumed to be no longer than 2048 tokens.
  """

  def __init__(self, vocab_size, d_model):
    """Create the embedding table and the position-encoding table.

    Args:
      vocab_size: Number of distinct token ids.
      d_model: Width of the embedding vectors.
    """
    super().__init__()
    self.d_model = d_model
    # `mask_zero=True`: token id 0 is treated as padding by the embedding layer.
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    """Report the wrapped embedding layer's mask as this layer's mask."""
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    """Embed token ids, scale by sqrt(d_model), and add position encodings.

    Args:
      x: Token ids; axis 1 is the sequence axis.

    Returns:
      The scaled embeddings plus the first `seq_len` rows of the position
      table, broadcast over the batch axis.
    """
    seq_len = tf.shape(x)[1]
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    token_vectors = self.embedding(x) * scale
    position_vectors = self.pos_encoding[tf.newaxis, :seq_len, :]
    return token_vectors + position_vectors

In [None]:
# One embedding layer per language, each sized to its tokenizer's vocabulary.
embed_pt = PositionalEmbedding(tokenizers.pt.get_vocab_size(), d_model=512)
embed_en = PositionalEmbedding(tokenizers.en.get_vocab_size(), d_model=512)

In [None]:
# Embed the sample batch captured by the shape-check loop earlier
# (`pt` and `en` are the token tensors left bound by that loop).
pt_emb = embed_pt(pt)
en_emb = embed_en(en)

In [None]:
# Inspect the mask attached to the embedding output.
# NOTE(review): `_keras_mask` is a private attribute; presumably it holds the
# padding mask produced via `compute_mask` - confirm for the installed Keras.
en_emb._keras_mask

## Define the feed forward network

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  """Build the two-layer feed-forward block.

  Args:
    d_model: Input/output dimensionality.
    dff: Inner-layer dimensionality.

  Returns:
    A `tf.keras.Sequential` of a ReLU `Dense(dff)` followed by a linear
    `Dense(d_model)`.
  """
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expand, project])

In [None]:
# Sample feed-forward block: 512-wide input/output with a 2048-wide inner layer.
sample_ffn = point_wise_feed_forward_network(d_model=512, dff=2048)

In [None]:
# Run a random (64, 50, 512) input through the block and print the output shape.
print(sample_ffn(tf.random.uniform((64, 50, 512))).shape)