# Tester for various functions in the TensorFlow Transformer

In [3]:

# Configure how TensorFlow uses the GPU(s) for this notebook.
import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('No GPU available')
else:
    print('GPU available')
    try:
        # Turn on memory growth for every visible GPU.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as e:
        # Report the failure and carry on rather than aborting the notebook.
        # NOTE(review): per the TF docs this is raised when the devices were
        # already initialized - confirm this cell runs first on a fresh kernel.
        print(e)

GPU available


In [1]:
# Load IPython's autoreload extension so edits to the local project modules
# imported by later cells are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution.
%autoreload 2

# Create dataset

In [10]:
from dataset import Dataset
import tensorflow_datasets as tfds

# Project helper; later cells use it to build tokenizers and batches.
ds = Dataset()

# Load the TED pt->en translation corpus. `as_supervised=True` yields
# (portuguese, english) pairs and `with_info=True` also returns the metadata.
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)

# Keep handles on the two splits used in the rest of the notebook.
train_examples = examples['train']
val_examples = examples['validation']

2024-07-28 08:45:19.311550: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:966] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


In [11]:
# Peek at one batch of three raw sentence pairs from the training split.
# The inner loop variables are deliberately NOT called `pt` / `en`: those names
# hold the tokenized batch tensors used by later cells, and module-level loop
# variables would overwrite them if this cell were re-run.
for pt_examples, en_examples in train_examples.batch(3).take(1):
    print('> Examples in Portuguese:')
    for pt_sentence in pt_examples.numpy():
        # Each element is a UTF-8 encoded byte string.
        print(pt_sentence.decode('utf-8'))
    print()

    print('> Examples in English:')
    for en_sentence in en_examples.numpy():
        print(en_sentence.decode('utf-8'))

> Examples in Portuguese:
e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
mas e se estes fatores fossem ativos ?
mas eles não tinham a curiosidade de me testar .

> Examples in English:
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n't test for curiosity .


2024-07-28 08:45:22.517996: W tensorflow/core/kernels/data/cache_dataset_ops.cc:913] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2024-07-28 08:45:22.518457: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
# Build the tokenizers through the project Dataset helper; the result exposes
# `.pt` and `.en` tokenizers that later cells query for their vocabulary size.
tokenizers = ds.build_tokenizer()

# List the public attributes of the English tokenizer. A bare expression in the
# middle of a cell is evaluated and thrown away, so print it explicitly.
print([name for name in dir(tokenizers.en) if not name.startswith('_')])

print('> This is a batch of strings:')
# Loop variable is not named `en`, so re-running this cell cannot clobber the
# `en` token batch that later cells depend on.
for en_sentence in en_examples.numpy():
    print(en_sentence.decode('utf-8'))


Downloading data from https://storage.googleapis.com/download.tensorflow.org/models/ted_hrlr_translate_pt_en_converter.zip
[1m184801/184801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
> This is a batch of strings:
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n't test for curiosity .


2024-07-28 08:43:36.254961: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2024-07-28 08:43:36.257260: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:966] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


In [12]:
# Turn the raw train / validation splits into batched datasets using the
# project's Dataset.make_batches helper (train first, then validation).
train_batches, val_batches = (
    ds.make_batches(split) for split in (train_examples, val_examples)
)

In [13]:
# Pull out the first training batch and keep its three tensors as notebook
# globals (`pt`, `en`, `en_labels`) for the layer checks below.
for (pt, en), en_labels in train_batches.take(1):
    break

# One shape per line: Portuguese inputs, English inputs, English labels.
print(pt.shape, en.shape, en_labels.shape, sep='\n')

(64, 99)
(64, 101)
(64, 101)


2024-07-28 08:45:37.380364: W tensorflow/core/kernels/data/cache_dataset_ops.cc:913] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Embeddings

In [14]:
from embeddings import PositionalEmbedding

# One embedding layer per language, each sized to its tokenizer's vocabulary.
embed_pt = PositionalEmbedding(
    vocab_size=tokenizers.pt.get_vocab_size().numpy(),
    d_model=512,
)
embed_en = PositionalEmbedding(
    vocab_size=tokenizers.en.get_vocab_size().numpy(),
    d_model=512,
)

# Embed the sample batch; these tensors feed the layer checks further down.
pt_emb = embed_pt(pt)
en_emb = embed_en(en)

2024-07-28 08:58:14.207179: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:966] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


In [15]:
# Display the mask attached to the English embedding output.
# NOTE(review): `_keras_mask` is a private Keras attribute - it may change
# between Keras versions; confirm it is still populated after upgrades.
en_emb._keras_mask

<tf.Tensor: shape=(64, 101), dtype=bool, numpy=
array([[ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       ...,
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False]])>

## Attention layer

In [16]:
# Cross attention is the layer that connects the encoder and the decoder.
# It is called with the decoder-side (English) embeddings first and the
# encoder-side (Portuguese) embeddings second, and the call returns a single
# tensor (its `.shape` is read directly below).
from attention import CrossAttention

sample_ca = CrossAttention(num_heads=2, key_dim=512)

print(pt_emb.shape)
print(en_emb.shape)
# Run the layer only after the input shapes have been printed.
sample_ca_output = sample_ca(en_emb, pt_emb)
print(sample_ca_output.shape)

(64, 99, 512)
(64, 101, 512)




(64, 101, 512)




In [17]:
# Global self-attention: every element of the sequence can attend directly to
# every other element in only a few operations, and all outputs can be
# computed in parallel.
from attention import GlobalSelfAttention

sample_gsa = GlobalSelfAttention(num_heads=2, key_dim=512)

print(pt_emb.shape)
# Apply the layer to the Portuguese embeddings and report the result's shape.
sample_gsa_output = sample_gsa(pt_emb)
print(sample_gsa_output.shape)

(64, 99, 512)
(64, 99, 512)




In [18]:
# Causal self-attention plays the same role as global self-attention, but for
# the output (target) sequence.
# Transformers are "autoregressive": they generate text one token at a time and
# feed each generated token back in as input. To make that efficient, the output
# for each sequence element may depend only on the elements before it - the
# model is "causal".

from attention import CausalSelfAttention

sample_csa = CausalSelfAttention(num_heads=2, key_dim=512)

print(en_emb.shape)
# Apply the layer to the English embeddings and report the result's shape.
sample_csa_output = sample_csa(en_emb)
print(sample_csa_output.shape)

(64, 101, 512)
(64, 101, 512)




## Feed Forward

In [19]:
from encoder import FeedForward

# Feed-forward block constructed with the two positional sizes 512 and 2048.
# NOTE(review): presumably (d_model, dff), matching the keyword arguments used
# for the Encoder below - confirm against encoder.FeedForward.
sample_ffn = FeedForward(512, 2048)

print(en_emb.shape)
# Apply the block to the English embeddings and report the result's shape.
sample_ffn_output = sample_ffn(en_emb)
print(sample_ffn_output.shape)

(64, 101, 512)
(64, 101, 512)




## Encoder

In [20]:
from encoder import Encoder

# Build a sample 4-layer encoder.
# NOTE(review): vocab_size=8500 is a fixed sample value, not the tokenizer's
# actual vocabulary size - confirm it is large enough for the Portuguese ids.
sample_encoder = Encoder(
    num_layers=4, d_model=512, num_heads=8, dff=2048, vocab_size=8500)

# Encode the Portuguese batch with training disabled.
sample_encoder_output = sample_encoder(pt, training=False)

# Input shape, then output shape `(batch_size, input_seq_len, d_model)`.
print(pt.shape)
print(sample_encoder_output.shape)



(64, 99)
(64, 99, 512)


## DecoderLayer and Decoder

In [22]:
from decoder import DecoderLayer, Decoder

# A single decoder layer, checked on its own.
sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

# English embeddings are the layer input; Portuguese embeddings are the context.
sample_decoder_layer_output = sample_decoder_layer(x=en_emb, context=pt_emb)

# Input, context and output shapes; the output is `(batch_size, seq_len, d_model)`.
print(
    en_emb.shape,
    pt_emb.shape,
    sample_decoder_layer_output.shape,
    sep='\n',
)

(64, 101, 512)
(64, 99, 512)
(64, 101, 512)




In [23]:
# Build a sample 4-layer decoder.
# NOTE(review): vocab_size=8000 is a fixed sample value, not the tokenizer's
# actual vocabulary size - confirm it is large enough for the English ids.
sample_decoder = Decoder(
    num_layers=4, d_model=512, num_heads=8, dff=2048, vocab_size=8000)

# The decoder receives the 2-D `en` batch as `x` and the Portuguese embeddings
# as `context`.
output = sample_decoder(x=en, context=pt_emb)

# Shapes of the decoder input, the context and the decoder output.
print(en.shape, pt_emb.shape, output.shape, sep='\n')



(64, 101)
(64, 99, 512)
(64, 101, 512)


## Transformer

In [24]:
from transformer import Transformer

# Hyperparameters for the full model. `d_model` is reused by the
# learning-rate schedule in the training section.
num_layers = 4
d_model = 128
num_heads = 8
dff = 512
dropout_rate = 0.1

# Vocabulary sizes come straight from the tokenizers, so the model always
# matches the data.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizers.pt.get_vocab_size().numpy(),
    target_vocab_size=tokenizers.en.get_vocab_size().numpy(),
    dropout_rate=dropout_rate,
)


In [25]:
# Forward pass on the sample batch; the model takes the (pt, en) pair as a
# single tuple argument.
output = transformer((pt, en))

# English input shape, Portuguese input shape, then the model output shape
# (the last axis of the recorded run equals the English vocabulary size).
print(en.shape, pt.shape, output.shape, sep='\n')



(64, 101)
(64, 99)
(64, 101, 7010)


In [26]:
# Read the attention scores recorded on the final decoder layer.
last_decoder_layer = transformer.decoder.dec_layers[-1]
attn_scores = last_decoder_layer.last_attn_scores

# Expected layout: (batch, heads, target_seq, input_seq)
print(attn_scores.shape)

(64, 8, 101, 99)


## Training

In [28]:
from trainer import masked_accuracy, masked_loss, CustomSchedule
import tensorflow as tf

# Project-defined learning-rate schedule, parameterised by the model width.
learning_rate = CustomSchedule(d_model)

# Adam driven by the custom schedule.
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Attach the project's masked loss and masked accuracy metric to the model.
transformer.compile(
    loss=masked_loss, optimizer=optimizer, metrics=[masked_accuracy])

In [29]:
# Train for 20 epochs, evaluating on the validation batches after each epoch.
# NOTE(review): the recorded run was interrupted at ~11 s/step - consider a
# smaller epoch count or a data subset for quick checks.
transformer.fit(
    train_batches,
    epochs=20,
    validation_data=val_batches,
)

Epoch 1/20


2024-07-28 09:16:04.993148: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:966] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m 60/810[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2:23:21[0m 11s/step - loss: 8.8191 - masked_accuracy: 0.0073

KeyboardInterrupt: 