<br>

<h1 style="text-align:center">Transformer</h1>

<br>

### Initial Deployment

---

In [10]:
# Import the libraries
import warnings, json, collections, tqdm, os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

In [11]:
# Turn off all warnings
warnings.filterwarnings('ignore')

In [12]:
# Set matplotlib style
plt.style.use('ggplot')

<br>

### Hyperparameters

---

In [13]:
# Hyperparameters
BATCH_SIZE = 32
DATA_SHUFFLE = 1000
DATA_REPEAT = 1
MAX_TOKENS = 100000
MAX_SEQ_LENGTH = 50

In [14]:
# Make sure GPU is available
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

<br>

### Dataset Pipeline

---

In [15]:
# TODO: Add <START>, <END> and <PAD> tags in the begining/end of sequences.
# TODO: Finalize the dataset
# TODO: Add an extra step for pre-training
# TODO: Optimize the data loading steps
# TODO: Bucketization step
# TODO: Hyperparameter tunning

In [45]:
# Load the dataset
data = tf.data.TextLineDataset("./../dataset/full_dataset.tsv", num_parallel_reads=tf.data.AUTOTUNE)

# Split the text by tabs
data = data.map(lambda x: tf.strings.split(x, '\t'))

# Ignore the first row (column name)
data = data.skip(1)

# Set UTF-8 encoding
data = data.map(lambda x: (tf.strings.unicode_encode(x[0], 'UTF-8'), tf.strings.unicode_encode(x[1], 'UTF-8')))

In [46]:
for i in data.take(19):
    print(i)
    if i.numpy()[0]=="\'":
        print("\n\n")

tf.Tensor([b'When did Beyonce start becoming popular?' b'in the late 1990s'], shape=(2,), dtype=string)
tf.Tensor(
[b'What areas did Beyonce compete in when she was growing up?'
 b'singing and dancing'], shape=(2,), dtype=string)
tf.Tensor(
[b"When did Beyonce leave Destiny's Child and become a solo singer?"
 b'2003'], shape=(2,), dtype=string)
tf.Tensor([b'In what city and state did Beyonce  grow up? ' b'Houston, Texas'], shape=(2,), dtype=string)
tf.Tensor([b'In which decade did Beyonce become famous?' b'late 1990s'], shape=(2,), dtype=string)
tf.Tensor([b'In what R&B group was she the lead singer?' b"Destiny's Child"], shape=(2,), dtype=string)
tf.Tensor([b'What album made her a worldwide known artist?' b'Dangerously in Love'], shape=(2,), dtype=string)
tf.Tensor([b"Who managed the Destiny's Child group?" b'Mathew Knowles'], shape=(2,), dtype=string)
tf.Tensor([b'When did Beyonc\xc3\xa9 rise to fame?' b'late 1990s'], shape=(2,), dtype=string)
tf.Tensor([b"What role did Beyonc\xc3\xa

In [None]:
# # Make a generator to read the data
# def data_generator():

#     # Loop over paths
#     for i_path in ["./../dataset/full_dataset.csv"]:
            
#         # Open the file
#         with open(i_path, mode='r', encoding='utf8') as file:
            
#             # Loop over lines
#             for i_line in file:

#                 # Ignore the first line
#                 if i_line.startswith("index"):  continue

#                 # Split the line
#                 i_line = i_line.split("\t")

#                 # Set input/output
#                 input_data = i_line[1]
#                 output_data = i_line[2]

#                 # Add starting/ending tags
#                 input_data = "<START> " + input_data + " <END>"
#                 output_data = "<START> " + output_data + " <END>"

#                 # Yield the line
#                 yield (input_data, output_data)


# # Data generator
# data_g = data_generator()

# # Convert to tf.data
# data = tf.data.Dataset.from_generator(
#     data_generator, 
#     output_signature=(
#         tf.TensorSpec(shape=(), dtype=tf.string),
#         tf.TensorSpec(shape=(), dtype=tf.string)
#         ),
#         )

# # Bucketize data
# # data = data.bucket_by_sequence_length(element_length_func=lambda elem: tf.shape(elem)[0],
# #                                       bucket_boundaries=[3, 5],
# #                                       bucket_BATCH_SIZEs=[2, 2, 2])

# # Print a sample
# for i_record in data.take(1):
#     print(i_record)


In [None]:
# TODO: Function for custom standarization
# TODO: Maybe use NLTK tokenizer
# def custom_standardization(input_string):
#     lowercased = tf.strings.lower(input_string)
#     stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
#     return tf.strings.regex_replace(stripped_html, f"([{string.punctuation}])", r" \1")               

In [47]:
# Text vectorizer 
vectorize_layer = tf.keras.layers.TextVectorization(standardize='lower_and_strip_punctuation', 
                                                    split='whitespace', 
                                                    ngrams=None, 
                                                    output_mode='int', 
                                                    output_sequence_length=MAX_SEQ_LENGTH, 
                                                    pad_to_max_tokens=True, 
                                                    max_tokens=MAX_TOKENS, 
                                                    idf_weights=None, 
                                                    sparse=False, 
                                                    ragged=False)

# Adapt the vectorizer to the data
vectorize_layer.adapt(data)
print("Vocabulary Size: ", vectorize_layer.vocabulary_size())

Vocabulary Size:  100000


In [49]:
# Vectorize the data
# ds = data.map(lambda x, y: (vectorize_layer(x), vectorize_layer(y)), num_parallel_calls=tf.data.AUTOTUNE)
ds = data.map(lambda x: vectorize_layer(x), num_parallel_calls=tf.data.AUTOTUNE)

# Print a sample
for i in ds.take(1):
    print(i)

tf.Tensor(
[[  34   22 3147  612 1503  243    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]
 [   4    2  439 1765    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]], shape=(2, 50), dtype=int64)


In [52]:
# TODO: Finalize the placing of the tags
# Function for preparing dataset
def prepare_dataset(record):

    #
    in_record, out_record = record[0], record[1]

    # Encoder input - actual input
    encoder_input = in_record[1:]

    # Decoder input - actual output (starting from 0 till t-1)
    decoder_input = out_record[:-1] 

    # Decoder output - actual output (starting from 1 till t)
    decoder_output = out_record[1:]

    # Reshape output (to match sparse categorical crossentropy)
    encoder_input = tf.expand_dims(encoder_input, axis=-1)
    decoder_input = tf.expand_dims(decoder_input, axis=-1)
    decoder_output = tf.expand_dims(decoder_output, axis=-1)

    # Return the data
    return ({"encoder_inputs": encoder_input, "decoder_inputs": decoder_input}, decoder_output,)

In [53]:
# Prepare the dataset
ds = ds.map(lambda record: prepare_dataset(record), num_parallel_calls=tf.data.AUTOTUNE)

# Print a sample
for i in ds.take(1):
    print(i)

({'encoder_inputs': <tf.Tensor: shape=(49, 1), dtype=int64, numpy=
array([[  22],
       [3147],
       [ 612],
       [1503],
       [ 243],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0]], dtype=int64)>, 'decoder_inputs': <tf.Tensor: shape=(49, 1), dtype=int64, numpy=
array([[   4],
       [   2],
       [ 439],
       [1765],
       [   0],
       [   0],
       [   0],
       [   0

In [54]:
# Data transformation
ds = ds.shuffle(DATA_SHUFFLE)
ds = ds.repeat(DATA_REPEAT)
ds = ds.batch(BATCH_SIZE)
ds = ds.prefetch(tf.data.AUTOTUNE)

In [55]:
# Print a sample
for inputs, targets in ds.take(1):
    print("Encoder input: ", inputs["encoder_inputs"].shape)
    print("Decoder input: ", inputs["decoder_inputs"].shape)
    print("Decoder Output: ", targets.shape)

Encoder input:  (32, 49, 1)
Decoder input:  (32, 49, 1)
Decoder Output:  (32, 49, 1)


In [56]:
# # Report
# for inputs, targets in ds.take(1):
#     print(f'inputs["encoder_inputs"]: \n{inputs["encoder_inputs"]}')
#     print(f'\ninputs["decoder_inputs"]: \n{inputs["decoder_inputs"]}')
#     print(f"\ntargets.shape: \n{targets}")

<br>

### Token and Positional Encoding

---

In [57]:

#######################
# POSITIONAL ENCODING #
#######################

# Class for positional encoding
class PositionalEmbedding(tf.keras.layers.Layer):

    # Constructor function
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):

        # Inherite parent class constructor
        super(PositionalEmbedding, self).__init__(**kwargs)

        # Token and position embedding
        self.token_embeddings    = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)        
        self.position_embeddings = tf.keras.layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)
        
        # Initialization
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    # Call function
    def call(self, inputs):

        # Length of inputs
        length = tf.shape(inputs)[-1]
        
        # Range of 0-length
        positions = tf.range(start=0, limit=length, delta=1)
        
        # Positional embedding
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        out = embedded_tokens + embedded_positions

        return out 

    # Function for creating mask
    def compute_mask(self, inputs, mask=None):

        # Returns the truth value of (x != y) element-wise
        return tf.math.not_equal(inputs, 0)

    # Function update parent's config
    def get_config(self):

        # Get the config of the parent class
        config = super().get_config()

        # Update the config
        config.update({"sequence_length": self.sequence_length, "vocab_size": self.vocab_size, "embed_dim": self.embed_dim,})
        
        return config

<br>

### Model Architecture

---

In [58]:

#######################
# TRANSFORMER ENCODER #
#######################

# Class for transformer encoder
class TransformerEncoder(tf.keras.layers.Layer):

    # Constructor function
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):

        # Inherite parent class constructor
        super(TransformerEncoder, self).__init__(**kwargs)

        # Initialization
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        
        # Multi-head attention layer
        self.attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        
        # Dense projection layers
        self.dense_proj = tf.keras.Sequential([tf.keras.layers.Dense(dense_dim, activation="relu"),
                                               tf.keras.layers.Dense(embed_dim)])
        
        # Layer normalizations
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        
        # Flag for masking
        self.supports_masking = True

    # Call function
    def call(self, inputs, mask=None):

        # If mask is not None
        if mask is not None:

            # Mask the inputs
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        
        # Feed into multi-head attention
        attention_output = self.attention(query=inputs, value=inputs, key=inputs, attention_mask=padding_mask)
        
        # Sum up inputs and attention + Normalize the layer 
        proj_input = self.layernorm_1(inputs + attention_output)

        # Feed into dense projection layer
        proj_output = self.dense_proj(proj_input)

        # Sum up projected input and output + Normalize the layer
        out = self.layernorm_2(proj_input + proj_output)

        return out

    # Function for updating parent's config
    def get_config(self):

        # Get the config of the parent class
        config = super().get_config()

        # Update the config
        config.update({"embed_dim": self.embed_dim, "dense_dim": self.dense_dim, "num_heads": self.num_heads,})
        
        return config

In [59]:

#######################
# TRANSFORMER DECODER #
#######################

# Class for transformer decoder
class TransformerDecoder(tf.keras.layers.Layer):

    # Constructor function
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):

        # Inherite parent class constructor
        super(TransformerDecoder, self).__init__(**kwargs)

        # Initialization
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads

        # Multi-head attention layers
        self.attention_1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        
        # Dense projection layers
        self.dense_proj = tf.keras.Sequential([tf.keras.layers.Dense(latent_dim, activation="relu"),
                                               tf.keras.layers.Dense(embed_dim),])

        # Layer normalizations
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.layernorm_3 = tf.keras.layers.LayerNormalization()
        
        # Flag for masking
        self.supports_masking = True

    # Call function
    def call(self, inputs, encoder_outputs, mask=None):

        # Causal attention mask
        causal_mask = self.get_causal_attention_mask(inputs)
        
        # If mask is not None
        if mask is not None:

            # Mask the inputs
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        # Feed into multi-head attention
        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
        
        # Sum up inputs and attention + Normalize the layer
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Feed into multi-head attention
        attention_output_2 = self.attention_2(query=out_1, value=encoder_outputs, key=encoder_outputs, attention_mask=padding_mask,)

        # Sum up output and attention + Normalize the layer
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        # Feed into dense projection layer
        proj_output = self.dense_proj(out_2)

        # Sum up output and projected output + Normalize the layer
        out = self.layernorm_3(out_2 + proj_output)

        return out

    # Function for getting causal attention mask
    def get_causal_attention_mask(self, inputs):

        # Input shape
        input_shape = tf.shape(inputs)

        # Batch size AND sequence length
        batch_size, sequence_length = input_shape[0], input_shape[1]

        # Range for i and j
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        
        # Create causal mask
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        
        # Multiplier (to replicate mask for mult times) 
        mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], axis=0,)
        
        # Replicate the mask for mult times 
        out = tf.tile(mask, mult)

        return out

    # Function for updating parent's config
    def get_config(self):

        # Get the config of the parent class
        config = super().get_config()

        # Update the config
        config.update({"embed_dim": self.embed_dim, "latent_dim": self.latent_dim, "num_heads": self.num_heads,})
        
        return config

<br>

### Training

---

In [60]:
# Hyperparameters
embed_dim = 128
latent_dim = 128
num_heads = 4
vocab_size = vectorize_layer.vocabulary_size()
sequence_length = MAX_SEQ_LENGTH

In [61]:
#######################
# TRANSFORMER ENCODER #
#######################

# Inputs
encoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

# Positional encoding
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)

# Transformer encoder
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)

# Model
encoder = tf.keras.Model(encoder_inputs, encoder_outputs)

In [62]:
#######################
# TRANSFORMER DECODER #
#######################

# Inputs (to decoder)
decoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")

# Inputs (from encoder)
encoded_seq_inputs = tf.keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")

# Positional encoding
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)

# Transformer decoder
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)

# Dropout
x = tf.keras.layers.Dropout(0.5)(x)

# Output layer
decoder_outputs = tf.keras.layers.Dense(vocab_size, activation="softmax")(x)

# Model
decoder = tf.keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# Report
print(decoder.summary())
# tf.keras.utils.plot_model(decoder)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_1 (Positi  (None, None, 128)   12806400    ['decoder_inputs[0][0]']         
 onalEmbedding)                                                                                   
                                                                                                  
 decoder_state_inputs (InputLay  [(None, None, 128)]  0          []                               
 er)                                                                                              
                                                                                            

In [63]:

#####################
# TRANSFORMER MODEL #
#####################

# Feed inputs into decoder
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

# Transformer model
transformer = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")

# Report
print(transformer.summary())
# tf.keras.utils.plot_model(transformer)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 128)   12806400    ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 128)   297344      ['positional_embedding[

In [64]:
# Compile the model
transformer.compile("adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train the model
transformer.fit(ds, epochs=5)      # Choose at least 30 for epoch

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x25d8798c5b0>

In [67]:
# Save model
transformer.save("./saved model/transformer_model")

# Load the model
#transformer = tf.keras.models.load_model("./saved model/transformer_model")



INFO:tensorflow:Assets written to: ./saved model/transformer_model\assets


INFO:tensorflow:Assets written to: ./saved model/transformer_model\assets


In [66]:
# Compile the model
transformer.compile("adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train the model
transformer.fit(ds, epochs=5)      # Choose at least 30 for epoch

Epoch 1/5
      1/Unknown - 3s 3s/step - loss: 0.3242 - accuracy: 0.1647

ResourceExhaustedError: Graph execution error:

Detected at node 'transformer/model_1/dense_4/Softmax' defined at (most recent call last):
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\traitlets\config\application.py", line 978, in launch_instance
      app.start()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\asyncio\base_events.py", line 600, in run_forever
      self._run_once()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\asyncio\base_events.py", line 1896, in _run_once
      handle._run()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 2885, in run_cell
      result = self._run_cell(
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 3139, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 3318, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 3378, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Soheil\AppData\Local\Temp\ipykernel_24552\1840114185.py", line 5, in <module>
      transformer.fit(ds, epochs=5)      # Choose at least 30 for epoch
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 889, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\functional.py", line 458, in call
      return self._run_internal_graph(
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\functional.py", line 596, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\functional.py", line 458, in call
      return self._run_internal_graph(
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\functional.py", line 596, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\layers\core\dense.py", line 235, in call
      outputs = self.activation(outputs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\activations.py", line 80, in softmax
      output = tf.nn.softmax(x, axis=axis)
Node: 'transformer/model_1/dense_4/Softmax'
Detected at node 'transformer/model_1/dense_4/Softmax' defined at (most recent call last):
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\traitlets\config\application.py", line 978, in launch_instance
      app.start()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\asyncio\base_events.py", line 600, in run_forever
      self._run_once()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\asyncio\base_events.py", line 1896, in _run_once
      handle._run()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 2885, in run_cell
      result = self._run_cell(
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 3139, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 3318, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\IPython\core\interactiveshell.py", line 3378, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Soheil\AppData\Local\Temp\ipykernel_24552\1840114185.py", line 5, in <module>
      transformer.fit(ds, epochs=5)      # Choose at least 30 for epoch
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 889, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\functional.py", line 458, in call
      return self._run_internal_graph(
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\functional.py", line 596, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\functional.py", line 458, in call
      return self._run_internal_graph(
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\functional.py", line 596, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\layers\core\dense.py", line 235, in call
      outputs = self.activation(outputs)
    File "c:\Users\Soheil\anaconda3\envs\prime\lib\site-packages\keras\activations.py", line 80, in softmax
      output = tf.nn.softmax(x, axis=axis)
Node: 'transformer/model_1/dense_4/Softmax'
2 root error(s) found.
  (0) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[32,49,100000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node transformer/model_1/dense_4/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[broadcast_weights_1/assert_broadcastable/AssertGuard/pivot_f/_15/_85]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

  (1) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[32,49,100000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node transformer/model_1/dense_4/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_2338970]

In [None]:
# Save model
transformer.save("./saved model/transformer_model.h5")

# Load the model
#transformer = tf.keras.models.load_model("./saved model/transformer_model")

<br>

### Evaluation

---

<br>

### Testing and Prediction

---

In [None]:
# Spanish vocabulary
spa_vocab = spa_vectorization.get_vocabulary()

# Int2Word dictionary
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))

# Set the maximum decoded sequence length 
max_decoded_sentence_length = 20

In [None]:
# Function for decoding sequences
def decode_sequence(input_sentence):

    # Vectorize input english sequence
    tokenized_input_sentence = eng_vectorization([input_sentence])
    
    # Initialize the decoded sentence with [start] token
    decoded_sentence = "[start]"

    # Loop for max_decoded_sentence_length times
    for i in range(max_decoded_sentence_length):

        # Vectorize decoded sentence
        tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1]
        
        # Predict the [input sequence, target sequence] using transformer
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        # Get the argmax of prediction
        sampled_token_index = np.argmax(predictions[0, i, :])

        # Get the word of the argmax
        sampled_token = spa_index_lookup[sampled_token_index]

        # Append the word to the decoded sentence
        decoded_sentence += " " + sampled_token

        # If the sampled token is [end], break the loop
        if sampled_token == "[end]":
            break
        
    return decoded_sentence

In [None]:
# List of all english sequences
test_eng_texts = [pair[0] for pair in test_pairs]

# Predict for N times
for _ in range(5):

    # Choose a random english-spanish sequence
    input_sentence = random.choice(test_pairs)

    # Predict the sequence
    translated = decode_sequence(input_sentence[0])

    # Report
    print("INPUT:               ", input_sentence[0])
    print("OUTPUT (TRUE):       ", input_sentence[1])
    print("OUTPUT (PREDICTION): ", translated, "\n")