In [1]:
# Turn of the tensorlfow logging
import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

# Import the libraries
import pickle, time
import numpy as np
import tensorflow as tf
from transformer import TransformerModel
from dataset import PrepareDataset
from hyperparameters import *

2023-06-30 20:03:23.849605: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-30 20:03:24.032174: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Function for calculating the loss
def loss_function(target, prediction):
    """
    This function calculates the loss between the target and the prediction.

    PARAMETERS
    ==========================
        - target (tf.Tensor): the target tensor
        - prediction (tf.Tensor): the prediction tensor

    RETURNS
    ==========================
        - loss (tf.Tensor): the loss between the target and the prediction
    """

    # Mask the padding values
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    mask = tf.cast(mask, tf.float32)

    # Computer the sparse categorical cross entropy loss on the unmasked values
    loss = tf.keras.losses.sparse_categorical_crossentropy(target, prediction, from_logits=True) * mask

    # Calculate the mean loss over the unmasked values
    loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)

    return loss

In [3]:
# Function for calculating the accuracy
def accuracy_function(target, prediction):
    """
    Function for calculating the accuracy between the target and the prediction.

    PARAMETERS
    ==========================
        - target (tf.Tensor): the target tensor
        - prediction (tf.Tensor): the prediction tensor

    RETURNS
    ==========================
        - out (tf.Tensor): the accuracy between the target and the prediction
    """

    # Mask the padding values
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    
    # Calculate accuracy and apply the padding mask
    accuracy = tf.equal(target, tf.cast(tf.argmax(prediction, axis=2), tf.int32))
    accuracy = tf.math.logical_and(mask, accuracy)

    # Cast the accuracy from boolean to float32
    mask = tf.cast(mask, dtype=tf.float32)
    accuracy = tf.cast(accuracy, dtype=tf.float32)

    # Calculate the mean accuracy over the unmasked values
    out = tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    return out

In [4]:
# Class for scheduling the learning ear
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    This class schedules the learning rate.

    PARAMETERS
    ==========================
        - d_model (int): the model's dimensionality
        - warmup_steps (int): the number of warmup steps
        
    RETURNS
    ==========================
        - learning_rate (tf.Tensor): the learning rate
    """
    
    # Constructor function
    def __init__(self, d_model, warmup_steps=4_000, **kwargs):

        # Inherite the parent's constructor
        super().__init__(**kwargs)

        # Initializations
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    # Call function
    def __call__(self, step_num):

        # Cast step_num to float
        step_num = tf.cast(step_num, tf.float32)

        # Linearly increase the learning rate for the first warmup_steps times, then decrease it
        arg1 = step_num ** -0.5
        arg2 = step_num * (self.warmup_steps ** -1.5)

        # Learning rate
        learning_rate = (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

        return learning_rate

In [5]:
# Function for the training step (to spped up the training process)
@tf.function
def train_step(encoder_input, decoder_input, decoder_output):
    """
    This function performs a training step.

    PARAMETERS
    ==========================
        - encoder_input (tf.Tensor): the encoder input
        - decoder_input (tf.Tensor): the decoder input
        - decoder_output (tf.Tensor): the decoder output

    RETURNS
    ==========================
        - None
    """

    # Initialize the gradient tape
    with tf.GradientTape() as tape:

        # Forward pass (to make predictions)
        prediction = model(encoder_input, decoder_input, training=True)

        # Calculate loss
        loss = loss_function(decoder_output, prediction)

        # Calculate accuracy
        accuracy = accuracy_function(decoder_output, prediction)

    # Fetch the gradients of the trainable variables with respect to the training loss
    gradients = tape.gradient(loss, model.trainable_variables)

    # Apply the gradients to the optimizer so it can update the model accordingly
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Update the metrics
    train_loss(loss)
    train_accuracy(accuracy)

In [6]:
# Instantiate the optimizer with the learning rate scheduler
optimizer = tf.keras.optimizers.Adam(LearningRateScheduler(d_model), beta_1, beta_2, epsilon)


2023-06-30 20:03:54.344914: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-30 20:03:54.462873: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-30 20:03:54.463291: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-30 20:03:54.466461: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-30 20:03:54.466738: I tensorflow/compile

In [7]:
# Prepare the dataset
dataset = PrepareDataset()
train_x, train_y, val_x, val_y, train, val, encoder_sequence_length, decoder_sequence_length, encoder_vocabulary_size, decoder_vocabulary_size = dataset("./dataset/english-german-both.pkl")


In [8]:
# Convert to tf.data
train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
train_dataset = train_dataset.batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((val_x, val_y))
val_dataset = val_dataset.batch(batch_size)


In [9]:
# Instantiate the model
model = TransformerModel(encoder_vocabulary_size, decoder_vocabulary_size, encoder_sequence_length, decoder_sequence_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)

In [16]:
tf.keras.models.save_model(model, "weight.ckpt", save_format='tf', save_traces=False)



AttributeError: 'TransformerModel' object has no attribute 'outputs'

In [10]:
# Include the metrics monitoring
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.Mean(name="train_accuracy")
val_loss = tf.keras.metrics.Mean(name="val_loss")

In [11]:
# Checkpoint object and manager (for managing multiple checkpoints)
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
checkpoint_manager = tf.train.CheckpointManager(checkpoint, "./checkpoints", max_to_keep=None)



In [12]:
# Initialize lists for stroing the losses
train_loss_d = {}
val_loss_d = {}

In [13]:
# Loop over the epochs
for i_epoch in range(epochs):

    # Reset the metrics
    train_loss.reset_states()
    train_accuracy.reset_states()
    val_loss.reset_states()

    # Report 
    print(f"Epoch {i_epoch + 1}/{epochs}" + "\n===========================================")

    # Start a timer
    start_time = time.time()

    # Loop over the training batches
    for i_step, (train_batch_x, train_batch_y) in enumerate(train_dataset):

        # Define the encoder/decoder input/output
        encoder_input = train_batch_x[:, 1:]
        decoder_input = train_batch_y[:, :-1]
        decoder_output = train_batch_y[:, 1:]

        # Perform one training step
        train_step(encoder_input, decoder_input, decoder_output)

        # Report
        # if (i_step % 50 == 0):  print(f"Step {i_step + 1}/{len(train_dataset)}: loss = {train_loss.result():.4f}, accuracy = {train_accuracy.result():.4f}")

    # Loop over the validation batches
    for val_batch_x, val_batch_y in val_dataset:

        # Define the encoder/decoder input/output
        encoder_input = val_batch_x[:, 1:]
        decoder_input = val_batch_y[:, :-1]
        decoder_output = val_batch_y[:, 1:]

        # Forward pass (to make predictions)
        prediction = model(encoder_input, decoder_input, training=False)

        # Calculate the loass
        loss = loss_function(decoder_output, prediction)

        # Update the metrics
        val_loss(loss)

    # Report
    print(f"Training Loss = {train_loss.result():.4f}, Training Accuracy = {train_accuracy.result():.4f}, Validation Loss = {val_loss.result():.4f}")

    # Save the checkpoint after each epoch
    if (i_epoch+1) % 1 == 0:

        # Save the checkpoint
        save_path = checkpoint_manager.save()

        # Report
        print(f"Checkpoint saved at {save_path}.")

        # Save the weights
        model.save_weights(f"./weights/weights_{i_epoch+1}.ckpt")

        # Report
        train_loss_d[i_epoch+1] = train_loss.result()
        val_loss_d[i_epoch+1] = val_loss.result()

Epoch 1/20


2023-06-29 09:52:55.467108: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [10000,12]
	 [[{{node Placeholder/_1}}]]
2023-06-29 09:53:09.394366: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f9718049eb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-06-29 09:53:09.394431: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-06-29 09:53:09.402770: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


: 

: 

In [None]:
# Save the loss values
with open("./train_loss.pkl", "wb") as file:  pickle.dump(train_loss_d, file)
with open("./val_loss.pkl", "wb") as file:  pickle.dump(val_loss_d, file)

: 

In [None]:
# Report
print("Total time taken: {:.2f} sec".format(time.time() - start_time))

: 

: 

: 

: 