In [1]:
import sys
import os
# Make the directory three levels above the notebook's working directory
# importable (presumably the project root that holds the `novel` package —
# TODO confirm). Guarded so that re-running this cell does not keep appending
# the same entry to sys.path.
_project_root = os.path.abspath(os.path.join(os.getcwd(), '../../..'))
if _project_root not in sys.path:
    sys.path.append(_project_root)

Reference: the training code below is based on the Machine Learning Mastery tutorial "Training the Transformer Model":

https://machinelearningmastery.com/training-the-transformer-model/

In [2]:
from keras.optimizers import Adam
# from keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from keras.metrics import Mean
from tensorflow import data, train, math, reduce_sum, cast, equal, argmax, float32, GradientTape, TensorSpec, function, int64
from keras.losses import sparse_categorical_crossentropy

from novel.transformer.components.transformer import TransformerModel
from novel.transformer.components.utils import LRScheduler, loss_fcn, accuracy_fcn
from novel.transformer.components.example_dataset import PrepareDataset

from time import time

In [4]:
# Build the dataset helper, then load 'english-german-both.pkl' through it.
# The call returns seven values: the two training tensors, a third value
# (train_orig), the encoder/decoder sequence lengths and the encoder/decoder
# vocabulary sizes.
dataset = PrepareDataset()
(
    trainX,
    trainY,
    train_orig,
    enc_seq_length,
    dec_seq_length,
    enc_vocab_size,
    dec_vocab_size,
) = dataset('english-german-both.pkl')

In [6]:
# Shapes of the encoder-side (trainX) and decoder-side (trainY) tensors, one per line
print(trainX.shape, trainY.shape, sep="\n")

(9000, 7)
(9000, 12)


In [11]:
# First source sequence in full, then the same sequence without its first token.
# The training loop feeds the encoder the [:, 1:] slice.
# Input Encoder TODO why skip first word?
print(trainX[0, :], trainX[0, 1:], sep="\n")

tf.Tensor([ 1 57  4 18 62  2  0], shape=(7,), dtype=int64)
tf.Tensor([57  4 18 62  2  0], shape=(6,), dtype=int64)


In [15]:
# First target sequence, sliced the way the training loop slices it:
#   [:-1] -> decoder input  (everything except the last position)
#   [1:]  -> decoder output (everything except the first position)
print(trainY[0, :-1], trainY[0, 1:], sep="\n")

tf.Tensor([  1 122   6   5 975  38   2   0   0   0   0], shape=(11,), dtype=int64)
tf.Tensor([122   6   5 975  38   2   0   0   0   0   0], shape=(11,), dtype=int64)


In [3]:
# NOTE: this cell reads trainX, trainY, enc_seq_length, dec_seq_length,
# enc_vocab_size and dec_vocab_size, so the dataset-preparation cell must
# have been executed first.

# ---- Model hyperparameters ----
h = 8            # self-attention heads
d_k = 64         # size of the linearly projected queries and keys
d_v = 64         # size of the linearly projected values
d_model = 512    # size of the outputs of the model's layers
d_ff = 2048      # size of the inner fully connected layer
n = 6            # number of layers in the encoder stack

# ---- Training hyperparameters ----
epochs = 2
batch_size = 64
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.1

# Adam optimizer whose learning rate comes from the LRScheduler schedule
optimizer = Adam(
    learning_rate=LRScheduler(d_model),
    beta_1=beta_1,
    beta_2=beta_2,
    epsilon=epsilon,
)

# Pair up source/target rows and group them into fixed-size batches
train_dataset = data.Dataset.from_tensor_slices((trainX, trainY)).batch(batch_size)

# The Transformer being trained
training_model = TransformerModel(
    enc_seq_length,
    dec_seq_length,
    h,
    d_k,
    d_v,
    d_model,
    d_ff,
    n,
    dropout_rate,
    enc_vocab_size,
    dec_vocab_size,
)

# Running means used to report loss and accuracy during training
train_loss = Mean(name='train_loss')
train_accuracy = Mean(name='train_accuracy')

# Checkpoint covering both model and optimizer; the manager keeps at most
# three checkpoints under ./checkpoints
ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)
ckpt_manager = train.CheckpointManager(ckpt, "./checkpoints", max_to_keep=3)

# Speeding up the training process by compiling the step with @function
@function
def train_step(encoder_input, decoder_input, decoder_output):
    """Run one optimisation step on a single batch and update the running metrics.

    Args:
        encoder_input: batch passed to ``training_model`` as its first argument.
        decoder_input: batch passed to ``training_model`` as its second argument.
        decoder_output: targets that ``loss_fcn`` and ``accuracy_fcn`` compare
            the prediction against.

    Returns:
        None. As side effects, one optimizer update is applied to
        ``training_model.trainable_weights`` and the batch loss / accuracy are
        added to the ``train_loss`` / ``train_accuracy`` metrics.
    """
    with GradientTape() as tape:
        # Forward pass in training mode to generate a prediction
        prediction = training_model(encoder_input, decoder_input, training=True)

        # Only the loss needs to be recorded on the tape
        loss = loss_fcn(decoder_output, prediction)

    # Accuracy is reported but never differentiated, so it is computed outside
    # the tape context instead of being recorded for nothing.
    accuracy = accuracy_fcn(decoder_output, prediction)

    # Gradients of the training loss with respect to the trainable variables
    gradients = tape.gradient(loss, training_model.trainable_weights)

    # Update the trainable variables by gradient descent
    optimizer.apply_gradients(zip(gradients, training_model.trainable_weights))

    # Accumulate this batch into the running means
    train_loss(loss)
    train_accuracy(accuracy)


# Start the wall-clock timer once, before the first epoch, so the final
# "Total time taken" line covers the whole run and not just the last epoch.
start_time = time()

for epoch in range(epochs):

    # The metrics are running means: clear them so every epoch reports its own averages
    train_loss.reset_states()
    train_accuracy.reset_states()

    print("\nStart of epoch %d" % (epoch + 1))

    # Iterate over the dataset batches
    for step, (train_batchX, train_batchY) in enumerate(train_dataset):

        # Encoder input: source batch without its first position.
        # Decoder input / output: the target batch without its last / first
        # position, i.e. the same sequence shifted by one step.
        encoder_input = train_batchX[:, 1:]
        decoder_input = train_batchY[:, :-1]
        decoder_output = train_batchY[:, 1:]

        train_step(encoder_input, decoder_input, decoder_output)

        # Progress report every 50 steps (running averages for the current epoch)
        if step % 50 == 0:
            print(f'Epoch {epoch + 1} Step {step} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    # Print epoch number and loss value at the end of every epoch
    print("Epoch %d: Training Loss %.4f, Training Accuracy %.4f" % (epoch + 1, train_loss.result(), train_accuracy.result()))

    # Save a checkpoint after every five epochs.
    # NOTE(review): when `epochs` is smaller than 5 this condition is never
    # true, so such a run finishes without writing any checkpoint.
    if (epoch + 1) % 5 == 0:
        save_path = ckpt_manager.save()
        print("Saved checkpoint at epoch %d" % (epoch + 1))

print("Total time taken: %.2fs" % (time() - start_time))


Start of epoch 1
(64, 6) Encoder Input
(64, 11) Decoder Input
(64, 11) Decoder Output
tf.Tensor([252  21   2   0   0   0], shape=(6,), dtype=int64) Encoder Input
tf.Tensor([  1 423   6   8   2   0   0   0   0   0   0], shape=(11,), dtype=int64) Decoder Input
tf.Tensor([423   6   8   2   0   0   0   0   0   0   0], shape=(11,), dtype=int64) Decoder Output
(64, 1, 1, 6) Mask
(64, 1, 1, 11) Mask
(64, 11, 3688) Prediction
Tensor("strided_slice:0", shape=(11, 3688), dtype=float32) Prediction
(64, 1, 1, 6) Mask
(64, 1, 1, 11) Mask
(64, 11, 3688) Prediction
Tensor("strided_slice:0", shape=(11, 3688), dtype=float32) Prediction


KeyboardInterrupt: 