In [374]:
# Ensure that local modules are reloaded when edited
%load_ext autoreload
%autoreload 2

# Ensure that plots are displayed
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [375]:
import sys
import os
# Make the directory three levels above the working directory importable
# (presumably the project root that holds the `utils` and `novel` packages - confirm).
# The membership check keeps sys.path free of duplicates when the cell is re-run.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../..'))
if project_root not in sys.path:
    sys.path.append(project_root)

Reference tutorial: [Training the Transformer Model](https://machinelearningmastery.com/training-the-transformer-model/) (Machine Learning Mastery)

In [376]:
from keras.optimizers import Adam # type: ignore
from keras.metrics import Mean # type: ignore
from tensorflow import data, train, GradientTape, function # type: ignore

from novel.transformer.components.transformer import TransformerModel
from novel.transformer.components.utils import LRScheduler, loss_fcn, accuracy_fcn, likelihood_fcn
from novel.transformer.components.example_dataset import PrepareDataset

from time import time

In [377]:
from utils.dataset import Dataset
from utils.enums import EncodingCategorical, EncodingNumerical

# Settings for loading and encoding the event log
dataset_name = 'medium_debug_v2-0.15-4_1.json.gz'
prefix = True
pretrain_percentage = 0.0
w2v_vector_size = 20
w2v_window_size = 4
categorical_encoding = EncodingCategorical.WORD_2_VEC
numerical_encoding = EncodingNumerical.MIN_MAX_SCALING
fs_save = None

# Build the dataset from the settings above
dataset = Dataset(
    dataset_name,
    beta=0.005,
    prefix=prefix,
    pretrain_percentage=pretrain_percentage,
    w2v_vector_size=w2v_vector_size,
    w2v_window_size=w2v_window_size,
    categorical_encoding=categorical_encoding,
    numerical_encoding=numerical_encoding,
    fs_save=fs_save,
)

EncodingNumerical.MIN_MAX_SCALING EncodingNumerical.MIN_MAX_SCALING
0.0 81.0
0.0 1.0
EncodingNumerical.MIN_MAX_SCALING EncodingNumerical.MIN_MAX_SCALING
0.0 159.0
0.0 1.0
EncodingNumerical.MIN_MAX_SCALING EncodingNumerical.MIN_MAX_SCALING
0.0 27.0
0.0 1.0
EncodingNumerical.MIN_MAX_SCALING EncodingNumerical.MIN_MAX_SCALING
0.0 54.0
0.0 1.0
EncodingNumerical.MIN_MAX_SCALING EncodingNumerical.MIN_MAX_SCALING
0.0 12.0
0.0 1.0
Feature Columns: dict_keys(['name', 'arrival_time', 'company', 'country', 'department', 'global_workload_D', 'global_workload_h', 'local_workload_D', 'local_workload_h', 'user'])
Feature Shape: (35483, 13)
Case Length: [ 2  2  3 ...  8 10 10]
Attribute Types: [<AttributeType.CATEGORICAL: 0>, <AttributeType.NUMERICAL: 1>, <AttributeType.CATEGORICAL: 0>, <AttributeType.CATEGORICAL: 0>, <AttributeType.CATEGORICAL: 0>, <AttributeType.NUMERICAL: 1>, <AttributeType.NUMERICAL: 1>, <AttributeType.NUMERICAL: 1>, <AttributeType.NUMERICAL: 1>, <AttributeType.CATEGORICAL: 0>]
Enc

In [378]:
from utils.embedding.w2v import ProcessWord2Vec
import numpy as np

In [379]:
# Peek at the lengths of the first 20 cases
dataset.case_lens[:20]

array([2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 2, 4, 5, 5, 5, 5, 3, 2, 3, 3])

In [380]:
# Mean of the per-case lengths stored in dataset.case_lens
np.mean(dataset.case_lens)

5.491164783135586

In [381]:
# Shape of the first feature array held by the dataset
dataset.features[0].shape

(35483, 13)

In [382]:
# Widen numpy's print width so that each row is shown on a single line.
# Note: np.set_printoptions is a global setting and stays in effect for later cells.
np.set_printoptions(linewidth=200)

# First 20 rows of the first feature array
dataset.features[0][:20]

array([[1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 4., 5., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 3., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 3., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 3., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 3., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 2., 4., 0., 

In [383]:
# Widen numpy's print width further for the longer float representation.
# Note: np.set_printoptions is a global setting and stays in effect for later cells.
np.set_printoptions(linewidth=300)

# First 20 rows of the feature array at index 5
dataset.features[5][:20]

array([[0.        , 0.01886792, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.01886792, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.01886792, 0.01886792, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.01886792, 0.22012579, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.22012579, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.22012579, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.

In [384]:
# Build the Word2Vec feature processor from state already held by `dataset`.
# NOTE(review): this reads the private attribute `dataset._features`;
# confirm there is no public accessor that should be used instead.
w2v_encoder = ProcessWord2Vec(
    encoders=dataset.encoders,
    pretrain_percentage=dataset.pretrain_percentage,
    attribute_types=dataset.attribute_types,
    event_attribute_keys=dataset.event_log.event_attribute_keys,
    features=dataset._features,
    event_log=dataset.event_log,
    vector_size=dataset.w2v_vector_size,
    window=dataset.w2v_window_size,
    fs_save=dataset.fs_save,
)

In [385]:
# Encode the features; the call returns the Word2Vec and numeric feature
# arrays together with the attribute names that belong to each of them.
(
    w2v_features,
    numeric_features,
    numeric_feature_names,
    w2v_feature_names,
) = w2v_encoder.encode_features(average=False, match_numerical=True)

In [386]:
# Incoming layout:  (num_attribute, num_cases, num_events, vector_size)
# Reordered layout: (num_cases, num_events, num_attribute, vector_size)
print(w2v_features.shape)
transposed_w2v_features = w2v_features.transpose(1, 2, 0, 3)
print(transposed_w2v_features.shape)

(5, 35483, 13, 20)
(35483, 13, 5, 20)


In [387]:
# Move the leading attribute axis behind the event axis, so that the numeric
# block is indexed as (case, event, attribute, vector component).
print(numeric_features.shape)
transposed_numeric_features = numeric_features.transpose(1, 2, 0, 3)
print(transposed_numeric_features.shape)

(5, 35483, 13, 20)
(35483, 13, 5, 20)


In [388]:
# Length of the case at index 13.
# NOTE(review): the inspection cells that follow look at trace index 12, not 13 -
# confirm which index is intended here.
dataset.case_lens[13]

5

In [389]:
# Trace 12, every event position, first attribute slot of the numeric block
transposed_numeric_features[12,:,0]

array([[0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.07749242, 0.1545188 , 0.23061587, 0.30532598, 0.37819985, 0.4487992 , 0.5166994 , 0.58149207, 0.64278764, 0.70021737, 0.99699295, 0.98798984, 0.9730449 , 0.95224786, 0.92572397, 0.89363265, 0.8561669 , 0.8135521 , 0.76604444, 0.7139297 ],
       [0.11609291, 0.23061587, 0.34202015, 0.4487992 , 0.549509  , 0.64278764, 0.72737366, 0.8021232 , 0.8660254 , 0.9182161 , 0.99323833, 0.9730449 , 0.9396926 , 0.89363265, 0.8354878 , 0.76604444, 0.6862416 , 0.5971586 , 0.5       , 0.39607978],
    

In [390]:
# Trace 12, every event position, first attribute slot of the Word2Vec block
transposed_w2v_features[12,:,0]

array([[-0.01569275,  0.03015446, -0.03076373, -0.0099005 , -0.02991541, -0.0049784 , -0.01010493,  0.04242973,  0.00039001, -0.04287663, -0.02714549, -0.03437993,  0.01346191,  0.04728324, -0.02907998,  0.04132513,  0.04266026, -0.0353132 , -0.04441606,  0.04734592],
       [ 0.00890996, -0.0341445 , -0.04862406,  0.04520292,  0.03099027, -0.03456464,  0.01701741,  0.00103032,  0.02376873, -0.03559972,  0.02013477,  0.02173717,  0.04978685, -0.0223687 , -0.00694632, -0.03658661, -0.04848915, -0.04540129, -0.00511377, -0.03251645],
       [ 0.02424864, -0.03082013,  0.01259593,  0.0036972 , -0.01696077, -0.00489612,  0.04989563,  0.04572944, -0.02230915,  0.04541513, -0.02820882,  0.02965461, -0.01548609,  0.01715876,  0.01508613,  0.03450231, -0.01186942,  0.04387518,  0.03794714, -0.04773823],
       [-0.04478499, -0.003676  ,  0.04076253,  0.03845215, -0.03603058, -0.01833416,  0.01559276, -0.04785361,  0.0073822 ,  0.03262233,  0.0287321 , -0.04381531, -0.02258572, -0.0407008 ,  0.

In [391]:
# Shapes of the input data: both arrays are laid out as
# (traces, events, features, vector components).
num_traces, num_events, num_numeric_features, vector_size = transposed_numeric_features.shape
# Indexed rather than unpacked into `_`: assigning to `_` in IPython overrides
# the built-in "last output" variable for the rest of the session.
num_w2v_features = transposed_w2v_features.shape[2]

# Initialize the merged array
merged_features = np.zeros((num_traces, num_events, num_numeric_features + num_w2v_features, vector_size))

# Next unread feature slot in the numeric and the w2v array respectively
numeric_index, w2v_index = 0, 0

# Walk dataset.attribute_keys so each feature is written at its attribute's position.
# This relies on each source array storing its features in the same relative
# order in which their keys appear in dataset.attribute_keys.
for i, key in enumerate(dataset.attribute_keys):
    if key in numeric_feature_names:
        # Place numeric feature in the merged array
        merged_features[:, :, i, :] = transposed_numeric_features[:, :, numeric_index, :]
        numeric_index += 1
    elif key in w2v_feature_names:
        # Place w2v feature in the merged array
        merged_features[:, :, i, :] = transposed_w2v_features[:, :, w2v_index, :]
        w2v_index += 1
    else:
        raise ValueError(f"Unexpected attribute key '{key}' not found in either feature list.")

# The merged array now has features interlaced according to dataset.attribute_keys order
print("Merged features shape:", merged_features.shape)

Merged features shape: (35483, 13, 10, 20)


In [392]:
# Case lengths for trace indices 10 to 19
dataset.case_lens[10:20]

array([2, 4, 5, 5, 5, 5, 3, 2, 3, 3])

In [393]:
# All attribute vectors of trace 12 at event index 4
merged_features[12, 4]

array([[ 0.04776092, -0.03678212, -0.03635194, -0.01132694, -0.0038928 , -0.01608052, -0.00296293,  0.03744411, -0.00348759, -0.0081247 ,  0.013722  , -0.0417955 ,  0.03927902,  0.04268052, -0.04792044,  0.01223133,  0.04952485, -0.03832902, -0.03483459, -0.03868259],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.04801775,  0.02503647, -0.04379793, -0.02195913, -0.0001755 , -0.00148091, -0.0383062 ,  0.04807372,  0.02491029,  0.04616572, -0.04078959,  0.02247899, -0.02068538,  0.00412268,  0.0424931 , -0.02231088,  0.0225875 , -0.0339348 , -0.01774244,  0.04699254],
       [ 0.00047282,  0.0153866 , -0.03406323, -0.00687733,  0.0383429 ,  0.03673205, -0.01836649,  0.01321351, -0.04158565,  0.03102743, -0.02318661, -0.01582053,  0.04655678,  0.00436693,  0.

In [394]:
# Build next-event training pairs: the target (trainY) is the last real event
# of each case, the input (trainX) is the case with that event blanked out.
zero_event = np.zeros((num_numeric_features + num_w2v_features, vector_size))

# Work on a copy. Iterating over `merged_features` yields views, so blanking the
# target event through such a view would also erase it from `merged_features`
# and a second run of this cell would produce all-zero targets.
trainX = merged_features.copy()

# Position of the last real event of every case
trace_index = np.arange(len(trainX))
last_event_index = np.asarray(dataset.case_lens) - 1

# Integer-array indexing returns a copy, so trainY is not affected by the
# assignment below.
trainY = trainX[trace_index, last_event_index]

# Remove the target event from the training data
trainX[trace_index, last_event_index] = zero_event

In [411]:
# Shape of the input array (cases with their last event blanked out)
trainX.shape

(35483, 13, 10, 20)

In [395]:
# Shape of the target array (one event per case)
trainY.shape

(35483, 10, 20)

In [402]:
# Target event of the case at index 4
trainY[4]

array([[ 8.90995841e-03, -3.41444984e-02, -4.86240573e-02,  4.52029221e-02,  3.09902728e-02, -3.45646366e-02,  1.70174129e-02,  1.03031995e-03,  2.37687286e-02, -3.55997160e-02,  2.01347712e-02,  2.17371695e-02,  4.97868471e-02, -2.23686993e-02, -6.94631925e-03, -3.65866050e-02, -4.84891459e-02,
        -4.54012863e-02, -5.11377444e-03, -3.25164497e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-4.30984385e-02,  1.83286909e-02,  2.59494185e-02,  2.87096910e-02,  3.73345912e-02, -3.08383759e-02,  5.52806864e-03,  3.02364118e-02, -1.42002525e-02, -3.08676120e-02, -2.05111504e-03, -4.18447442e-02, -2.80000623e-02,  3.55226919e-02,  1.67626981e-02,  3.61283496e-02,  3.400

In [413]:
# First event of the case at index 4 in the input array
trainX[4,0]

array([[-0.01569275,  0.03015446, -0.03076373, -0.0099005 , -0.02991541, -0.0049784 , -0.01010493,  0.04242973,  0.00039001, -0.04287663, -0.02714549, -0.03437993,  0.01346191,  0.04728324, -0.02907998,  0.04132513,  0.04266026, -0.0353132 , -0.04441606,  0.04734592],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.04267167,  0.01603553, -0.02318999, -0.02544478,  0.01794809,  0.0268517 ,  0.03884757, -0.02883253,  0.0371668 ,  0.03312748, -0.018549  , -0.04372821,  0.02718734,  0.03254878, -0.00393775, -0.03354928, -0.03542963, -0.0124853 ,  0.02571627, -0.01832619],
       [ 0.00386665, -0.04247889,  0.03904903,  0.04628646, -0.01371164,  0.00400112,  0.00373326,  0.02738943, -0.04303039,  0.00292228,  0.03434711,  0.01115797,  0.00562338, -0.04661078,  0.

In [249]:
# Define the model parameters

h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_model = 512  # Dimensionality of model layers' outputs
d_ff = 2048  # Dimensionality of the inner fully connected layer
n = 6  # Number of layers in the encoder stack

# Define the training parameters
batch_size = 8
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.1

# Instantiate an Adam optimizer.
# Hyper-parameters are passed by keyword so the call does not depend on the
# positional order of the optimizer's constructor.
optimizer = Adam(learning_rate=LRScheduler(d_model), beta_1=beta_1, beta_2=beta_2, epsilon=epsilon)

# Prepare the training split of the example translation dataset.
# Distinct names are used on purpose: binding the results to `dataset`, `trainX`
# and `trainY` would overwrite the event-log objects built by earlier cells.
prepare_dataset = PrepareDataset()
(
    example_trainX,
    example_trainY,
    train_orig,
    enc_seq_length,
    dec_seq_length,
    enc_vocab_size,
    dec_vocab_size,
) = prepare_dataset('english-german-both.pkl')
print(f'X Length: {example_trainX.shape}')

# Prepare the dataset batches
train_dataset = data.Dataset.from_tensor_slices((example_trainX, example_trainY))
train_dataset = train_dataset.batch(batch_size)

# Create model
# NOTE(review): the recorded run of this cell stopped with
# "TypeError: __init__() takes 11 positional arguments but 12 were given".
# This call passes 11 positional arguments (12 counting self), so it is presumably
# the call being rejected - check TransformerModel's current signature and remove
# the surplus argument (or pass the arguments by keyword).
training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)

# Include metrics monitoring
train_loss = Mean(name='train_loss')
train_accuracy = Mean(name='train_accuracy')
train_likelihood = Mean(name='train_likelihood')

# Create a checkpoint object and manager to manage multiple checkpoints
ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)
ckpt_manager = train.CheckpointManager(ckpt, "./checkpoints", max_to_keep=3)

# Per-batch results collected by the training loop
predictions = []
targets = []
losses = []

# Speeding up the training process: the step is wrapped with tensorflow's `function` decorator
@function
def train_step(encoder_input, decoder_input, decoder_output):
    """Run one optimisation step on a single batch and update the running metrics.

    Args:
        encoder_input: Batch passed to the model as encoder input.
        decoder_input: Batch passed to the model as decoder input.
        decoder_output: Batch the prediction is compared against.

    Returns:
        The tuple ``(loss, prediction, decoder_output)``; ``decoder_output`` is
        handed back unchanged.
    """
    with GradientTape() as grad_tape:
        prediction = training_model(encoder_input, decoder_input, training=True)

        loss = loss_fcn(decoder_output, prediction)
        accuracy = accuracy_fcn(decoder_output, prediction)
        likelihood = likelihood_fcn(decoder_output, prediction)

    # Back-propagate the loss and let the optimizer update the model weights
    weights = training_model.trainable_weights
    optimizer.apply_gradients(zip(grad_tape.gradient(loss, weights), weights))

    # Feed this batch's values into the running means
    train_loss(loss)
    train_accuracy(accuracy)
    train_likelihood(likelihood)

    return loss, prediction, decoder_output

# Clear whatever the running means accumulated in a previous run
for metric in (train_loss, train_accuracy, train_likelihood):
    metric.reset_states()

print(f"\nStart of training in {len(train_dataset)} batches")

start_time = time()

# Single pass over the batched dataset
for step, (train_batchX, train_batchY) in enumerate(train_dataset):
    # The encoder input drops the first position of X; the decoder input and
    # the decoder target are the Y sequence shifted against each other by one.
    encoder_input = train_batchX[:, 1:]
    decoder_input, decoder_output = train_batchY[:, :-1], train_batchY[:, 1:]

    loss, prediction, target = train_step(encoder_input, decoder_input, decoder_output)

    # Keep the per-batch results for inspection in later cells
    losses.append(loss.numpy())
    predictions.append(prediction.numpy())
    targets.append(decoder_output.numpy())

    if step % 25 == 0:
        print(f'Step {step} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f} Likelihood {train_likelihood.result():.4f}')

# Print the running means over all processed batches
print("Training Loss %.4f, Training Accuracy %.4f, Training Likelihood %.4f" % (train_loss.result(), train_accuracy.result(), train_likelihood.result()))

print("Total time taken: %.2fs" % (time() - start_time))

X Length: (9000, 7)


TypeError: __init__() takes 11 positional arguments but 12 were given

In [None]:
# Number of batches for which a prediction was collected
len(predictions)

In [None]:
# Shape of the prediction array of the first batch
predictions[0].shape

In [None]:
# Shape of the target array of the first batch
targets[0].shape

In [None]:
# Loss values collected per batch during training
print(losses)

In [None]:
import matplotlib.pyplot as plt

# Plot the loss value recorded for every training batch
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(losses, label='Training Loss')
ax.set_xlabel('Batches')
ax.set_ylabel('Loss Value')
ax.set_title('Training Loss Over Time')
ax.legend()
ax.grid(True)
plt.show()