In [1]:
import pm4py
import pandas as pd
import numpy as np
import random
import os
from itertools import permutations
from datetime import datetime
from tqdm import tqdm
from matplotlib import pyplot as plt
from pm4py.objects.conversion.log import converter as log_converter
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

import keras
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Reshape, LSTM, LeakyReLU, GaussianNoise
from keras.optimizers import Adam
from keras import Model
from keras.utils.vis_utils import plot_model

class TimestampGAN:
    """
    Class for repairing identical timestamp errors in event logs.
    Instructions:
        1.) Initialize object
        2.) object.read_data()
        3.) object.preprocess_data()
        4.) object.construct_networks()
        5.) object.train()
        6.) object.repair_event_log()
    """
        
    def __init__(self):
        self.df = None
        self.df_original = None
        self.n_suffix = None
        self.scaler = None
        self.encoder = None
        self.current_activities = None
        self.current_deltas = None
        self.succeeding_activities = None
        self.succeeding_deltas = None
        self.latent_dim = None
        self.generator = None
        self.discriminator = None
        self.gan = None
        self.is_skewed = False
        
        
    def read_data(self, data_path, id_column, timestamp_column, activity_column):
        """
        Reads the given data as a pandas dataframe and drops irrelevant columns.
        
        Parameters:
            data_path (str): The path where the .csv or .xes file is stored (e.g. 'C:/event_log.csv')
            id_column (str): The name of the column, that contains the case id. With .xes typically 'case:concept:name'
            timestamp_column (str): The name of the column that contains the timestamp. With .xes typically 'time:timestamp'
            activity_column (str): The name of the column that contains the activity. With .xes typically 'concept:name'
        """
        if data_path.endswith('.csv'):
            print('Reading .csv file...')
            df = pd.read_csv(data_path)
        else:
            print('Reading .xes file...')
            xes_file = pm4py.read_xes(data_path)
            df = log_converter.apply(xes_file, variant=log_converter.Variants.TO_DATA_FRAME)
        df = df[[id_column, timestamp_column, activity_column]]
        df.columns = ['id', 'timestamp', 'activity']
        df = df.sort_values(['id', 'timestamp']).reset_index(drop=True)
        df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True) # ensure timestamp datetype
        self.df = df
        
    def preprocess_data(self, n_suffix = 1):
        """
        Flags identical timestamp errors, applies the timestamp transformation
        (time difference in seconds followed by min max scaling), the activity
        encoding (one hot encoding) and constructs the final dataset for the GAN.
        
        Sets self.df, self.df_original, self.scaler, self.encoder, self.n_suffix as
        well as the four training arrays.
        
        Parameters:
            n_suffix (int): Number of succeeding events to take into account.
        """
        
        print("Detecting identical timestamp errors...")
        self.detect_error()
        
        print("Preprocessing data...")
        self.n_suffix = n_suffix
        
        # Drop incomplete rows before the time difference is computed. The very first
        # row has no predecessor and therefore gets NaN from diff(); dropping NaN rows
        # afterwards would silently remove the first event of the log.
        self.df = self.df.dropna().reset_index(drop=True)
        
        # Add time difference to the preceding row (in seconds)
        self.df['delta_t'] = self.df['timestamp'].diff().dt.total_seconds()
        
        # The first event of a case has no predecessor within its case: replace its
        # time difference with the median difference of its activity (0 if unknown)
        is_first = ~self.df['id'].duplicated()
        delta_t_median = self.df[~is_first].groupby('activity')['delta_t'].median().to_dict()
        self.df.loc[is_first, 'delta_t'] = self.df.loc[is_first, 'activity'].map(delta_t_median).fillna(0)
        
        # Replace duration of lines with identical timestamp errors with median of its activity
        erroneous_rows = self.df.index[self.df['delta_t'] == 0].to_list()
        self.df.loc[erroneous_rows, 'delta_t'] = self.df.loc[erroneous_rows, 'activity'].map(delta_t_median).fillna(0)
        
        # Min max scale time difference
        self.scaler = MinMaxScaler()
        self.df['delta_t_norm'] = self.scaler.fit_transform(self.df[['delta_t']])
        
        # One hot encode activities. Newer scikit-learn releases call the flag
        # 'sparse_output', older ones only know 'sparse'.
        try:
            self.encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            self.encoder = OneHotEncoder(sparse=False)
        self.df['activities_encoded'] = self.encoder.fit_transform(self.df[['activity']]).tolist()
        
        # save the current df as df_original for later evaluation; here the erroneous
        # rows keep a time difference of 0 so that they stay recognisable
        self.df_original = self.df.copy()
        self.df_original.loc[erroneous_rows, 'delta_t'] = 0
        # NOTE(review): this refits self.scaler on the frame that contains the zeros,
        # i.e. on slightly different data than self.df was scaled with - confirm intended
        self.df_original['delta_t_norm'] = self.scaler.fit_transform(self.df_original[['delta_t']])
        
        self.df = self.df.dropna().reset_index(drop=True)
        
        # drop cases that do not have more than n_suffix events, since they cannot
        # provide a current event plus n_suffix succeeding events
        events_per_case = self.df.groupby('id')['timestamp'].count()
        to_keep = events_per_case.index[events_per_case > self.n_suffix]
        self.df = self.df[self.df['id'].isin(to_keep)].reset_index(drop=True)
        
        # Construct sequential dataset for the GAN
        self.current_activities, self.current_deltas, self.succeeding_activities, self.succeeding_deltas = self.construct_sequential_dataset()
        
    def detect_error(self):
        """
        Detects identical timestamp errors in a dataframe. Adds an equivalency_flag attribute to 
        the dataframe.
        
        Parameters:
            
        """
        df = self.df
        
        booleans = [[] for x in range(len(df))]
        booleans[0].append(False)
        booleans[-1].append(False)
        for i in tqdm(range(df.shape[0] - 1)):
            # different case
            if df.iloc[i, 0] != df.iloc[i+1, 0]:
                booleans[i].append(False)
                booleans[i+1].append(False)
                continue
            # same case
            else:
                # timestamp-equivalent events
                if df.iloc[i, 1] == df.iloc[i+1, 1]:
                    booleans[i].append(True)
                    booleans[i+1].append(True)
                # no timestamp-equivalent events
                else:
                    booleans[i].append(False)
                    booleans[i+1].append(False)
        equivalency_flag = [x[0] or x[1] for x in booleans]
        self.df['equivalency_flag'] = equivalency_flag
        
        
    def construct_sequential_dataset(self):
        """
        Builds the training samples for the GAN. Every event that is followed by at
        least n_suffix further events of the same case yields one sample.
        
        Returns:
            current_activity (np.array): The one hot encoded activity of the current event
            current_delta (np.array): The scaled time difference of the current event
            succeeding_activities (np.array): The one hot encoded activities of the suffix events
            succeeding_deltas (np.array): The scaled time differences of the suffix events
        """
        
        print("Constructing sequential dataset...")
        samples_act, samples_delta, samples_suffix_act, samples_suffix_delta = [], [], [], []
        
        rows_per_case = self.df.groupby('id').groups
        
        for case_id in tqdm(self.df['id'].unique()):
            case_events = self.df.iloc[rows_per_case[case_id]].reset_index()
            n_events = case_events.shape[0]
            # only positions with a complete suffix inside the case are usable
            usable = [pos for pos in range(n_events) if pos + self.n_suffix < n_events]
            for pos in usable:
                suffix_end = pos + self.n_suffix  # .loc slices include the end label
                
                samples_act.append(np.array([case_events.loc[pos, 'activities_encoded']]))
                samples_delta.append(np.array([case_events.loc[pos, 'delta_t_norm']]))
                samples_suffix_act.append(
                    np.array(case_events.loc[pos + 1:suffix_end, 'activities_encoded'].to_list()))
                samples_suffix_delta.append(
                    np.array(case_events.loc[pos + 1:suffix_end, 'delta_t_norm'].to_list()))
                
        return (np.array(samples_act), np.array(samples_delta),
                np.array(samples_suffix_act), np.array(samples_suffix_delta))
    
    def construct_networks(self, latent_dim = 100):
        """
        Builds generator, discriminator and the combined GAN and stores them on the
        instance. The number of activity classes is taken from the fitted encoder.
        
        Parameters:
            latent_dim (int): The dimension of the latent space
        
        """
        print("Constructing networks...")
        self.latent_dim = latent_dim
        n_activity_classes = self.encoder.categories_[0].shape[0]
        self.generator = self.define_generator(n_classes = n_activity_classes, latent_dim = latent_dim)
        self.discriminator = self.define_discriminator(n_classes = n_activity_classes)
        self.gan = self.define_gan()
        
        
    def define_generator(self, n_classes, latent_dim):
        """
        Defines the generator. It receives the current activity, the suffix
        activities, the suffix time differences and a latent vector and outputs one
        value in [0, 1] (sigmoid), the scaled time difference of the current event.
        
        Parameters:
            n_classes (int): The amount of unique activities performed in the event log
            latent_dim (int): The dimension of the latent space
            
        Returns:
            generator (keras.Functional): The generator network (not compiled)
        """
        # one hot encoded activities as input
        # NOTE(review): the concatenation uses the default axis, which appears to
        # require n_suffix == 1 for the two activity inputs to be compatible - confirm
        input_current_activity = Input(shape = (1, n_classes), name='input_current_activity_one_hot_encoded')
        input_succeeding_activity = Input(shape = (self.n_suffix, n_classes), name='input_succeeding_activities_one_hot_encoded')
        layer_activities = Concatenate()([input_current_activity, input_succeeding_activity])
        layer_activities = Reshape((2, -1))(layer_activities)
        
        # time difference inputs; the shape has to be a tuple, '(n)' is just an int
        input_succeeding_deltas = Input(shape=(self.n_suffix,), name='input_succeeding_deltas')
        layer_deltas = Dense(2)(input_succeeding_deltas)
        layer_deltas = Reshape((2,-1))(layer_deltas)
        
        # latent space input (reshaped into two time steps below)
        input_latent = Input(shape=(latent_dim,), name='gaussian_input')
        layer_latent = Dense(latent_dim)(input_latent)        
        layer_latent = Reshape((2, -1))(layer_latent)
        
        
        # combining the different inputs
        layer = Concatenate()([layer_activities, layer_deltas, layer_latent])
        
        # hidden layers
        layer = Reshape((2,-1))(layer)
        layer = LSTM(500, return_sequences=True, dropout=0.25)(layer)
        layer = LSTM(300, return_sequences=False, activation='relu')(layer)
        layer = Dense(100)(layer)
        layer = LeakyReLU(alpha=0.2)(layer)
        layer = Dense(10)(layer)
        layer = LeakyReLU(alpha=0.2)(layer)
        layer = Dense(1, activation='sigmoid')(layer)
        generator = Model([input_current_activity, input_succeeding_activity, 
                           input_succeeding_deltas, input_latent], layer)
        return generator
    
    def define_discriminator(self, n_classes):
        """
        Defines and compiles the discriminator. It receives the current activity,
        the suffix activities, the time difference of the current event and the
        suffix time differences and outputs one value in [0, 1] (sigmoid).
        
        Parameters:
            n_classes (int): The amount of unique activities performed in the event log
            
        Returns:
            discriminator (keras.Functional): The compiled discriminator network
        """
        # one hot encoded activities as input
        input_current_activity = Input(shape = (1, n_classes), name='input_current_activity_one_hot_encoded')
        input_succeeding_activity = Input(shape = (self.n_suffix, n_classes), name='input_succeeding_activities_one_hot_encoded')
        layer_activities = Concatenate()([input_current_activity, input_succeeding_activity])
        layer_activities = Reshape((2, -1))(layer_activities)
        
        
        # timestamp inputs; the shapes have to be tuples, '(1)' is just an int
        input_current_deltas = Input(shape=(1,), name='input_current_deltas')
        input_succeeding_deltas = Input(shape=(self.n_suffix,), name='input_succeeding_deltas')
        layer_deltas = Concatenate()([input_current_deltas, input_succeeding_deltas])
        layer_deltas = Reshape((2,-1))(layer_deltas)
        
        # combining the different inputs
        layer = Concatenate()([layer_activities, layer_deltas])
        layer = GaussianNoise(0.1)(layer)
        
        # hidden layers
        layer = Reshape((2,-1))(layer)
        layer = LSTM(500, return_sequences=True)(layer)
        layer = LSTM(300, activation='relu', return_sequences=False)(layer)
        layer = Dense(50)(layer)
        layer = LeakyReLU(alpha=0.2)(layer)
        layer = Dense(10)(layer)
        layer = LeakyReLU(alpha=0.2)(layer)
        layer = Dense(1, activation='sigmoid')(layer)
        
        discriminator = Model([input_current_activity, input_succeeding_activity,
                               input_current_deltas, input_succeeding_deltas], layer)
        # 'learning_rate' is the current argument name; the old alias 'lr' is no
        # longer accepted by recent Keras releases
        opt = Adam(learning_rate=0.0002, beta_1=0.5)
        discriminator.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
        return discriminator
        
    def define_gan(self):
        """
        Constructs the GAN by combining the discriminator and the generator. The
        generator output takes the place of the discriminator's current time
        difference input. The discriminator is set to non-trainable beforehand.
        
        Returns:
            gan (keras.Functional): The complete, compiled GAN
        """
        self.discriminator.trainable=False
        input_current_activity, input_succeeding_activity, input_succeeding_deltas, input_latent = self.generator.input
        generator_output = self.generator.output
        # input order of the discriminator: activities, suffix activities, current delta, suffix deltas
        gan_output = self.discriminator([input_current_activity, input_succeeding_activity, generator_output, input_succeeding_deltas])
        gan = Model([input_current_activity, input_succeeding_activity, input_succeeding_deltas, input_latent], gan_output)
        # 'learning_rate' is the current argument name; the old alias 'lr' is no
        # longer accepted by recent Keras releases
        opt = Adam(learning_rate=0.0002, beta_1=0.5)
        gan.compile(loss='binary_crossentropy', optimizer=opt)
        return gan
    
    def get_real_samples(self, n_samples):
        """
        Randomly picks n real samples of data.
        
        Parameters:
            n_samples (int): The amount of samples to be picked.
            
        Returns:
            current_act (np.array): The current activity as a one hot encoded vector
            succeeding_act (np.array): The suffix activities as one hot encoded vectors
            current_time (np.array): The actual time difference of the current event
            succeeding_time (np.array): The suffix time differences
            y (np.array): The labels of the samples, in this case 1.
        """
        indices = np.random.randint(0, self.current_activities.shape[0], n_samples)
        
        # activities
        current_act = self.current_activities[indices]
        succeeding_act = self.succeeding_activities[indices]
        
        # timestamps
        current_time = self.current_deltas[indices]
        succeeding_time = self.succeeding_deltas[indices]
        
        # labels
        y = np.ones((n_samples, 1))
        return [current_act, succeeding_act, current_time, succeeding_time], y
    
    def get_generator_input(self, n_samples):
        """
        Constructs the generator's input by randomly picking samples of data as well as latent input.
        
        Parameters:
            n_samples (int): The number of samples to be picked.
            
        Returns:
            current_act (np.array): The current activity as a one hot encoded vector
            succeeding_act (np.array): The suffix activities as one hot encoded vectors
            succeeding_time (np.array): The suffix time differences
            latent (np.array): The random gaussian noise
        """
        # indices
        indices = np.random.randint(0, self.current_activities.shape[0], n_samples)
        
        # activities
        current_act = self.current_activities[indices]
        succeeding_act = self.succeeding_activities[indices]
        
        # timestamps
        succeeding_time = self.succeeding_deltas[indices]
        
        # gaussian noise
        latent = np.random.randn(n_samples, self.latent_dim)
        return [current_act, succeeding_act, succeeding_time, latent]
        
    def generate_fake_samples(self, n_samples):
        """
        Generates n_samples of data by the generator.
        
        Parameters:
            n_samples (int): The amount of samples to be generated.
            
        Returns:
            current_act (np.array): The current activity as a one hot encoded vector
            succeeding_act (np.array): The suffix activities as one hot encoded vectors
            generated (np.array): The generated time difference for the current event
            succeeding_time (np.array): The suffix time differences
            y (np.array): The labels of the samples, in this case 0.
        """
        current_act, succeeding_act, succeeding_time, latent = self.get_generator_input(n_samples)
        generated = self.generator.predict([current_act, succeeding_act, succeeding_time, latent])
        y = np.zeros((n_samples,1))
        return [current_act, succeeding_act, generated, succeeding_time], y

    
    def train(self, n_epochs, n_batch, output_path):
        """
        Trains the GAN.
        
        Parameters:
            n_epochs (int): The amount of epochs, the GAN should be trained for.
            n_batch (int): The batch size for a single training run.
            output_path (str): The output path, where model files as well as logging will be saved.
            
        Returns:
            d_loss_real (list): The historical accuracies of the discriminator on real samples
            d_loss_fake (list): The historical accuracies of the discriminator on fake samples
            g_loss (list): The historical accuracies of the generator
        """
        print("Creating folder structure...")
        os.mkdir(output_path)
        os.mkdir(output_path + '/model_files')
        os.mkdir(output_path + '/logging')
        
        print("Training the networks...")
        bat_per_epo = int(self.current_activities.shape[0] / n_batch)
        half_batch = int(n_batch / 2)
        
        d_loss_real = []
        d_loss_fake = []
        g_loss = []
        
        for i in range(n_epochs):
            for j in range(bat_per_epo):
            
                # get randomly selected 'real' samples
                disc_input_real, y_real = self.get_real_samples(half_batch)
                d_loss1, _ = self.discriminator.train_on_batch(disc_input_real, y_real, verbose=0)
                d_loss_real.append(d_loss1)
                
                # get randomly selected 'fake' samples
                disc_input_fake, y_fake = self.generate_fake_samples(half_batch)
                d_loss2, _ = self.discriminator.train_on_batch(disc_input_fake, y_fake, verbose=0)
                d_loss_fake.append(d_loss2)
                
                x_gan = self.get_generator_input(n_batch)
                y_gan = np.ones((n_batch, 1))
                gan_loss = self.gan.train_on_batch(x_gan, y_gan, verbose=0)
                g_loss.append(gan_loss)
                
                print('>%d, %d/%d, d1=%.3f, d2=%.3f g=%.3f' %(i+1, j+1, bat_per_epo, d_loss1, d_loss2, gan_loss))
                
                file = open(output_path + '/logging/metric_log.txt', 'a')
                file.write('>%d, %d/%d, d1=%.3f, d2=%.3f g=%.3f' %(i+1, j+1, bat_per_epo, d_loss1, d_loss2, gan_loss))
                file.write('\n')
                file.close()
                
                if j % 250 == 0:    
                    self.generator.save(output_path + '/model_files/epoch_' + str(i).zfill(4) + '_batch_' + str(j).zfill(4) + '.h5')
                    eval_pred = self.evaluate_random_sample(25000)
                    print('MAE of random evaluation:', eval_pred)
                    file = open(output_path + '/logging/eval_log.txt', 'a')
                    file.write('>%d, %d/%d, d1=%.3f' %(i+1, j+1, bat_per_epo, eval_pred))
                    file.write('\n')
                    file.close()
        return d_loss_real, d_loss_fake, g_loss
        
        
    def evaluate_random_sample(self, n_samples):
        """
        Generates a prediction for n random samples and compares it to its actual value.
        
        Parameters:
            n_samples (int): The amount of evaluation samples that should be considered.
        
        Returns:
            difference (float): The mean absolute error
        """
        # indices
        indices = np.random.randint(0, self.current_activities.shape[0], n_samples)
        
        # activities
        current_act = self.current_activities[indices]
        succeeding_act = self.succeeding_activities[indices]
        
        # timestamps
        succeeding_time = self.succeeding_deltas[indices]
        
        # gaussian noise
        latent = np.random.randn(n_samples, self.latent_dim)
        
        # compare predicted to actual timestamp
        actual = self.current_deltas[indices]
        preds = self.generator.predict([current_act, succeeding_act, succeeding_time, latent])
        difference = abs(actual - preds).mean()
        return difference
        
    def repair_event_log(self, output_path):
        """
        Repairs an event log affected by identical timestamp errors by
        re-estimating the time differences on the log kept in self.df_original.
        
        Parameters:
            output_path (str): root path to where cGAN files were saved
        
        Returns:
            repaired_log (pandas.DataFrame): the repaired event log
        """
        # The reordering step (reorder_events) is currently not part of the
        # pipeline; the timestamps are repaired on the log in its stored order.
        print("Repairing corrupted timestamps...")
        return self.repair_timestamps(output_path, self.df_original)

    def reorder_events(self):
        """
        Reorders events affected by identical timestamp errors according to the
        orders determined by get_best_order(). Events that are not flagged keep
        their row position.
        
        Parameters:
        
        Returns:
            reordered_log (pandas.DataFrame): The reordered event log
        """
        def number_occurrences(sequence):
            # Pair every element with its running occurrence count. Tuples are used as
            # keys instead of concatenated strings, so that e.g. ('A', 10) and ('A1', 0)
            # stay distinct and activities do not have to be strings.
            seen = {}
            keyed = []
            for element in sequence:
                occurrence = seen.get(element, 0)
                keyed.append((element, occurrence))
                seen[element] = occurrence + 1
            return keyed
        
        flagged = self.df_original[self.df_original.equivalency_flag]
        indices = flagged.index.to_list()
        old_order = flagged.activity.to_list()
        
        best_order = self.get_best_order().best_order.to_list()
        best_order = [activity for group in best_order for activity in group]
        
        old_keys = number_occurrences(old_order)
        best_keys = number_occurrences(best_order)
        
        # dictionaries replace the repeated linear list.index() searches
        old_position = {key: position for position, key in enumerate(old_keys)}
        best_rank = {key: position for position, key in enumerate(best_keys)}
        
        # row indices of the flagged events, arranged in their best order
        ordered_keys = sorted(old_keys, key=best_rank.__getitem__)
        repaired_rows = [indices[old_position[key]] for key in ordered_keys]
        
        # Unflagged rows stay at their own position, the positions of flagged rows are
        # filled with the reordered flagged rows one after another.
        # Assumes df_original has a default 0..n-1 index, as set in preprocess_data().
        flagged_rows = set(repaired_rows)
        next_repaired = iter(repaired_rows)
        new_order = [next(next_repaired) if row in flagged_rows else row
                     for row in range(self.df_original.shape[0])]
        reordered_log = self.df_original.iloc[new_order].reset_index(drop=True)
        return reordered_log
    
    def get_pairwise_confidences(self):
        """
        Calculates the pairwise confidences for all activity combinations in the event log.
        
        Parameters:
        
        Returns:
            pairwise_confidences (dict): Mapping of activity combination to confidence
        """
        temp = self.df_original[~self.df_original['equivalency_flag']]
        temp = temp.reset_index(drop=True)
        occurences = {}
        last = temp.reset_index().groupby('id').last()['index'].to_list()


        for activity in tqdm(temp.activity.unique()):
            subs = list(temp[temp['activity'] == activity].index + 1)
            subs = [x for x in subs if x not in last and x < temp.shape[0]]
            occurences[activity] = temp.iloc[subs,2].value_counts().to_dict()

        pairwise_confidences = {}

        for activity in temp.activity.unique():
            for other_activity in temp.activity.unique():
                if other_activity not in occurences[activity].keys():
                    pairwise_confidences[(activity, other_activity)] = 0
                else:
                    conf = occurences[activity][other_activity] / temp[temp['activity'] == activity].shape[0]
                    pairwise_confidences[(activity, other_activity)] = conf
        return pairwise_confidences
    
    def get_best_order(self):
        """
        Calculates the best order for affected events. The flagged events are
        grouped by case and timestamp; for every group the permutation with the
        highest sum of pairwise confidences is chosen (the first one on ties).
        Groups with more than 9 events keep their order, since the number of
        permutations grows factorially.
        
        Returns:
            best_order (pandas.DataFrame): One row per timestamp-equivalent group with the columns
                                           'id', 'timestamp', 'activity' and 'best_order' (list)
        """
        pairwise_confidences = self.get_pairwise_confidences()
        flagged = self.df_original[self.df_original['equivalency_flag']]
        to_repair = flagged.groupby(['id', 'timestamp'])['activity'].apply(list).reset_index()
        
        def sequence_confidence(order):
            # Sum of the confidences of all neighbouring pairs. An order that contains a
            # pair without a known confidence is rated 0 as a whole.
            try:
                return sum(pairwise_confidences[pair] for pair in zip(order, order[1:]))
            except KeyError:
                return 0
        
        best_sequences = []
        for sequence in tqdm(to_repair.iloc[:, -1]):
            if len(sequence) > 9:
                best_sequences.append(sequence)
                continue
            # max() returns the first permutation that reaches the highest rating
            best_sequence = max(permutations(sequence), key=sequence_confidence)
            best_sequences.append(list(best_sequence))

        to_repair['best_order'] = best_sequences
        return to_repair
    
    def repair_timestamps(self, output_path, reordered_log, n_fallback_preds = 200):
        """
        Estimates the time differences of affected events on which basis the corrupted timestamp can be repaired.
        An event counts as affected if its time difference to the preceding row is 0. The estimate is
        written to the column 'predicted_delta'; the passed dataframe itself is not modified.
        
        Parameters:
            output_path (str): root path to where cGAN files were saved
            reordered_log (pandas.DataFrame): the reordered event log
            n_fallback_preds (int): the number of predictions to make in case of fallback
            
        Returns:
            repaired_log (pandas.DataFrame): The reordered and reestimated event log.
        """
        
        model = self.get_best_model(output_path)
        
        print("Estimating time difference for erroneous events...")
        
        # dropna() returns a new frame, so the caller's log (e.g. self.df_original) stays untouched.
        # It is applied before diff(): the first row gets NaN from diff() and would be lost otherwise.
        reordered_log = reordered_log.dropna().reset_index(drop=True)
        
        # recalculate delta_t since order of events may have changed
        reordered_log['delta_t'] = reordered_log['timestamp'].diff().dt.total_seconds()
        
        # replace the time difference of the first event of a case with the median difference of its activity
        is_first = ~reordered_log['id'].duplicated()
        delta_t_median = reordered_log[~is_first].groupby('activity')['delta_t'].median().to_dict()
        reordered_log.loc[is_first, 'delta_t'] = reordered_log.loc[is_first, 'activity'].map(delta_t_median).fillna(0)
        
        # NOTE(review): this refits the shared scaler on the log to repair - confirm that
        # this matches the scale the generator was trained with
        reordered_log['delta_t_norm'] = self.scaler.fit_transform(reordered_log[['delta_t']])
        
        reordered_log['predicted_delta'] = reordered_log['delta_t']
        
        success_count = 0
        fallback_count = 0
        n_rows = reordered_log.shape[0]
        deltas = reordered_log['delta_t'].to_numpy()
        
        # as before, the last row is not examined
        for i in range(n_rows - 1):
            # Correct event present, no need to repair
            if deltas[i] != 0:
                continue
            current = reordered_log.iloc[i]
            
            # Get next correct delta t: skip following events that are erroneous themselves
            start = i + 1
            while start < n_rows and deltas[start] == 0:
                start += 1
            succeeding = reordered_log.iloc[start : start + self.n_suffix]
            
            # Not enough correct events left to build the generator input
            if succeeding.shape[0] < self.n_suffix:
                reordered_log.loc[i, 'predicted_delta'] = 1
                fallback_count += 1
                continue

            # Construct generator inputs (the same input is repeated n_fallback_preds times,
            # only the gaussian noise differs)
            current_activity = np.array([current['activities_encoded']]).reshape(1,1,-1)
            current_activity_repeat = np.tile(current_activity,[n_fallback_preds,1,1])
            succeeding_activity = np.array(succeeding['activities_encoded'].to_list()).reshape(1,self.n_suffix,-1)
            succeeding_activity_repeat = np.tile(succeeding_activity,[n_fallback_preds,1,1])
            succeeding_deltas = np.array(succeeding['delta_t_norm'].to_list()).reshape(1,self.n_suffix)
            succeeding_deltas_repeat = np.tile(succeeding_deltas, [n_fallback_preds,1])
            gauss_repeat = np.random.randn(n_fallback_preds, self.latent_dim)

            # Estimate time difference
            preds = model.predict([current_activity_repeat, succeeding_activity_repeat,
                                   succeeding_deltas_repeat, gauss_repeat], verbose=0)

            # Reverse transform the predicted time difference (flattened to one value per prediction)
            preds = self.scaler.inverse_transform(preds).ravel()

            # Check if prediction is valid or if fallback needs to be applied
            upper_bound = succeeding.iloc[-1]['delta_t']
            valid_indices = np.flatnonzero((preds < upper_bound) & (preds >= 0))
            if valid_indices.shape[0] > 0:
                new_delta = int(preds[valid_indices[0]])
                success_count += 1
            elif int(preds.mean()) < upper_bound:
                new_delta = preds.mean()
                success_count += 1
            else:
                new_delta = 1
                fallback_count += 1
                
            reordered_log.loc[i, 'predicted_delta'] = new_delta
        print('Repair successfull! Statistics:\nSuccesful changes:', success_count, '\nFallback changes:', fallback_count)
        return reordered_log
        
    def get_best_model(self, output_path):
        """
        Determines the best cGAN according to three consecutive evaluation losses.
        The losses are read from '<output_path>/logging/eval_log.txt'; the model at
        the start of the window with the lowest mean loss is loaded. With fewer than
        three evaluations the model with the lowest single loss is taken.
        
        Parameters:
            output_path (str): root path to where cGAN files were saved
        
        Returns:
            best_model (keras.Functional): The best cGAN model
        
        Raises:
            ValueError: if the evaluation log does not contain any loss
        """
        print("Determining best cGAN model...")
        eval_losses = []
        with open(output_path + '/logging/eval_log.txt') as f:
            for line in f:
                # the loss is the value written after 'd1=' by train()
                if 'd1=' not in line:
                    continue
                eval_losses.append(float(line.split('d1=')[1].strip()))
        
        if not eval_losses:
            raise ValueError('No evaluation losses found in ' + output_path + '/logging/eval_log.txt')

        window = 3
        if len(eval_losses) < window:
            best_model = eval_losses.index(min(eval_losses))
        else:
            # every window of three consecutive losses, including the last one
            means = [sum(eval_losses[start : start + window]) / window
                     for start in range(len(eval_losses) - window + 1)]
            best_model = means.index(min(means))
        
        MODEL_PATH = output_path + '/model_files/'
        # os.listdir() returns the files in arbitrary order; the zero padded file names
        # sort in the order in which the models were saved
        MODEL_NAME = sorted(os.listdir(MODEL_PATH))[best_model]
        best_model = keras.models.load_model(MODEL_PATH + MODEL_NAME)
        return best_model

In [None]:
repair = TimestampGAN()
repair.read_data('log.csv', 'id', 'timestamp', 'activity')
repair.preprocess_data()
repair.construct_networks()
# Training and repair have to use the same folder, so the path is kept in one variable
output_path = 'Training/test_output_40/'
d_loss_real, d_loss_fake, g_loss = repair.train(n_epochs = 5, n_batch = 256, output_path = output_path)
repaired_log = repair.repair_event_log(output_path)