In [None]:
from preprocessing import Preprocessor

import numpy as np
import pandas as pd

import keras
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Reshape, LSTM, LeakyReLU, GaussianNoise, TimeDistributed
from keras.optimizers import Adam
from keras import Model
from keras.utils.vis_utils import plot_model
from keras_preprocessing.sequence import pad_sequences
from Levenshtein import distance as levenshtein_distance
from tqdm import tqdm as tqdm
from itertools import permutations 

from keras import backend as K

class ConfortiGAN:
    
    
    def __init__(self, path_to_log, id_column, timestamp_column, activity_column, output_path):
        """
        Creates the preprocessor for the event log and stores the basic settings.

        Parameters:
            path_to_log: passed to Preprocessor as its first argument
            id_column: passed to Preprocessor as its second argument
            timestamp_column: passed to Preprocessor as its third argument
            activity_column: passed to Preprocessor as its fourth argument
            output_path: stored unchanged; other methods append file names to it
        """
        # size of the last axis of the random latent input created for the generator
        self.latent_dim = 100
        # running counter that add_group_shift increments and returns
        self.group = 0
        self.output_path = output_path
        self.preprocessor = Preprocessor(
            path_to_log,
            id_column,
            timestamp_column,
            activity_column,
        )
        
        
    def prepare_data(self):
        self.preprocessor.prepare_data()
        self.n_classes = self.preprocessor.encoder.categories_[0].shape[0]
        self.df = self.preprocessor.df
        self.x_train = self.preprocessor.x_train
        self.y_train = self.preprocessor.y_train
        self.x_val = self.preprocessor.x_val
        self.y_val = self.preprocessor.y_val
        self.x_test = self.preprocessor.x_test
        self.y_test = self.preprocessor.y_test
        
        
    def reorder_events(self):
        indices = self.df[self.df.equivalency_flag].activity.reset_index()['index'].to_list()
        old_order = self.df[self.df.equivalency_flag].activity.to_list()
        indices = self.df[self.df.equivalency_flag].activity.reset_index()['index'].to_list()
        
        best_order = self.get_best_order()
    
        best_order = best_order.best_order.to_list()
        flat_list = [item for sublist in best_order for item in sublist]
        best_order = flat_list
        
        old_order_indexed = []
        index_dict = {key : 0 for key in set(old_order)}
        for el in old_order:
            old_order_indexed.append(el + str(index_dict[el]))
            index_dict[el] += 1

        best_order_indexed = []
        index_dict = {key : 0 for key in set(best_order)}
        for el in best_order:
            best_order_indexed.append(el + str(index_dict[el]))
            index_dict[el] += 1

        res = sorted(old_order_indexed, key = best_order_indexed.index)
        sorting = [old_order_indexed.index(x) for x in res]
        new_order = [indices[i] for i in sorting]
        for i in range(self.df.shape[0]):
            if i in new_order:
                continue
            else:
                new_order.insert(i, i)
        
        self.reordered_log = self.df.iloc[new_order].sort_values(by='id').reset_index(drop=True)

    
    def get_best_order(self):
        """
        Calculates the best order of the events of every case.

        For each case all permutations of its activities are scored by the sum
        of the pairwise confidences of directly neighbouring activities; the
        first permutation with the highest score wins. A permutation containing
        a pair without a known confidence is scored 0. Cases with more than 9
        events keep their current order because enumerating their permutations
        would take too long.

        The pairwise confidences are recomputed on every call and stored in
        self.pairwise_confidences.

        Returns:
            best_order (pandas.DataFrame): one row per case with the columns
                'id', 'activity' (current order) and 'best_order' (list)
        """
        self.pairwise_confidences = self.get_pairwise_confidences()
        confidences = self.pairwise_confidences
        to_repair = self.df.groupby(['id'])['activity'].apply(list).reset_index()
        best_sequences = []

        for sequence in tqdm(to_repair['activity']):
            if len(sequence) > 9:  # otherwise permutations will take too long to be calculated
                best_sequences.append(sequence)
                continue

            best_sequence = None
            best_confidence = None
            # Permutations are consumed lazily; only the best one so far is kept.
            for perm in permutations(sequence):
                try:
                    seq_conf = sum(confidences[pair] for pair in zip(perm, perm[1:]))
                except KeyError:
                    # at least one pair was never scored
                    seq_conf = 0
                # strict '>' keeps the earliest permutation on ties; the first
                # permutation yielded is the current order
                if best_confidence is None or seq_conf > best_confidence:
                    best_sequence = perm
                    best_confidence = seq_conf

            best_sequences.append(list(best_sequence))

        to_repair['best_order'] = best_sequences
        return to_repair
    
    
    def get_pairwise_confidences(self):
        """
        Calculates the pairwise confidences for all activity combinations in the event log.

        Only events whose 'equivalency_flag' is False are used. The confidence of
        (a, b) is the number of times b is the next row after a within the same
        case, divided by the total number of occurrences of a. Rows of one case
        are assumed to be stored contiguously and in order.

        Returns:
            pairwise_confidences (dict): Mapping of (activity, other_activity) to confidence
        """
        print("Calculating pairwise confidences...")
        correct_events = self.df[~self.df['equivalency_flag']].reset_index(drop=True)
        n_events = correct_events.shape[0]
        activity_column = correct_events['activity']
        activities = activity_column.unique()

        # mark the row that closes each case; such a row has no successor in its case
        last_in_case = correct_events.reset_index().groupby('id').last()['index'].to_numpy()
        is_last = np.zeros(n_events, dtype=bool)
        is_last[last_in_case] = True

        occurences = {}
        for activity in tqdm(activities):
            positions = np.flatnonzero((activity_column == activity).to_numpy())
            # The check applies to the current event, not to its successor:
            # a transition is only valid when the current event does not end
            # its case, otherwise the next row belongs to another case.
            successors = positions[~is_last[positions]] + 1
            successors = successors[successors < n_events]
            occurences[activity] = activity_column.iloc[successors].value_counts().to_dict()

        # total number of occurrences per activity, computed once
        activity_counts = activity_column.value_counts().to_dict()

        pairwise_confidences = {}
        for activity in activities:
            followers = occurences[activity]
            for other_activity in activities:
                if other_activity in followers:
                    conf = followers[other_activity] / activity_counts[activity]
                else:
                    conf = 0
                pairwise_confidences[(activity, other_activity)] = conf
        return pairwise_confidences
    
    
    def construct_networks(self):
        print("Constructing networks...")
        self.generator = self.define_generator(n_classes = self.n_classes, latent_dim = self.latent_dim)
        self.discriminator = self.define_discriminator(n_classes = self.n_classes)
        self.gan = self.define_gan()
        

    def define_generator(self, n_classes, latent_dim):
        """
        Builds the generator model.

        Parameters:
            n_classes (int): size of the last axis of the activity input and output
            latent_dim (int): size of the last axis of the latent input

        Returns:
            generator (keras.Model): takes [input_sequences, input_max_duration,
                input_latent] and returns [sequences, durations]
        """
        sequences_in = Input(shape=(None, n_classes), name='input_sequences')
        max_duration_in = Input(shape=(None, 1), name='input_max_duration')
        latent_in = Input(shape=(None, latent_dim), name='input_latent')
        generator_inputs = [sequences_in, max_duration_in, latent_in]

        # all three inputs are joined and passed through a stack of three LSTMs
        hidden = Concatenate()(generator_inputs)
        hidden = LSTM(1000, return_sequences=True, dropout=0.25)(hidden)
        hidden = LSTM(500, return_sequences=True, dropout=0.25)(hidden)
        hidden = LSTM(100, return_sequences=True)(hidden)

        # two heads on top of the shared stack: softmax over the classes per
        # step and a single sigmoid value per step
        sequences_out = TimeDistributed(Dense(n_classes, activation='softmax'))(hidden)
        durations_out = LSTM(1, return_sequences=True, activation='sigmoid')(hidden)

        return Model(inputs=generator_inputs, outputs=[sequences_out, durations_out])

    
    def define_discriminator(self, n_classes):
        """
        Builds and compiles the discriminator model.

        Parameters:
            n_classes (int): size of the last axis of the activity input

        Returns:
            discriminator (keras.Model): takes [input_sequ, input_dur] and returns
                one sigmoid value per sample; compiled with binary cross-entropy,
                Adam and the accuracy metric
        """
        sequences_in = Input(shape=(None, n_classes), name='input_sequ')
        durations_in = Input(shape=(None, 1), name='input_dur')
        discriminator_inputs = [sequences_in, durations_in]

        features = Concatenate()(discriminator_inputs)
        features = GaussianNoise(0.2)(features)
        features = LSTM(1000, return_sequences=True)(features)
        # the second LSTM returns only its last step
        features = LSTM(500, return_sequences=False)(features)

        # two dense layers, each followed by a LeakyReLU activation
        for units in (100, 50):
            features = Dense(units)(features)
            features = LeakyReLU(alpha=0.2)(features)
        validity = Dense(1, activation='sigmoid')(features)

        model = Model(inputs=discriminator_inputs, outputs=validity)
        model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(lr=0.0002, beta_1=0.5),
            metrics=['accuracy'],
        )
        return model

    
    def define_gan(self):
        """
        Builds and compiles the combined model that chains generator and discriminator.

        self.discriminator is marked as non-trainable before it is applied to
        the generator's outputs.

        Returns:
            gan (keras.Model): maps the generator's inputs to the discriminator's
                output; compiled with binary cross-entropy and Adam
        """
        self.discriminator.trainable = False
        generator = self.generator
        validity = self.discriminator(generator.output)
        combined = Model(generator.input, validity)
        combined.compile(
            loss='binary_crossentropy',
            optimizer=Adam(lr=0.0002, beta_1=0.5),
        )
        return combined
    
    def get_real_samples(self, n_samples, data):
        activities = data[0]
        durations = data[1]
        indices = np.random.randint(0, activities.shape[0], n_samples)
        random_samples = [activities[indices], durations[indices]]
        y = np.ones((n_samples, 1)) #label as real data
        return random_samples, y

    def get_generator_input(self, data, n_samples, timesteps):
        activities = data[0]
        durations = data[1]

        indices = np.random.randint(0, activities.shape[0], n_samples)

        latent = np.random.randn(n_samples, timesteps, self.latent_dim)

        return [activities[indices], durations[indices], latent]

    def generate_fake_samples(self, data, n_samples, timesteps, latent_dim):
        input_generator = get_generator_input(data, n_samples, timesteps, latent_dim)
        generated = generator.predict(input_generator)
        y = np.zeros((n_samples,1))
        return generated, y

    def train_gan(self):
        for key in self.x_train.keys():
            print("Currently at timesteps =", key)
            train(n_epochs=10, n_batch=256, timesteps=key)
    
    def train(self, n_epochs, n_batch, timesteps):

        print("Training the networks...")
        bat_per_epo = int(self.x_train[timesteps][0].shape[0] / n_batch)
        half_batch = int(n_batch / 2)

        d_loss_real = []
        d_loss_fake = []
        g_loss = []

        for i in range(n_epochs):
            for j in range(bat_per_epo):

                # get randomly selected 'real' samples
                disc_input_real, y_real = self.get_real_samples(half_batch, self.y_train[timesteps])
                d_loss1, _ = self.discriminator.train_on_batch(disc_input_real, y_real)
                d_loss_real.append(d_loss1)


                # get randomly selected 'fake' samples
                disc_input_fake, y_fake = self.generate_fake_samples(self.x_train[timesteps], half_batch, timesteps, 100)
                d_loss2, _ = self.discriminator.train_on_batch(disc_input_fake, y_fake)
                d_loss_fake.append(d_loss2)


                x_gan = self.get_generator_input(self.x_train[timesteps], n_batch, timesteps)
                y_gan = np.ones((n_batch, 1))
                gan_loss = self.gan.train_on_batch(x_gan, y_gan)
                g_loss.append(gan_loss)

                print('>%d, %d/%d, d1=%.3f, d2=%.3f g=%.3f' %(i+1, j+1, bat_per_epo, d_loss1, d_loss2, gan_loss))

                file = open(self.output_path + '/logging/metric_log.txt', 'a')
                file.write('>%d, %d/%d, d1=%.3f, d2=%.3f g=%.3f' %(i+1, j+1, bat_per_epo, d_loss1, d_loss2, gan_loss))
                file.write('\n')
                file.close()

                if j % 250 == 0:    
                    self.generator.save(self.output_path + '/model_files/epoch_' + str(i).zfill(4) + '_batch_' + str(j).zfill(4) + '.h5')
                    self.evaluate_validation_set(timesteps)


    def three_d_categorical_accuracy(self, y_true, y_pred):
        # max values along the second axis
        indices_true = np.argmax(y_true[0], axis=2)
        indices_pred = np.argmax(y_pred[0], axis=2)

        # Check for equality along individual elements
        equality = indices_true == indices_pred
        hits = [x.all() for x in equality]

        cat_acc = sum(hits) / y_true[0].shape[0]
        return cat_acc

    def evaluate_validation_set(self, timesteps):
        latent = np.random.randn(self.x_val[timesteps][0].shape[0], timesteps, self.latent_dim)
        y_pred = self.generator.predict([self.x_val[timesteps][0], self.x_val[timesteps][1], latent])
        y_true = self.y_val[timesteps]
        cat_acc = self.three_d_categorical_accuracy(y_true, y_pred)
        mae = abs(y_pred[1] - y_true[1]).mean()
        print('\n\n*******************')
        print('\n\nCategorical accuracy:', cat_acc)
        print('Mean absolute error:', mae)
        print('\n\n')

        
    def add_group_shift(self, row):
        if ((row['equivalency_flag'] & row['equivalency_shift'] | ~row['equivalency_flag'] & row['equivalency_shift']) | row['case_shift']):
            self.group += 1
        return self.group

    
    def repair_timestamps(self):
        """
        Estimates the time differences of affected events on which basis the corrupted timestamp can be repaired.

        Reads self.reordered_log, self.df, self.preprocessor (encoder and
        scaler) and self.generator. Consecutive flagged events of one case are
        grouped; for every group the generator predicts one duration per event,
        which is added to the event's timestamp and stored in the new column
        'predicted_timestamp'. Events that are not flagged keep their timestamp
        in that column.

        The result is stored in self.repaired_log and written to
        'repaired_log.csv' inside self.output_path.
        """
        import os  # local import: only needed to build the output file path

        reordered_log = self.reordered_log.copy().sort_values(by=['id', 'timestamp'])
        # Default for every row; rows of affected events are overwritten below.
        reordered_log['predicted_timestamp'] = reordered_log['timestamp']

        x = reordered_log.copy()
        # sorting keeps the old labels, so address the first row by its label
        first_row = x.index[0]
        # indicate change of equivalency_flag
        x['equivalency_shift'] = x['equivalency_flag'].shift() != x['equivalency_flag']
        x.loc[first_row, 'equivalency_shift'] = x.loc[first_row, 'equivalency_flag']
        # indicate change of case id
        x['case_shift'] = x['id'].shift() != x['id']
        x.loc[first_row, 'case_shift'] = False
        # add sequence group id
        x['group_id'] = x.apply(self.add_group_shift, axis=1)
        to_be_repaired = x[x.equivalency_flag]

        median_case_duration = self.df.groupby('id').duration.sum().median()

        print("Estimating duration for erroneous events...")

        for group_id in tqdm(to_be_repaired.group_id.unique()):
            current = to_be_repaired[to_be_repaired['group_id'] == group_id]
            n_events = current.shape[0]
            first_timestamp = current.timestamp.iloc[0]

            input_activities = self.preprocessor.encoder.transform(
                current[['activity']]
            ).reshape(1, n_events, -1)

            # later events of the same case limit how much time the group may take
            valid_subsequent_events = self.df[
                (self.df['timestamp'] > first_timestamp)
                & (self.df['id'] == current.id.iloc[0])
            ]
            if valid_subsequent_events.shape[0] == 0:
                max_duration = median_case_duration
            else:
                time_to_next = valid_subsequent_events.timestamp.min() - first_timestamp
                max_duration = time_to_next.total_seconds() / n_events

            # the same scaled maximum duration is fed in for every event of the group
            max_duration_scaled = self.preprocessor.scaler.transform([[max_duration]])
            max_duration_scaled = np.resize(max_duration_scaled, (n_events, 1))
            max_duration_scaled = max_duration_scaled.reshape(1, n_events, -1)

            latent = np.random.randn(1, n_events, self.latent_dim)
            pred_durations = self.generator.predict(
                [input_activities, max_duration_scaled, latent], verbose=0
            )[1]
            # one row per event and a single feature column, matching the
            # layout the scaler is called with in transform above
            pred_seconds = self.preprocessor.scaler.inverse_transform(
                pred_durations.reshape(-1, 1)
            ).reshape(-1)

            # NOTE(review): every offset is added to the event's own (shared)
            # timestamp, the offsets are not accumulated - confirm this is intended.
            reordered_log.loc[current.index, 'predicted_timestamp'] = (
                current['timestamp'] + pd.to_timedelta(pred_seconds, unit='s')
            )

        self.repaired_log = reordered_log
        self.repaired_log.to_csv(os.path.join(self.output_path, 'repaired_log.csv'))

In [None]:
# Run the complete repair pipeline on the example log.
repair = ConfortiGAN(
    path_to_log='../../Daten/real_dataset/real_data_10.csv',
    id_column='id',
    timestamp_column='timestamp',
    activity_column='activity',
    output_path='output/',
)
repair.prepare_data()        # load and split the log via the preprocessor
repair.reorder_events()      # bring the flagged events into the best order
repair.construct_networks()  # build generator, discriminator and combined model
repair.train_gan()           # train once per key of x_train
repair.repair_timestamps()   # predict timestamps and write repaired_log.csv