In [1]:
import os
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
from gensim.models import Word2Vec
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, PReLU, Input, GaussianNoise
from keras.models import Model
from keras.optimizers import Adam
from sklearn.utils import shuffle
from collections import deque
from river import utils
from river import stats

# Read Data 

In [2]:
# Initial variables for the algorithm (grace period and sliding window).
GP_length = 2 # minimum number of events a trace needs to count towards the grace period
GP = 100 # how many traces with at least GP_length events must be present before the algorithm starts
w_cases = 100 # maximum number of traces present in the sliding window (SW)
w_events = 1000 # event-count limit of the sliding window (SW)
alpha = 0.1 # multiplier of the standard deviation in the anomaly threshold (mean + alpha * std)
dir_datasets = '_Data/Later/Real' # directory whose files are each processed as one dataset

In [3]:
# Parameters for the Word2Vec encoding (forwarded to create_model / Word2Vec)
vector_size = 100 # passed as Word2Vec vector_size; also used as the per-event embedding width
window = 5 # passed as Word2Vec window
min_count = 1 # passed as Word2Vec min_count

# Parameters for the autoencoder
# TODO: find the actual maximum length
# NOTE(review): the assignments below are commented out, yet create_autoencoder()
# still reads encoding_dim, hidden_dim_1 and learning_rate as module-level names.
#encoding_dim = int(vector_size *0.2)
#hidden_dim_1 = int(encoding_dim *0.2) #
#hidden_dim_2 = int(hidden_dim_1 / 2)
#learning_rate = 1e-4

# Functions

In [4]:
def create_model(cases, size, window, min_count):
    """Build and train a Word2Vec model with one sentence per case.

    Every case is flattened, in order, into a single list of string tokens:
    the attribute values of its first event, then those of its second event,
    and so on.

    Args:
        cases: iterable of cases; each case is an iterable of events and each
            event an iterable of attribute values.
        size: forwarded to Word2Vec as ``vector_size``.
        window: forwarded to Word2Vec as ``window``.
        min_count: forwarded to Word2Vec as ``min_count``.

    Returns:
        The Word2Vec model after building its vocabulary and training for
        10 epochs on the flattened cases.
    """
    # One sentence per case; every attribute value becomes a str token.
    corpus = [
        [str(attribute) for event in case for attribute in event]
        for case in cases
    ]

    w2v = Word2Vec(vector_size=size, window=window, min_count=min_count)
    w2v.build_vocab(corpus)
    w2v.train(corpus, total_examples=len(corpus), epochs=10)
    return w2v

def train_model(model, sequence): #array of arrays (events)
    """Continue training the Word2Vec model on one sequence of events.

    The events are flattened, in order, into a single sentence. Every
    attribute is converted with ``str`` so the tokens have the same form as
    the ``str(item)`` tokens create_model builds the vocabulary from.

    Args:
        model: object exposing ``train(corpus, total_examples=..., epochs=...)``
            (the Word2Vec model returned by create_model).
        sequence: iterable of events, each an iterable of attribute values.

    Returns:
        None. The model is updated in place.
    """
    sentence = [str(attribute) for event in sequence for attribute in event]
    # A single sentence, trained for a single epoch.
    model.train([sentence], total_examples=1, epochs=1)

In [5]:
def concat_feature_vector(events, model):
    """Embed a sequence of events as one flat vector of concatenated event embeddings.

    Each event is embedded as the mean of the vectors of its known tokens.
    Tokens are converted with ``str`` before lookup because create_model
    builds the vocabulary from ``str(item)`` tokens. An event without any
    known token contributes a zero vector, so the output length is always
    ``len(events) * model.wv.vector_size``.

    Args:
        events: iterable of events, each an iterable of attribute values.
        model: object whose ``wv`` supports ``wv[token]`` (raising KeyError for
            unknown tokens) and exposes ``wv.vector_size``.

    Returns:
        1-D numpy array holding the event embeddings back to back
        (length 0 when ``events`` is empty).
    """
    dim = model.wv.vector_size
    event_embeddings = []
    for event in events:
        token_vectors = []
        for token in event:
            try:
                token_vectors.append(model.wv[str(token)])
            except KeyError:
                # Unknown tokens are left out of the event mean.
                continue
        if token_vectors:
            event_embeddings.append(np.mean(token_vectors, axis=0))
        else:
            # No known token: keep the slot so the vector length stays aligned.
            # float32 presumably matches the model's vector dtype - TODO confirm.
            event_embeddings.append(np.zeros(dim, dtype=np.float32))

    if not event_embeddings:
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(event_embeddings)

In [6]:
def average_feature_vector(events, model):
    """Embed a sequence of events as the mean of all its token vectors.

    The token vectors of every event are pooled and averaged together.
    Unknown tokens count as zero vectors, so they pull the mean towards zero.
    Tokens are converted with ``str`` before lookup because create_model
    builds the vocabulary from ``str(item)`` tokens.

    Args:
        events: iterable of events, each an iterable of attribute values.
        model: object whose ``wv`` supports ``wv[token]`` (raising KeyError for
            unknown tokens) and exposes ``wv.vector_size``.

    Returns:
        1-D float64 numpy array of length ``model.wv.vector_size``; all zeros
        when there is no token at all.
    """
    dim = model.wv.vector_size
    token_vectors = []
    for event in events:
        for token in event:
            try:
                token_vectors.append(model.wv[str(token)])
            except KeyError:
                token_vectors.append(np.zeros(dim))

    if not token_vectors:
        # Nothing to average: return zeros instead of a NaN vector.
        return np.zeros(dim)
    # Average over all tokens of all events.
    return np.asarray(token_vectors, dtype=np.float64).mean(axis=0)

In [7]:
# Function to create and compile (not train) the Sequential autoencoder model
def create_autoencoder(input_dim=None, encoding_dim=None, hidden_dim_1=None, learning_rate=1e-4):
    """Build and compile a small fully connected autoencoder.

    The layer sizes and the learning rate used to be read from module-level
    names that only exist in commented-out configuration lines, so a call
    raised NameError. They are now keyword parameters whose defaults follow
    that commented-out configuration.

    Args:
        input_dim: width of the input and output layers. Defaults to the
            module-level ``vector_size``.
        encoding_dim: units of the first hidden layer. Defaults to
            ``int(input_dim * 0.2)`` (at least 1).
        hidden_dim_1: units of the second hidden layer. Defaults to
            ``int(encoding_dim * 0.2)`` (at least 1).
        learning_rate: Adam learning rate.

    Returns:
        A compiled ``tf.keras.Sequential`` model (mean squared error loss).
    """
    if input_dim is None:
        input_dim = vector_size
    if encoding_dim is None:
        encoding_dim = max(1, int(input_dim * 0.2))
    if hidden_dim_1 is None:
        hidden_dim_1 = max(1, int(encoding_dim * 0.2))

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim, )),
        tf.keras.layers.Dense(encoding_dim, activation='relu'),
        tf.keras.layers.PReLU(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(hidden_dim_1, activation='relu'),
        tf.keras.layers.PReLU(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(input_dim, activation='tanh')
    ])

    # The EarlyStopping callback that used to be built here was never attached
    # to anything; callers that train with fit() should create their own.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # NOTE(review): 'accuracy' is kept so the values returned by training calls
    # do not change, but it is not a meaningful metric for a reconstruction loss.
    model.compile(metrics=['accuracy'],
                  loss='mean_squared_error',
                  optimizer=optimizer,
                  )
    return model

In [8]:
def make_model(nr_features, hidden_layers, hidden_size_factor=0.5, noise=None):
    '''
    Create and compile a DAE model.

    Architecture: input -> optional GaussianNoise -> ``hidden_layers`` blocks of
    (Dense relu + Dropout 0.5) -> Dense tanh output with ``nr_features`` units.
    Every hidden layer is sized relative to ``nr_features``, not to the
    previous layer.

    Args:
        nr_features: width of the input and output layers.
        hidden_layers: number of hidden Dense layers.
        hidden_size_factor: one factor for all hidden layers, or a list with
            one factor per hidden layer; a hidden layer has
            ``int(nr_features * factor)`` units (at least 1).
        noise: value passed to ``GaussianNoise``; ``None`` disables the layer.

    Returns:
        The compiled model (Adam with learning_rate=0.0001 and beta_2=0.99,
        mean squared error loss).
    '''
    # Input layer
    input_ = Input(shape=(nr_features,), name='input')
    x = input_

    # Noise layer
    if noise is not None:
        x = GaussianNoise(noise)(x)

    # Hidden layers
    for i in range(hidden_layers):
        if isinstance(hidden_size_factor, list):
            factor = hidden_size_factor[i]
        else:
            factor = hidden_size_factor
        # Guard against a zero-width layer when nr_features * factor < 1.
        units = max(1, int(nr_features * factor))
        x = Dense(units, activation='relu', name=f'hid{i + 1}')(x)
        x = Dropout(0.5)(x)

    # Output layer
    output = Dense(nr_features, activation='tanh', name='output')(x)

    # Build and compile with tf.keras so the model and optimizer come from the
    # same Keras implementation as the layers (imported from tensorflow.keras).
    model = tf.keras.Model(inputs=input_, outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001, beta_2=0.99),
        loss='mean_squared_error',
    )

    return model

In [9]:
# Reconstruction error (mean squared error) of a sequence under the autoencoder
def compute_reconstruction_error(autoencoder, sequence):
    """Return the mean squared difference between a sequence and its reconstruction.

    Args:
        autoencoder: callable invoked as ``autoencoder(sequence, training=False)``.
        sequence: the encoded input handed to the autoencoder.

    Returns:
        The mean of the element-wise squared residuals.
    """
    rebuilt = autoencoder(sequence, training=False)
    squared_residual = np.square(sequence - rebuilt)
    return np.mean(squared_residual)

In [10]:
import pytz
def get_timestamp(ts):
    """Parse a timestamp string using the first matching known format.

    Formats are tried in this order:
    ``'%Y-%m-%d %H:%M:%S'`` and then ``'%Y-%m-%dT%H:%M:%S.%f%z'``.

    Args:
        ts: timestamp string.

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: if neither format matches.
    """
    accepted_formats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S.%f%z')
    for fmt in accepted_formats:
        try:
            return datetime.strptime(ts, fmt)
        except ValueError:
            continue
    raise ValueError("No valid format found")

In [11]:
def remove_oldest_event(current_id):
    """Drop the oldest event still held in the sliding window.

    ``case_ids`` holds one entry per saved event, oldest on the left. Entries
    whose case is no longer in ``cid_to_events`` are skipped. The first event
    of the first case found is removed and ``event_counter`` is decremented.
    A case left without events is deleted together with its last-timestamp
    entry, unless it is ``current_id`` (the caller is about to add an event
    to that case).

    If the queue runs out before a live case is found there is nothing to
    remove, and the function returns without touching the window or the
    counter (previously this raised IndexError from ``popleft``).

    Args:
        current_id: id of the case the caller is currently adding an event to.
    """
    global case_ids, cid_to_events, event_counter

    while case_ids:
        oldest_cid = case_ids.popleft()
        if oldest_cid in cid_to_events:
            break
    else:
        # Queue exhausted without meeting a case that is still in the window.
        return

    remaining = np.delete(cid_to_events[oldest_cid], 0, axis=0) # removing the oldest event
    cid_to_events[oldest_cid] = remaining
    if len(remaining) == 0 and oldest_cid != current_id:
        del cid_to_events[oldest_cid]
        cid_to_last_timestamp.pop(oldest_cid, None)
    event_counter -= 1

def save_new_event(cid, event_data, event_timestamp, after_GP: bool):
    """Add one event to the sliding window and return the case's current sequence.

    Window state lives in the module-level ``cid_to_events``,
    ``cid_to_last_timestamp``, ``case_ids`` and ``event_counter``. Once
    ``after_GP`` is true the limits are enforced:

    * a new case arriving while ``w_cases`` cases are present evicts the case
      whose most recent event is the oldest;
    * when ``event_counter`` reaches ``w_events`` the oldest event in the
      window is removed.

    Fixes over the previous version:

    * the eviction loop reused the name ``cid``, so the new event was stored
      under the last case id iterated instead of its own;
    * ``cid_to_last_timestamp`` was only written when a case was created, so
      eviction ignored later events of that case;
    * evicting a case left its events counted in ``event_counter`` and its
      entries queued in ``case_ids``.

    Args:
        cid: case id of the event.
        event_data: attribute values of the event (one row).
        event_timestamp: timestamp string in ``'%Y-%m-%d %H:%M:%S'`` format
            (the format the eviction step parses).
        after_GP: whether the grace period is over, i.e. limits are enforced.

    Returns:
        2-D numpy array with the events currently stored for ``cid``,
        including the new one.
    """
    global cid_to_events, cid_to_last_timestamp, case_ids, event_counter

    event_counter += 1
    case_ids.append(cid)

    if cid in cid_to_events:
        # Known case: only the number of events matters.
        # NOTE(review): the counter already includes the new event, so the
        # window never actually holds w_events events - confirm this is intended.
        if event_counter >= w_events and after_GP:
            remove_oldest_event(cid)
        # Read the stored events after the removal, which may have shortened them.
        sequence = np.vstack([cid_to_events[cid], event_data])
    else:
        # New case: enforce the case limit first, then the event limit.
        if len(cid_to_events) >= w_cases and after_GP:
            # Evict the case with the oldest last event.
            stalest_cid = min(
                cid_to_last_timestamp,
                key=lambda known_cid: datetime.strptime(
                    cid_to_last_timestamp[known_cid], '%Y-%m-%d %H:%M:%S'),
            )
            evicted_events = cid_to_events.pop(stalest_cid)
            del cid_to_last_timestamp[stalest_cid]
            # Keep the counter and the age queue consistent with the window.
            event_counter -= len(evicted_events)
            case_ids = deque(queued for queued in case_ids if queued != stalest_cid)
        if event_counter >= w_events and after_GP:
            remove_oldest_event(cid)
        sequence = np.array([event_data])

    cid_to_events[cid] = sequence
    # Refreshed on every event so eviction really uses the most recent one.
    cid_to_last_timestamp[cid] = event_timestamp
    return sequence

In [12]:
from tensorflow import convert_to_tensor
def pre_process_model_inputs(inputs, input_length_model):
    """Zero-pad a 1-D encoding to the model input length and add a batch axis.

    Args:
        inputs: 1-D numpy array (the encoded sequence).
        input_length_model: number of input features the model expects.

    Returns:
        Array of shape ``(1, input_length_model)``: ``inputs`` followed by zeros.

    Raises:
        ValueError: if ``inputs`` is longer than ``input_length_model``. This
            used to surface as numpy's negative pad-width error; the check
            makes the size mismatch explicit.
    """
    current_length = inputs.shape[0]
    if current_length > input_length_model:
        raise ValueError(
            f"encoded sequence has {current_length} values but the model "
            f"accepts at most {input_length_model}"
        )
    padded_vector = np.pad(inputs, (0, input_length_model - current_length),
                           'constant', constant_values=0)
    return np.expand_dims(padded_vector, axis=0)


from keras.backend import clear_session
from tensorflow import convert_to_tensor
def init_train_model(cid_to_events, input_dim, encoder):
    """Create the autoencoder and warm it up on every prefix collected so far.

    Prefix lengths are visited in increasing order. For each length, every
    case with at least that many events contributes its first ``n`` events,
    in the iteration order of ``cid_to_events``. Each prefix is used to:

    1. continue training the Word2Vec encoder (train_model),
    2. encode the prefix (concat_feature_vector),
    3. pad it to the model input length (pre_process_model_inputs),
    4. run one ``train_on_batch`` step with the input as its own target.

    Args:
        cid_to_events: mapping from case id to that case's sequence of events.
        input_dim: number of input features of the autoencoder.
        encoder: the Word2Vec model used for encoding.

    Returns:
        The autoencoder created by ``make_model(input_dim, hidden_layers=2)``
        after the warm-up steps.
    """
    model = make_model(input_dim, hidden_layers=2)

    longest_case = max((len(events) for events in cid_to_events.values()), default=0)

    for prefix_len in range(1, longest_case + 1):
        # Prefixes of this length, from the cases that are long enough.
        prefixes = [
            events[:prefix_len]
            for events in cid_to_events.values()
            if len(events) >= prefix_len
        ]
        for sequence in prefixes:
            train_model(encoder, sequence)
            embedding = concat_feature_vector(sequence, encoder)
            tensor = pre_process_model_inputs(embedding, input_length_model=model.input.shape[1])
            model.train_on_batch(tensor, tensor)

    return model


# Training

In [13]:
# Streaming experiment: for every dataset file, replay the event log in
# timestamp order, keep a sliding window of recent cases/events, and, once the
# grace period is over, score every incoming prefix with the autoencoder and
# append one result row per event to 'results_w2v_<file name>'.
meta_columns = ['case_id', 'event_position', 'timestamp', 'isAnomaly', 'anomaly']
datasets = [dir_datasets + '/' + dataset for dataset in os.listdir(dir_datasets)]
for dataset in datasets:

    file_name = os.path.basename(dataset)
    df_dataset = pd.read_csv(dataset)
    # Parse first, then sort: ordering the raw strings is only chronological
    # when their text form happens to sort that way. mergesort is stable, so
    # events sharing a timestamp keep their order from the file.
    df_dataset['timestamp'] = pd.to_datetime(df_dataset['timestamp'])
    df_dataset = df_dataset.sort_values('timestamp', kind='mergesort')
    df_dataset['timestamp'] = df_dataset['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
    df_dataset = df_dataset.drop(['isEndTrace'], axis=1)
    max_length = df_dataset.groupby('case_id').size().max()
    # Columns that are event attributes (everything except the meta columns).
    feature_mask = ~df_dataset.columns.isin(meta_columns)
    # NOTE(review): grouping uses the first column, presumably 'case_id' - confirm.
    cases = df_dataset.groupby(df_dataset.columns[0]).apply(lambda x: x.iloc[:, feature_mask].values.tolist())
    word2vec = create_model(cases, vector_size, window, min_count)
    # concat_feature_vector yields vector_size values per event, so the model
    # input must hold the longest case; with input_dim = vector_size any prefix
    # longer than one event cannot be padded to the model input length.
    input_dim = int(vector_size * max_length)
    results_filename = 'results_w2v_' + file_name

    # Sliding-window state, shared with save_new_event / remove_oldest_event
    # through module-level globals.
    cid_to_events = dict() # case id -> array of the events currently kept for it
    cid_to_last_timestamp = dict() # case id -> timestamp stored by save_new_event
    case_ids = deque() # deque of the case ids such that, at each point, the popleft represents the oldest event
    event_counter = 0
    # Set: constant-time membership test and no duplicate entries per case.
    anomalous_cases = set()

    # One rolling variance of anomaly scores per prefix length.
    sliding_windows_dae = {
        i: utils.Rolling(stats.Var(), window_size=6000) for i in range(max_length + 1)
    }

    after_GP = False
    threshold = 1 # initial default value

    # Opened once per dataset instead of once per event; still append mode, so
    # rows of earlier runs are kept.
    with open(results_filename, "a+") as csvfile:
        for _, row in df_dataset.iterrows():

            case_label = row['isAnomaly']
            case_id = row['case_id']
            timestamp = row['timestamp']
            prefix_length = row['event_position']
            anomaly_type = row['anomaly']
            event_data = row.drop(meta_columns).values

            sequence = save_new_event(case_id, event_data, timestamp, after_GP)

            if not after_GP:
                # The grace period ends once GP cases hold at least GP_length events.
                if sum(len(v) >= GP_length for v in cid_to_events.values()) >= GP:
                    after_GP = True
                    autoencoder = init_train_model(cid_to_events, input_dim, encoder=word2vec)

            if after_GP:
                start_time_encoding = time.time()
                # Encode the sequence using Word2Vec
                encoded_sequence = concat_feature_vector(sequence, word2vec)
                encoded_sequence = pre_process_model_inputs(encoded_sequence, input_length_model=autoencoder.input.shape[1])
                train_model(word2vec, sequence)
                start_time_scoring = time.time()

                anomaly_score = compute_reconstruction_error(autoencoder, encoded_sequence)

                # Threshold = rolling mean + alpha * rolling std for this prefix
                # length (the current score is part of the window). update() and
                # get() are separate calls so this does not rely on update()
                # returning the rolling object.
                as_window = sliding_windows_dae[prefix_length]
                as_window.update(anomaly_score)
                std = np.sqrt(as_window.get())
                mean = as_window.mean.get()
                threshold = mean + alpha * std

                # A case flagged once stays flagged for all its later events.
                if case_id in anomalous_cases:
                    predicted_label = 1
                else:
                    predicted_label = 1 if anomaly_score >= threshold else 0

                # Saving the anomalous cases
                if predicted_label == 1:
                    anomalous_cases.add(case_id)

                start_time_label = time.time()

                autoencoder.train_on_batch(encoded_sequence, encoded_sequence)
                end_time_label = time.time()
                encoding_duration = start_time_scoring - start_time_encoding
                scoring_duration = start_time_label - start_time_scoring
                # Despite its name this measures the train_on_batch step above.
                prediction_duration = end_time_label - start_time_label

                csvfile.write(f"{case_id},{prefix_length},{anomaly_score},{threshold},{predicted_label}, {encoding_duration},{scoring_duration},{prediction_duration}\n") #{mean_window},{std_window},

PermissionError: [Errno 13] Permission denied: 'results_w2v_bpic17-0.3-1.csv'

In [None]:
# with open("encoding_times-1.txt", "w") as f:
    # for el in encoding_times:
       # f.write(str(el) + "\n")

In [None]:
# with open("prediction_times_thres-1.txt", "w") as f:
   #  for el in prediction_times:
     #   f.write(str(el) + "\n")