In [1]:
import os 
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
from gensim.models import Word2Vec
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, PReLU, Input, GaussianNoise
from keras.models import Model
from keras.optimizers import Adam
from collections import deque
from river import utils
from river import stats

In [2]:
# Initial variables for the algorithm
GP_length = 2 # minimum number of events a trace must have to count towards GP
GP = 100 # how many traces with at least GP_length events should be present before the algorithm starts
w_cases = 100 # maximum number of traces present in the sliding window (SW)
w_events = 1000 # maximum number of events present in the sliding window (SW)
alpha = 0.1 # weight of the standard deviation in the anomaly threshold (mean + alpha * std)
dir_datasets = '_Data/Later' # directory whose entries are read one by one as datasets

In [3]:
def create_autoencoder(input_dim, hidden_layers, hidden_size_factor=0.2, noise=None):
    '''
    Create and compile a dense autoencoder (denoising when `noise` is given).

    Parameters
    ----------
    input_dim : int
        Width of the input vector; the output layer has the same width.
    hidden_layers : int
        Number of hidden Dense layers; each one is followed by Dropout(0.5).
    hidden_size_factor : float or list of float, default 0.2
        Hidden layer width as a fraction of `input_dim`. A list supplies one
        factor per hidden layer and is indexed by layer position.
    noise : float or None, default None
        When not None, a GaussianNoise(noise) layer is applied to the input.

    Returns
    -------
    The compiled model (Adam with learning_rate=0.0001 and beta_2=0.99,
    mean squared error loss).
    '''
    # Input layer
    input_ = Input(shape=(input_dim,), name='input')
    x = input_

    # Optional noise layer
    if noise is not None:
        x = GaussianNoise(noise)(x)

    # Hidden layers
    for i in range(hidden_layers):
        factor = hidden_size_factor[i] if isinstance(hidden_size_factor, list) else hidden_size_factor
        # int() truncates towards zero, so a small input_dim would otherwise ask
        # for a Dense layer with 0 units; keep at least one unit per layer.
        units = max(1, int(input_dim * factor))
        x = Dense(units, activation='relu', name=f'hid{i + 1}')(x)
        x = Dropout(0.5)(x)

    # Output layer
    output = Dense(input_dim, activation='tanh', name='output')(x)

    # Build and compile the model
    model = Model(inputs=input_, outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=0.0001, beta_2=0.99),
        loss='mean_squared_error',
    )

    return model

In [4]:
# Function to compute the reconstruction error for a sequence using the autoencoder model
def compute_anomaly_score(autoencoder, sequence):
    '''
    Return the mean squared reconstruction error of `sequence`.

    `autoencoder` is called as `autoencoder(sequence, training=False)` and its
    output is compared element-wise with the input.
    '''
    reconstruction = autoencoder(sequence, training=False)
    residual = sequence - reconstruction
    return np.mean(np.square(residual))

In [5]:
def remove_oldest_event(current_id):
    '''
    Drop the oldest event that is still stored in the window.

    Ids are popped from the left of `case_ids` until one is found that is
    still a key of `cid_to_events`; the first stored event of that case is
    removed and `event_counter` is decremented. A case left without events is
    deleted (together with its last timestamp) unless it is `current_id`.
    '''
    global case_ids, cid_to_events, event_counter

    # Ids of cases that are no longer stored are skipped.
    while True:
        oldest_cid = case_ids.popleft()
        if oldest_cid in cid_to_events:
            break

    remaining_events = np.delete(cid_to_events[oldest_cid], 0, axis=0)
    cid_to_events[oldest_cid] = remaining_events

    # The case currently being extended is kept even when it is empty.
    if oldest_cid != current_id and len(remaining_events) == 0:
        del cid_to_events[oldest_cid]
        del cid_to_last_timestamp[oldest_cid]

    event_counter -= 1

def save_new_event(cid, event_data, event_timestamp, after_GP: bool):
    '''
    Store a new event in the sliding window and return the updated trace.

    Parameters
    ----------
    cid
        Case id the event belongs to.
    event_data
        Feature values of the event (one row of the trace).
    event_timestamp : str
        Timestamp of the event, formatted as '%Y-%m-%d %H:%M:%S'.
    after_GP : bool
        When False the window limits are not enforced.

    Returns
    -------
    numpy.ndarray
        All events currently stored for `cid`, including the new one.

    Notes
    -----
    When `after_GP` is True:
    * a new case arriving while `w_cases` cases are stored evicts the case
      whose last event is the oldest;
    * if `event_counter` has reached `w_events`, the oldest stored event is
      removed through `remove_oldest_event`.
    '''
    global cid_to_events, cid_to_last_timestamp, case_ids, event_counter

    event_counter += 1
    case_ids.append(cid)

    is_known_case = cid in cid_to_events

    # A new case has to respect the limit on the number of cases.
    if not is_known_case and after_GP and len(cid_to_events) >= w_cases and cid_to_last_timestamp:
        timestamp_format = '%Y-%m-%d %H:%M:%S'
        # Dedicated names are used here on purpose: the search must not rebind
        # `cid`, otherwise the new event ends up stored under another case id.
        stalest_cid = min(
            cid_to_last_timestamp,
            key=lambda known_cid: datetime.strptime(cid_to_last_timestamp[known_cid], timestamp_format),
        )
        evicted_events = cid_to_events.pop(stalest_cid)
        del cid_to_last_timestamp[stalest_cid]
        # Keep the bookkeeping in line with what is actually stored: the evicted
        # events no longer count towards w_events and no longer sit in the queue.
        event_counter -= len(evicted_events)
        case_ids = deque(queued_cid for queued_cid in case_ids if queued_cid != stalest_cid)

    # Both known and new cases have to respect the limit on the number of events.
    if after_GP and event_counter >= w_events:
        remove_oldest_event(cid)

    if is_known_case:
        # remove_oldest_event never deletes the current case, so the key still exists.
        sequence = np.vstack([cid_to_events[cid], event_data])
    else:
        sequence = np.array([event_data])

    cid_to_events[cid] = sequence
    cid_to_last_timestamp[cid] = event_timestamp
    return sequence
    

In [6]:
from sklearn.preprocessing import OneHotEncoder

#%% return features of event for encoding

def return_features_events(events):
    '''
    Keep only the feature columns of one event (Series) or many (DataFrame).

    The bookkeeping fields 'case_id', 'event_position', 'timestamp',
    'isAnomaly' and 'anomaly' are dropped. A Series is returned as a one-row
    DataFrame with alphabetically sorted columns; a DataFrame keeps its
    original column order.
    '''
    excluded = ['case_id', 'event_position', 'timestamp', 'isAnomaly', 'anomaly']

    if not isinstance(events, pd.Series):
        keep_mask = ~events.columns.isin(excluded)
        return events.loc[:, keep_mask]

    feature_names = sorted(label for label in events.index if label not in excluded)
    return events[feature_names].to_frame().T

def init_encoder(dataset):
    '''
    Fit a dense one-hot encoder on the feature columns of `dataset`.

    The encoder is fitted on the feature frame itself. Building a frame of
    per-column unique values first would pad the shorter columns with NaN,
    and OneHotEncoder would then learn NaN as an extra category for them.

    Returns the fitted OneHotEncoder (handle_unknown='ignore', dense output).
    '''
    features = return_features_events(dataset)
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoder.fit(features)
    return encoder

def onehot_encode_data(sequence, encoder):
    '''
    One-hot encode every event of `sequence` and return one flat vector.

    Parameters
    ----------
    sequence
        2-D array-like with one row per event and one column per feature.
    encoder
        Fitted encoder exposing `transform`.

    Returns
    -------
    1-D array: the encoded rows concatenated in order.
    '''
    # Use the column names the encoder was fitted with, so the function is not
    # tied to one dataset schema; fall back to the historical names otherwise.
    columns = getattr(encoder, 'feature_names_in_', None)
    if columns is None:
        columns = ['name', 'day', 'user']
    sequence_df = pd.DataFrame(sequence, columns=list(columns))
    embedded_features = encoder.transform(sequence_df)
    return embedded_features.flatten()

from tensorflow import convert_to_tensor
def pre_process_model_inputs(inputs, input_length_model):
    '''
    Cut a flat vector into rows of `input_length_model` values.

    The last row is padded with zeros when it is shorter than
    `input_length_model`. The rows are returned as a float32 tensor.
    '''
    width = input_length_model
    chunk_count = (len(inputs) + width - 1) // width

    chunks = []
    for chunk_index in range(chunk_count):
        start = chunk_index * width
        chunks.append(inputs[start:start + width])

    # Zero-pad the trailing chunk up to the model input width.
    shortfall = width - len(chunks[-1])
    if shortfall > 0:
        chunks[-1] = np.append(chunks[-1], [0] * shortfall)

    return convert_to_tensor(chunks, dtype='float32')


from keras.backend import clear_session
def init_train_model(cid_to_events, encoder, max_trace_length=None):
    '''
    Build the autoencoder and pre-train it on every prefix stored so far.

    Parameters
    ----------
    cid_to_events : dict
        Maps a case id to the array of events stored for that case.
    encoder
        Fitted one-hot encoder; `categories_` determines the width of one
        encoded event.
    max_trace_length : int, optional
        Number of events the model input must be able to hold. When omitted,
        the notebook-level variable `max_length` is used, which therefore has
        to be defined before the call.

    Returns
    -------
    The autoencoder after one `train_on_batch` call per (case, prefix length)
    pair. Prefix lengths are visited in increasing order and, for each length,
    cases in dictionary order. `clear_session()` is called before returning.
    '''
    if max_trace_length is None:
        max_trace_length = max_length
    input_dim = sum(len(categories) for categories in encoder.categories_) * max_trace_length
    model = create_autoencoder(input_dim, hidden_layers=2)

    longest_trace = max((len(events) for events in cid_to_events.values()), default=0)

    for nr_events in range(1, longest_trace + 1):
        for events in cid_to_events.values():
            # Only cases that already contain a prefix of this length take part.
            if len(events) < nr_events:
                continue
            embedding = onehot_encode_data(events[:nr_events], encoder)
            tensor = pre_process_model_inputs(embedding, input_dim)
            model.train_on_batch(tensor, tensor)
    clear_session()
    return model


In [7]:
# Every regular file in dir_datasets is treated as one dataset. Sub-directories
# (for example Jupyter's .ipynb_checkpoints) are skipped because read_csv cannot
# open them; sorting makes the processing order deterministic.
datasets = sorted(
    os.path.join(dir_datasets, entry)
    for entry in os.listdir(dir_datasets)
    if os.path.isfile(os.path.join(dir_datasets, entry))
)
for dataset in datasets:

    file_name = os.path.basename(dataset)
    df_dataset = pd.read_csv(dataset)
    # Parse the timestamps before sorting so the order is chronological whatever
    # the textual format in the file; a stable sort keeps the file order of
    # events that share a timestamp.
    df_dataset['timestamp'] = pd.to_datetime(df_dataset['timestamp'])
    df_dataset = df_dataset.sort_values('timestamp', kind='stable')
    # NOTE: cases that contain missing timestamps are not filtered out here.
    # save_new_event parses timestamps with exactly this string format.
    df_dataset['timestamp'] = df_dataset['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
    df = df_dataset.drop(['isEndTrace'], axis=1)
    encoder = init_encoder(df)
    max_length = df.groupby('case_id').size().max()
    results_filename = 'results_onehot_' + file_name

    # Sliding-window state shared with save_new_event / remove_oldest_event.
    cid_to_events : dict[int, list]= dict()
    cid_to_last_timestamp : dict[int, float] = dict() # timestamp of the most recent event of each case
    case_ids = deque() # case id of every stored event, oldest on the left
    event_counter = 0
    anomalous_cases = set() # cases with at least one event predicted as anomalous

    # One rolling variance of anomaly scores per prefix length.
    sliding_windows_dae = {
        length: utils.Rolling(stats.Var(), window_size=6000)
        for length in range(max_length + 1)
    }

    after_GP = False
    threshold = 1 # initial default value
    for _, row in df.iterrows():
        case_id = row['case_id']
        timestamp = row['timestamp']
        prefix_length = row['event_position']
        event_data = row.drop(['case_id', 'event_position', 'timestamp', 'isAnomaly', 'anomaly']).values

        sequence = save_new_event(case_id, event_data, timestamp, after_GP)

        if not after_GP:
            # Start once GP cases hold at least GP_length events each.
            long_enough = sum(len(events) >= GP_length for events in cid_to_events.values())
            if long_enough >= GP:
                after_GP = True
                autoencoder = init_train_model(cid_to_events, encoder=encoder)

        if after_GP:
            start_time_encoding = time.time()
            # One-hot encode the stored trace and shape it for the model input.
            encoded_sequence = onehot_encode_data(sequence, encoder)
            encoded_sequence = pre_process_model_inputs(encoded_sequence, input_length_model=autoencoder.input.shape[1])

            start_time_scoring = time.time()

            anomaly_score = compute_anomaly_score(autoencoder, encoded_sequence)

            predicted_label = 1 if anomaly_score >= threshold else 0
            # A case stays anomalous once any of its events was flagged.
            predicted_case_label = 1 if case_id in anomalous_cases else predicted_label
            if predicted_label == 1:
                anomalous_cases.add(case_id)

            # Update the score statistics of this prefix length and derive the
            # threshold from them.
            # NOTE(review): `threshold` is a single value shared by all prefix
            # lengths, so the next event is compared against the statistics of
            # this event's prefix length - confirm this is intended.
            as_window = sliding_windows_dae[prefix_length]
            # update() and get() are separate calls so the code does not depend
            # on update() returning the rolling object.
            as_window.update(anomaly_score)
            # Guard against a slightly negative variance caused by rounding.
            std = np.sqrt(max(as_window.get(), 0.0))
            mean = as_window.mean.get()
            threshold = mean + alpha * std

            start_time_label = time.time()
            autoencoder.train_on_batch(encoded_sequence, encoded_sequence)
            end_time_label = time.time()

            encoding_duration = start_time_scoring - start_time_encoding
            scoring_duration = start_time_label - start_time_scoring
            prediction_duration = end_time_label - start_time_label # time spent in train_on_batch

            # Appended per event so partial results survive an interrupted run.
            with open(results_filename, "a+") as csvfile:
                csvfile.write(f"{case_id},{prefix_length},{anomaly_score},{threshold},{predicted_label}, {predicted_case_label}, {encoding_duration},{scoring_duration},{prediction_duration}\n")

            