In [None]:
import os
import time
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, Input, GaussianNoise
from keras.models import Model
from keras.optimizers import Adam

from river import utils, stats, metrics
from gensim.models import Word2Vec

from tqdm import tqdm
from datetime import datetime
from collections import deque

# Read Data 

In [None]:
# Initial variables for the algorithm
GP_length = 2 # minimum number of events a trace must have to count towards the grace period (GP)
GP = 100 # how many traces with at least GP_length events must be present before the algorithm starts
w_cases = 100 # maximum number of traces present in the sliding window (SW)
w_events = 1000 # maximum number of events present in the SW
alpha = 0.1 # weight of the standard deviation in the anomaly threshold (threshold = mean + alpha * std)

# Parameters for Word2Vec encoding
vector_size = 100 # embedding dimension; also determines the autoencoder input size (vector_size * longest trace)
window = 5 # passed to Word2Vec as `window`
min_count = 1 # passed to Word2Vec as `min_count`

# Directory whose files are processed one by one as datasets
dir_datasets = 'data/synthetic'
# dir_datasets = 'data/real'

# Directory that receives one results file per dataset
dir_results = 'results/synthetic'
# dir_results = 'results/real'

# Functions

In [None]:
def create_model(cases, size, window, min_count):
    """Build a Word2Vec model and train it on one sentence per case.

    Args:
        cases: Iterable of cases; each case is an iterable of events and
            each event an iterable of attribute values.
        size: Passed to Word2Vec as ``vector_size``.
        window: Passed to Word2Vec as ``window``.
        min_count: Passed to Word2Vec as ``min_count``.

    Returns:
        The Word2Vec model after ``build_vocab`` and 10 training epochs.
    """
    # A sentence is the concatenation, in order, of the stringified
    # attributes of every event belonging to the case.
    corpus = [
        [str(attribute) for event in case for attribute in event]
        for case in cases
    ]

    w2v = Word2Vec(vector_size=size, window=window, min_count=min_count)
    w2v.build_vocab(corpus)
    w2v.train(corpus, total_examples=len(corpus), epochs=10)
    return w2v

def train_model(model, sequence):
    """Run one incremental Word2Vec training epoch on a single trace prefix.

    Args:
        model: Object exposing ``train(corpus, total_examples, epochs)``
            (the Word2Vec encoder).
        sequence: Iterable of events, each an iterable of attribute values.

    The events are flattened into one sentence. Attributes are converted with
    ``str`` because the vocabulary is built from stringified attributes in
    ``create_model``; raw ints/floats would never match a vocabulary entry.
    """
    sentence = [str(attribute) for event in sequence for attribute in event]
    model.train([sentence], total_examples=1, epochs=1)

In [None]:
def concat_feature_vector(events, model):
    """Embed a trace prefix as the concatenation of its per-event embeddings.

    Each event is represented by the mean of the vectors of its attributes;
    the per-event vectors are then concatenated in order.

    Args:
        events: Iterable of events, each an iterable of attribute values.
        model: Encoder exposing ``model.wv[token]`` (raising ``KeyError`` for
            unknown tokens) and ``model.wv.vector_size``.

    Returns:
        1-D array of length ``model.wv.vector_size * len(events)``.
    """
    dim = model.wv.vector_size  # use the model's own size, not a module global
    event_vectors = []
    for event in events:
        token_vectors = []
        for token in event:
            try:
                # The vocabulary holds stringified attributes (see create_model);
                # an int key would be treated as a positional index instead.
                token_vectors.append(model.wv[str(token)])
            except KeyError as e:
                print("Token not found:", e)
        if token_vectors:
            event_vectors.append(np.mean(token_vectors, axis=0))
        else:
            # No known token: a zero vector keeps the output shape valid
            # (the mean of an empty list would be a NaN scalar).
            event_vectors.append(np.zeros(dim, dtype=np.float32))

    if not event_vectors:
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(event_vectors)

In [None]:
def make_model(nr_features, hidden_layers, hidden_size_factor=0.5, noise=None):
    """Build and compile the DAE (autoencoder) model.

    Args:
        nr_features: Width of the input and of the output layer.
        hidden_layers: Number of hidden Dense layers; each is followed by
            Dropout(0.5).
        hidden_size_factor: Hidden width as a fraction of ``nr_features``;
            either one number for all layers or a list indexed per layer.
        noise: If not None, a GaussianNoise layer with this argument is
            applied directly to the input.

    Returns:
        The model, compiled with Adam and mean squared error loss.
    """
    inputs = Input(shape=(nr_features,), name='input')
    hidden = inputs if noise is None else GaussianNoise(noise)(inputs)

    for layer_idx in range(hidden_layers):
        per_layer = isinstance(hidden_size_factor, list)
        factor = hidden_size_factor[layer_idx] if per_layer else hidden_size_factor
        units = int(nr_features * factor)
        hidden = Dense(units, activation='relu', name=f'hid{layer_idx + 1}')(hidden)
        hidden = Dropout(0.5)(hidden)

    outputs = Dense(nr_features, activation='tanh', name='output')(hidden)

    dae = Model(inputs=inputs, outputs=outputs)
    dae.compile(
        optimizer=Adam(learning_rate=0.0001, beta_2=0.99),
        loss='mean_squared_error',
    )
    return dae

In [None]:
# Function to compute the reconstruction error for a sequence using the autoencoder model
def compute_reconstruction_error(autoencoder, sequence):
    """Return the mean squared difference between a sequence and its reconstruction.

    Args:
        autoencoder: Callable invoked as ``autoencoder(sequence, training=False)``.
        sequence: Encoded input handed to the autoencoder.

    Returns:
        ``np.mean`` of the element-wise squared residuals.
    """
    reconstruction = autoencoder(sequence, training=False)
    squared_residuals = np.square(sequence - reconstruction)
    return np.mean(squared_residuals)

In [None]:
def remove_oldest_event(current_id):
    """Drop the oldest event that is still stored in the sliding window.

    Pops case ids from the left of the global ``case_ids`` deque until one is
    found that is still a key of ``cid_to_events``, removes that case's first
    event and decrements ``event_counter``. If the case is left without
    events it is deleted from ``cid_to_events`` and ``cid_to_last_timestamp``,
    unless it is ``current_id`` (the case currently receiving an event).

    If the deque runs out before such an id is found there is nothing to
    remove: the function returns without touching ``event_counter`` instead
    of raising ``IndexError`` from ``popleft`` on an empty deque.

    Args:
        current_id: Case id that must not be deleted even when it becomes empty.
    """
    global case_ids, cid_to_events, cid_to_last_timestamp, event_counter

    while case_ids:
        oldest_cid = case_ids.popleft()
        if oldest_cid in cid_to_events:
            break
        # Otherwise the id belongs to a case no longer in the window: skip it.
    else:
        # Deque exhausted without finding a stored case.
        return

    remaining = np.delete(cid_to_events[oldest_cid], 0, axis=0)
    cid_to_events[oldest_cid] = remaining
    if len(remaining) == 0 and oldest_cid != current_id:
        del cid_to_events[oldest_cid]
        cid_to_last_timestamp.pop(oldest_cid, None)
    event_counter -= 1

def save_new_event(cid, event_data, event_timestamp, after_GP: bool):
    """Store a new event in the sliding window and return its case's events.

    Works on the module-level window state (``cid_to_events``,
    ``cid_to_last_timestamp``, ``case_ids``, ``event_counter``) and the limits
    ``w_cases`` / ``w_events``. Limits are only enforced when ``after_GP`` is
    true.

    Args:
        cid: Case id of the incoming event.
        event_data: Attribute values of the event (one row).
        event_timestamp: Timestamp string in ``'%Y-%m-%d %H:%M:%S'`` format.
        after_GP: Whether the grace period is over.

    Returns:
        2-D array with all events currently stored for ``cid``, including the
        new one as the last row.
    """
    global cid_to_events, cid_to_last_timestamp, case_ids, event_counter

    ts_format = '%Y-%m-%d %H:%M:%S'  # format of the stored timestamps
    # ts_format = '%Y-%m-%dT%H:%M:%S.%f%z'  # alternative for datasets using this format

    event_counter += 1
    case_ids.append(cid)

    if cid in cid_to_events:
        if event_counter >= w_events and after_GP:
            remove_oldest_event(cid)
        sequence = np.vstack([cid_to_events[cid], event_data])
        cid_to_events[cid] = sequence
    else:
        if len(cid_to_events) >= w_cases and after_GP and cid_to_last_timestamp:
            # Evict the case whose most recent event is the oldest. A separate
            # name is used so the `cid` parameter is not overwritten.
            removing_cid = min(
                cid_to_last_timestamp,
                key=lambda stored_cid: datetime.strptime(cid_to_last_timestamp[stored_cid], ts_format),
            )
            # The evicted events leave the window, so the counter and the
            # deque of event owners must shrink with them.
            event_counter -= len(cid_to_events[removing_cid])
            del cid_to_events[removing_cid]
            del cid_to_last_timestamp[removing_cid]
            surviving_ids = [stored_cid for stored_cid in case_ids if stored_cid != removing_cid]
            case_ids.clear()
            case_ids.extend(surviving_ids)
        if event_counter >= w_events and after_GP:
            remove_oldest_event(cid)
        sequence = np.array([event_data])
        cid_to_events[cid] = sequence

    # Track the timestamp of the case's latest event, for new and existing cases alike.
    cid_to_last_timestamp[cid] = event_timestamp
    return sequence

In [None]:
def pre_process_model_inputs(inputs, input_length_model):
    """Zero-pad an embedding at the end and add a leading batch axis.

    Args:
        inputs: Array whose first dimension is at most ``input_length_model``.
        input_length_model: Target length after padding.

    Returns:
        The padded array with a new axis of size 1 in front.
    """
    missing = input_length_model - inputs.shape[0]
    padded = np.pad(inputs, (0, missing), mode='constant', constant_values=0)
    return padded[np.newaxis, ...]

def init_train_model(cid_to_events, input_dim, encoder):
    """Create the autoencoder and warm it up on every prefix of the stored cases.

    For each prefix length from 1 up to the longest stored case, and for each
    case with at least that many events, the prefix is used to update the
    encoder (``train_model``), embedded, padded to the model's input width and
    fed to one ``train_on_batch`` step with the input as its own target.

    Args:
        cid_to_events: Mapping from case id to that case's events.
        input_dim: Number of input features of the autoencoder.
        encoder: Word2Vec encoder used for (and updated by) the embedding.

    Returns:
        The trained autoencoder built by ``make_model`` with two hidden layers.
    """
    model = make_model(input_dim, hidden_layers=2)

    longest_case = max((len(events) for events in cid_to_events.values()), default=0)

    for prefix_len in range(1, longest_case + 1):
        # Prefixes of this length, in the dict's iteration order.
        prefixes = [
            events[:prefix_len]
            for events in cid_to_events.values()
            if len(events) >= prefix_len
        ]
        for prefix in prefixes:
            train_model(encoder, prefix)
            embedding = concat_feature_vector(prefix, encoder)
            batch = pre_process_model_inputs(embedding, input_length_model=model.input.shape[1])
            model.train_on_batch(batch, batch)

    return model


# Training

In [None]:
# Run the online anomaly-detection pipeline over every dataset file in `dir_datasets`.
# For each dataset: train Word2Vec on all cases, replay the events in timestamp
# order through the sliding window, and - once the grace period is over - score
# each event's trace prefix with the autoencoder, label it, train on it, and
# append one result line to the dataset's results file.

# Columns that describe the event but are not used as event attributes.
meta_columns = ['case_id', 'event_position', 'timestamp', 'isAnomaly', 'anomaly']
rolling_window_size = 6000  # window_size of each per-prefix-length rolling MSE

os.makedirs(dir_results, exist_ok=True)

# Sorted for a reproducible order; directories (e.g. `.ipynb_checkpoints`) and
# hidden files are skipped because they cannot be read as CSV datasets.
datasets = [
    dir_datasets + '/' + name
    for name in sorted(os.listdir(dir_datasets))
    if not name.startswith('.') and os.path.isfile(os.path.join(dir_datasets, name))
]
for dataset in datasets:

    file_name = os.path.basename(dataset)
    df_dataset = pd.read_csv(dataset)
    # Parse before sorting so the order is chronological, and sort stably so
    # events sharing a timestamp keep their order from the file.
    df_dataset['timestamp'] = pd.to_datetime(df_dataset['timestamp'])
    df_dataset = df_dataset.sort_values('timestamp', kind='stable')
    df_dataset['timestamp'] = df_dataset['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

    max_length = int(df_dataset.groupby('case_id').size().max())
    # One list of attribute rows per case; the mask is computed on the group
    # itself so it always matches the group's columns.
    cases = df_dataset.groupby('case_id').apply(
        lambda case_df: case_df.loc[:, ~case_df.columns.isin(meta_columns)].values.tolist()
    )
    word2vec = create_model(cases, vector_size, window, min_count)
    input_dim = vector_size * max_length
    results_filename = f'{dir_results}/results_w2v_{file_name}'

    # Sliding-window state shared (as globals) with save_new_event / remove_oldest_event.
    cid_to_events = {}  # case id -> array of the case's events currently in the window
    cid_to_last_timestamp = {}  # case id -> timestamp string kept for the case
    case_ids = deque()  # case id of every stored event, oldest on the left
    event_counter = 0
    anomalous_cases = set()  # case ids that received a positive prediction

    # One rolling MSE per prefix length.
    sliding_windows_dae = {
        i: metrics.Rolling(metrics.MSE(), window_size=rolling_window_size)
        for i in range(max_length + 1)
    }

    after_GP = False
    threshold = 1 # initial default value

    for _, row in tqdm(df_dataset.iterrows(), total=len(df_dataset), desc=f"Processing {dataset}"):

        case_id = row['case_id']
        timestamp = row['timestamp']
        prefix_length = row['event_position']
        anomaly_type = row['anomaly']
        y_true = row['isAnomaly']
        event_data = row.drop(meta_columns).values

        sequence = save_new_event(case_id, event_data, timestamp, after_GP)

        if not after_GP:
            # The grace period ends once GP cases hold at least GP_length events.
            long_enough = sum(len(events) >= GP_length for events in cid_to_events.values())
            if long_enough >= GP:
                after_GP = True
                autoencoder = init_train_model(cid_to_events, input_dim, encoder=word2vec)

        if after_GP:
            start_time_encoding = time.time()
            # Encode the sequence using Word2Vec, then update Word2Vec with it
            encoded_sequence = concat_feature_vector(sequence, word2vec)
            encoded_sequence = pre_process_model_inputs(encoded_sequence, input_length_model=autoencoder.input.shape[1])
            train_model(word2vec, sequence)
            start_time_scoring = time.time()

            # Anomaly score: mean squared reconstruction error of the prefix.
            reconstructed_sequence = autoencoder(encoded_sequence, training=False)
            anomaly_score = np.mean(np.square(encoded_sequence - reconstructed_sequence))

            as_window = sliding_windows_dae[prefix_length]

            flat_encoded_sequence = encoded_sequence.flatten()
            flat_reconstructed_sequence = reconstructed_sequence.numpy().flatten()

            as_window.update(y_true=flat_encoded_sequence, y_pred=flat_reconstructed_sequence, sample_weight=1.0)

            # Threshold derived from the rolling metric of this prefix length.
            mean = np.mean(as_window.get())
            std = np.std(as_window.get())
            threshold = mean + alpha * std

            # A case stays anomalous once any of its prefixes was flagged.
            if case_id in anomalous_cases:
                predicted_label = 1
            else:
                predicted_label = 1 if anomaly_score >= threshold else 0

            # Saving the anomalous cases
            if predicted_label == 1:
                anomalous_cases.add(case_id)

            start_time_label = time.time()

            autoencoder.train_on_batch(encoded_sequence, encoded_sequence)
            end_time_label = time.time()
            encoding_duration = start_time_scoring - start_time_encoding
            scoring_duration = start_time_label - start_time_scoring
            prediction_duration = end_time_label - start_time_label

            with open(results_filename, "a+") as csvfile:
                csvfile.write(f"{case_id},{prefix_length},{anomaly_score},{threshold},{predicted_label},{y_true},{anomaly_type},{encoding_duration},{scoring_duration},{prediction_duration}\n")