In [1]:
import json
import os
import numpy as np
import pandas as pd

def read_traces(log_path):
    '''
    Load a trace log stored as JSON and return the parsed content.

    Parameters
    ----------
    log_path : str or path-like
        Location of the JSON trace file.

    Returns
    -------
    object
        Whatever the JSON document decodes to. According to the original
        note the expected layout is
        data = [ [event, timestamp], [], [],......,[] ]
        but the structure is not validated here.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    '''
    # JSON text is UTF-8 by specification; do not rely on the platform's
    # default encoding, which differs between operating systems.
    with open(log_path, 'r', encoding='utf-8') as f:
        return json.load(f)

### LSTM

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Convert subtraces into sequences
def create_sequences(subtraces, sequence_length):
    """
    Cut every subtrace into overlapping windows and stack them in one array.

    A window of ``sequence_length`` consecutive items is slid over each
    subtrace with a stride of one. Subtraces shorter than the window
    contribute no windows at all.

    Parameters
    ----------
    subtraces : iterable of sliceable sequences
        The traces to cut into windows.
    sequence_length : int
        Number of consecutive items per window.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the list of all windows, in input order.
    """
    windows = [
        trace[start:start + sequence_length]
        for trace in subtraces
        for start in range(len(trace) - sequence_length + 1)
    ]
    return np.array(windows)



In [None]:
### get the data

# Placeholder inputs: each `[...]` below is a literal list holding only the
# Ellipsis object, so it must be replaced with real data before the following
# cells can run.
# NOTE(review): the later cells index the resulting sequences with three
# dimensions, so each subtrace presumably has to be a sequence of numeric
# feature rows (timesteps x features) - confirm against the real data source.
normal_subtraces = [...]  # List of subtraces representing normal behavior
anomalies_subtraces = [...]  # List of subtraces representing anomalous behavior

In [None]:
# Train an LSTM next-step predictor on normal behaviour only, then score a
# mixed test set (held-out normal + all anomalous sequences) by how badly each
# sequence's last step is predicted.

RANDOM_SEED = 42  # makes the shuffles below repeatable across kernel restarts
rng = np.random.default_rng(RANDOM_SEED)


def _as_feature_rows(subtrace):
    """Return a subtrace as a float array of shape (timesteps, features)."""
    rows = np.asarray(subtrace, dtype=float)
    # A flat list of values is treated as a single-feature trace.
    return rows.reshape(-1, 1) if rows.ndim == 1 else rows


# --- Normalisation ---------------------------------------------------------
# Fit ONE scaler, on the normal data only, and reuse it for the anomalous data.
# A second scaler fitted on the anomalies would squeeze them into the same
# [0, 1] range as the normal data and hide the deviations we want to detect.
# MinMaxScaler only accepts 2D input, so it is fitted on all normal rows
# stacked together and then applied to every subtrace separately.
normal_rows = [_as_feature_rows(subtrace) for subtrace in normal_subtraces]
anomalies_rows = [_as_feature_rows(subtrace) for subtrace in anomalies_subtraces]

scaler_normal = MinMaxScaler(feature_range=(0, 1))
scaler_normal.fit(np.vstack(normal_rows))
scaler_anomalies = scaler_normal  # alias kept so the original name stays defined

normal_subtraces_normalized = [scaler_normal.transform(rows) for rows in normal_rows]
anomalies_subtraces_normalized = [scaler_anomalies.transform(rows) for rows in anomalies_rows]

# --- Windowing -------------------------------------------------------------
sequence_length = 10  # Adjust based on your data

sequences_normal = create_sequences(normal_subtraces_normalized, sequence_length)
sequences_anomalies = create_sequences(anomalies_subtraces_normalized, sequence_length)

# create_sequences returns a non-3D array when no subtrace is long enough;
# fail early with a clear message instead of an obscure indexing error later.
for _group_name, _group_sequences in (("normal", sequences_normal),
                                      ("anomalies", sequences_anomalies)):
    if _group_sequences.ndim != 3:
        raise ValueError(
            f"No usable {_group_name} sequences: every subtrace must contain at "
            f"least sequence_length={sequence_length} rows with the same number of features."
        )

# Split 'normal' sequences into training and testing sets
train_size_normal = int(len(sequences_normal) * 0.8)
train_normal, test_normal = sequences_normal[0:train_size_normal], sequences_normal[train_size_normal:]

# All 'anomalies' sequences go to the testing set (no training on anomalies)
test_anomalies = sequences_anomalies

# --- Inputs / targets ------------------------------------------------------
# The model sees the first (sequence_length - 1) steps and predicts the last.
X_train, y_train = train_normal[:, :-1], train_normal[:, -1]

# Concatenate 'normal' and 'anomalies' testing sets
X_test = np.concatenate((test_normal[:, :-1], test_anomalies[:, :-1]))
y_test = np.concatenate((test_normal[:, -1], test_anomalies[:, -1]))

# Ground-truth class of every test sequence (0 = normal, 1 = anomaly). These
# must be shuffled together with X_test / y_test to stay aligned.
y_test_labels = np.concatenate((
    np.zeros(len(test_normal), dtype=int),
    np.ones(len(test_anomalies), dtype=int),
))

# Shuffle the training set
shuffle_indices_train = rng.permutation(len(X_train))
X_train, y_train = X_train[shuffle_indices_train], y_train[shuffle_indices_train]

# Shuffle the testing set (inputs, targets and labels with the same order)
shuffle_indices_test = rng.permutation(len(X_test))
X_test, y_test = X_test[shuffle_indices_test], y_test[shuffle_indices_test]
y_test_labels = y_test_labels[shuffle_indices_test]

# X_train / X_test are already 3D [samples, timesteps, features]; no reshape needed.

# --- Model -----------------------------------------------------------------
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(X_train.shape[2]))  # one output per feature of the predicted step
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model on 'normal' subtraces
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=2)

# --- Scoring ---------------------------------------------------------------
# Predict the last step of every test sequence (normal and anomalous alike)
anomalies_predictions = model.predict(X_test)

# Overall error across the whole mixed test set (a single scalar)
mse_anomalies = mean_squared_error(y_test, anomalies_predictions)
print(f"Mean Squared Error on Anomalies: {mse_anomalies}")

# Per-sequence prediction error: this, not the scalar above, is what has to be
# compared with the threshold in order to flag individual sequences.
sample_errors = np.mean(np.square(y_test - anomalies_predictions), axis=1)

# Set a threshold for anomaly detection based on the prediction error
threshold = 0.1  # Adjust based on your data
anomalies_detected = np.where(sample_errors > threshold)[0]
print("Anomalies Detected Indices:", anomalies_detected)

#### Evaluation

_Classify Instances:_

- Each test sequence's prediction error (mean squared error between the predicted and the actual final step) is compared with the specified threshold and converted to a binary label (0 for normal, 1 for anomaly).

_Classification Report:_

- The classification_report function from scikit-learn is used to generate precision, recall, and F1-score for both classes (normal and anomaly).

_ROC AUC Score:_

- The ROC AUC score is calculated using the roc_auc_score function.

_Plot ROC Curve:_

- The ROC curve is plotted using the roc_curve function and visualized using Matplotlib.


In [None]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Evaluate the detector as a binary classifier (0 = normal, 1 = anomaly).
#
# y_test holds the regression targets (the true last step of each sequence),
# not class labels, so the labels and the per-sequence anomaly score are
# derived here from what the training cell left in scope.

# Ground-truth labels: the test set was built as [normal..., anomalies...] and
# then reordered with shuffle_indices_test, so apply the same reordering.
true_labels = np.concatenate((
    np.zeros(len(test_normal), dtype=int),
    np.ones(len(test_anomalies), dtype=int),
))[shuffle_indices_test]

# Anomaly score: mean squared prediction error of each individual sequence.
sample_errors = np.mean(np.square(y_test - anomalies_predictions), axis=1)

# Binary decision per sequence, using the threshold chosen earlier.
predictions = (sample_errors > threshold).astype(int)

# Evaluate the model
print("Classification Report:")
print(classification_report(true_labels, predictions,
                            labels=[0, 1], target_names=["normal", "anomaly"]))

# Calculate and print the ROC AUC score (threshold-free, uses the raw scores)
roc_auc = roc_auc_score(true_labels, sample_errors)
print(f"ROC AUC Score: {roc_auc}")

# Plot ROC curve
fpr, tpr, _ = roc_curve(true_labels, sample_errors)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance level
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
plt.legend(loc='lower right')
plt.show()

### One Class SVM