In [1]:
# LSTM with ID and timestamp as separate training model

# Importing necessary libraries
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from utils import get_paths, read_traces, read_json, mapint2var, is_consistent

In [2]:
# Configuration
CODE = 'theft_protection'               ### application (code) theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data'        ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'             ### normal, faulty_data
THREAD = 'single'                       ### single, multi
VER = 3                                 ### format of data collection

base_dir = '../../trace_data'              ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print("Normal base path:", normalbase_path)
print("Faulty base path:", faultybase_path)

Normal base path: ../../trace_data/theft_protection/single_thread/version_3/normal
Faulty base path: ../../trace_data/theft_protection/single_thread/version_3/faulty_data


In [3]:
train_base_path = os.path.join(normalbase_path, 'train_data')
train_data_path = [os.path.join(train_base_path, x) for x in os.listdir(train_base_path)]
train_varlist_path = [os.path.join(normalbase_path, x) for x in os.listdir(normalbase_path) if 'varlist' in x]

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

train_data_path = [x for x in train_data_path if '.DS_Store' not in x]
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
paths_log = [x for x in paths_log if '.DS_Store' not in x]
paths_traces = [x for x in paths_traces if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]

paths_log.sort()
paths_traces.sort()
varlist_path.sort()
paths_label.sort()

test_data_path = paths_traces
test_label_path = paths_label

In [4]:
# Check consistency
if VER == 3:
    check_con, _ = is_consistent([train_varlist_path[0]] + varlist_path)
    if check_con:
        to_number = read_json(varlist_path[0])
        from_number = mapint2var(to_number)
    else:
        to_number = read_json(train_varlist_path[0])
        from_number = mapint2var(to_number)

sorted_keys = list(from_number.keys())
sorted_keys.sort()
var_list = [from_number[key] for key in sorted_keys]

varlist 1 is consistent with varlist 0
varlist 2 is consistent with varlist 0
varlist 3 is consistent with varlist 0


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Load training data ( Here changing for ID alone ignoring timestamp)
def load_data_separate_sequence(file_paths):
    id_data = []
    timestamp_data = []

    for file in file_paths:
        traces = read_traces(file)
        if isinstance(traces, list):
            id_sequence = [int(trace[0]) for trace in traces]
            timestamp_sequence = [int(trace[1]) for trace in traces]

            time_difference = []
            for i in range(1, len(timestamp_sequence)):
                difference = timestamp_sequence[i] - timestamp_sequence[i-1]
                time_difference.append(difference)


            new_id_sequence = id_sequence[1:]
            id_data.append(new_id_sequence)
            timestamp_data.append(time_difference)
            
    return id_data, timestamp_data
            

train_data_id, train_data_timestamp = load_data_separate_sequence(train_data_path)
print("train_data_id:", train_data_id)
print("train_data_timestamp:", train_data_timestamp)
print("Length of train_data_id:", len(train_data_id))
print('shpae of train_data_id:', np.array(train_data_id[1]).shape)
print('shpae of train_data_timestamp:', np.array(train_data_timestamp[1]).shape)



# scaler = MinMaxScaler()
# train_data_flat = []
# for td in train_data:
#     train_data_flat.extend(td)

# train_data_flat = np.array(train_data_flat).reshape(1, -1)
# print(train_data_flat.shape)
# train_data_scaled = [scaler.fit_transform(np.array(td).reshape(1,-1)) for td in train_data]
# print(train_data_scaled)

train_data_id: [[14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 

In [6]:
from sklearn.model_selection import train_test_split

# Prepare LSTM training data for ID and timestap separately

# For ID data
sequence_length = 10       
X_train_id, y_train_id = [], []
for single_file_data in train_data_id:
    for i in range(len(single_file_data) - sequence_length):
        X_train_id.append(single_file_data[i:i + sequence_length])
        y_train_id.append(single_file_data[i + sequence_length])
        # print(single_file_data[i:i + sequence_length], single_file_data[i + sequence_length])

X_train_id, y_train_id = np.array(X_train_id), np.array(y_train_id)
X_train_id, X_val_id, y_train_id, y_val_id = train_test_split(X_train_id, y_train_id, test_size=0.2, random_state=42)




In [7]:
print("X_train_id:", X_train_id.shape)
print("y_train_id:", y_train_id.shape)
# print('X_train_id', X_train_id[0:2])
# print('y_train_id', y_train_id[0])

X_train_id: (1452, 10)
y_train_id: (1452,)


In [8]:
# Prepare LSTM training data for ID and timestap separately

# For timestamp data
X_train_time, y_train_time = [], []
for single_file_data in train_data_timestamp:
    for i in range(len(single_file_data) - sequence_length):
        X_train_time.append(single_file_data[i:i + sequence_length])
        y_train_time.append(single_file_data[i + sequence_length])

X_train_time, y_train_time = np.array(X_train_time), np.array(y_train_time)
X_train_time, X_val_time, y_train_time, y_val_time = train_test_split(X_train_time, y_train_time, test_size=0.2, random_state=42)


In [9]:
X_train_id.shape

(1452, 10)

In [10]:
X_train_time.shape

(1452, 10)

In [11]:
y_train_id.shape

(1452,)

In [12]:
y_train_time.shape

(1452,)

In [13]:
### scale the data

from sklearn.preprocessing import MinMaxScaler
import joblib

scaler_id = MinMaxScaler()
scaler_time = MinMaxScaler()


X_train_id = scaler_id.fit_transform(X_train_id)
X_val_id = scaler_id.transform(X_val_id)

X_train_time = scaler_time.fit_transform(X_train_time)
X_val_time = scaler_time.transform(X_val_time)

### save the scaler
scaler_path = '../trained_minmax_scaler'
name_scaler_id = os.path.join(scaler_path, 'scaler_id.pkl')
name_scaler_time = os.path.join(scaler_path, 'scaler_time.pkl')
if not os.path.exists(scaler_path):
    os.makedirs(scaler_path)

if os.path.exists(name_scaler_id):
    print("Scaler ID already exists")
else:
    joblib.dump(scaler_id, name_scaler_id)
    print("Scaler ID saved")

if os.path.exists(name_scaler_time):
    print("Scaler Time already exists")
else:
    joblib.dump(scaler_time, name_scaler_time)
    print("Scaler Time saved")




Scaler ID already exists
Scaler Time already exists


In [14]:
X_val_time

array([[9.83284169e-04, 1.96463654e-03, 9.82318271e-04, ...,
        1.96656834e-03, 9.82318271e-04, 2.16110020e-02],
       [1.96656834e-03, 2.25933202e-02, 9.95088409e-01, ...,
        1.96656834e-03, 0.00000000e+00, 1.96463654e-03],
       [9.83284169e-04, 9.82318271e-04, 2.25933202e-02, ...,
        1.96656834e-03, 9.82318271e-04, 2.25933202e-02],
       ...,
       [9.83284169e-04, 2.16110020e-02, 9.96070727e-01, ...,
        9.83284169e-04, 9.82318271e-04, 1.96463654e-03],
       [9.97050147e-01, 1.96463654e-03, 9.82318271e-04, ...,
        9.97050147e-01, 1.96463654e-03, 9.82318271e-04],
       [2.16322517e-02, 9.96070727e-01, 9.82318271e-04, ...,
        2.26155359e-02, 9.96070727e-01, 9.82318271e-04]])

## Load Model

In [15]:
import tensorflow as tf

model_id = tf.keras.models.load_model('trained_models/lstm_model_id_minmax.keras')
model_time = tf.keras.models.load_model('trained_models/lstm_model_time_minmax.keras')

## Train Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# Define LSTM model for IDs
def create_lstm_model(sequence_length):
    model = Sequential([
        LSTM(128, activation='relu', return_sequences=True, input_shape=(sequence_length, 1), kernel_regularizer=l2(0.01)),
        Dropout(0.1),
        LSTM(64, activation='relu', return_sequences=True, kernel_regularizer=l2(0.01)),
        Dropout(0.1),
        LSTM(32, activation='relu', return_sequences=False, kernel_regularizer=l2(0.01)),
        Dense(1, activation='linear')
    ])
    model.compile(optimizer=Adam(learning_rate=0.1),
                  loss=tf.keras.losses.Huber(delta=1.0),
                  metrics=['mae'])
    
    return model

model_id = create_lstm_model(sequence_length)
model_time = create_lstm_model(sequence_length)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Training ID alone
history_id = model_id.fit(X_train_id, y_train_id, validation_data=(X_val_id, y_val_id), epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

# Training timestamp alone
history_time = model_time.fit(X_train_time, y_train_time, validation_data=(X_val_time, y_val_time), epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

In [17]:
from anomaly_detection import test_single_id_timestamp, merge_detections, get_correct_detections_id_timestamp


## checking the detections against the ground truth
DIFF_VAL = 5
all_detections = []         # To store detections for each file
y_pred_all = []             # To store the predicted labels
y_true_all = []             # To store the ground truth labels
all_tp = []                 # To store all true positives
all_fp = []                 # To store all false positives
all_fn = []                 # To store all false negatives
all_gt = []                 # To store the ground truth



# Iterating through each test data file and label file
for test_data, test_label in zip(test_data_path, test_label_path):
    
    detection = test_single_id_timestamp(test_data, model_id, model_time, sequence_length)

    all_detections.append((test_data, detection, test_label))
    merge_detection = merge_detections(detection, diff_val=DIFF_VAL)
    
    ground_truth_raw = read_traces(test_label)                                               # read ground truth labels from the label file
    ground_truth = ground_truth_raw['labels']                                                # extract labels from dictionary from ground truth data

    label_trace_name = list(ground_truth.keys())[0]
    ground_truth = ground_truth[label_trace_name]
    correct_pred, rest_pred, y_pred, y_true, false_neg = get_correct_detections_id_timestamp(merge_detection, ground_truth)  # Comparing detected anomaly with ground truth
    y_pred_all.extend(y_pred)          # predicted labels
    y_true_all.extend(y_true)          # actual ground truth labels
    all_tp.append((test_data, correct_pred, test_label))
    all_fp.append((test_data, rest_pred, test_label))
    all_fn.append((test_data, false_neg, test_label))
    all_gt.append((test_data, ground_truth, test_label))

    break

id_data [1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_pred_all = np.array(y_pred_all)
y_true_all = np.array(y_true_all)

# Calculate evaluation metrics
precision = precision_score(y_true_all, y_pred_all)
recall = recall_score(y_true_all, y_pred_all)
f1 = f1_score(y_true_all, y_pred_all)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

conf_matrix = confusion_matrix(y_true_all, y_pred_all)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['Normal', 'Anomaly'])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

In [None]:
model_name = f'trained_models/'

if not os.path.exists(model_name):
    os.makedirs(model_name)

model_name_id = os.path.join(model_name, 'lstm_model_id_minmax.keras')
model_name_time = os.path.join(model_name, 'lstm_model_time_minmax.keras')

if os.path.exists(model_name_id):
    print('model exists')
    raise FileExistsError
else:
    model_id.save(model_name_id)
    print(f'{model_name_id} saved')

if os.path.exists(model_name_time):
    print('model exists')
    raise FileExistsError
else:
    model_time.save(model_name_time)
    print(f'{model_name_time} saved')