In [1]:
# LSTM ID ALONE MODEL
# This model is used to predict the next ID of a sequence of IDs
# The model is trained on the ID alone

In [1]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
from libraries.utils import get_paths, read_traces, read_json, mapint2var, is_consistent

In [None]:
# Configuration
CODE = 'theft_protection'               ### application (code) theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data'        ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'             ### normal, faulty_data
THREAD = 'single'                       ### single, multi
VER = 3                                 ### format of data collection

base_dir = './trace_data'              ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print("Normal base path:", normalbase_path)
print("Faulty base path:", faultybase_path)

Normal base path: ./trace_data/lora_ducy/single_thread/version_3/normal
Faulty base path: ./trace_data/lora_ducy/single_thread/version_3/faulty_data


In [3]:
train_base_path = os.path.join(normalbase_path, 'train_data')
print("Train base path:", train_base_path)

print("Current working directory:", os.getcwd())
train_data_path = [os.path.join(train_base_path, x) for x in os.listdir(train_base_path)]
train_varlist_path = [os.path.join(normalbase_path, x) for x in os.listdir(normalbase_path) if 'varlist' in x]

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

train_data_path = [x for x in train_data_path if '.DS_Store' not in x]
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
paths_log = [x for x in paths_log if '.DS_Store' not in x]
paths_traces = [x for x in paths_traces if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]

paths_log.sort()
paths_traces.sort()
varlist_path.sort()
paths_label.sort()

test_data_path = paths_traces
test_label_path = paths_label

Train base path: ./trace_data/lora_ducy/single_thread/version_3/normal\train_data
Current working directory: c:\Uni Bremen\Job\Comnets\Anomaly Detection\Anomaly_Detection


In [4]:
train_data_path

['./trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial1.json',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial10',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial11',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial12',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial2.json',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial3',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial5',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial6',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial7',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial8',
 './trace_data/lora_ducy/single_thread/version_3/normal\\train_data\\trace_trial9']

In [5]:
# Check consistency
if VER == 3:
    check_con, _ = is_consistent([train_varlist_path[0]] + varlist_path)
    if check_con:
        to_number = read_json(varlist_path[0])
        from_number = mapint2var(to_number)
    else:
        to_number = read_json(train_varlist_path[0])
        from_number = mapint2var(to_number)

sorted_keys = list(from_number.keys())
sorted_keys.sort()
var_list = [from_number[key] for key in sorted_keys]

varlist 1 is consistent with varlist 0
varlist 2 is consistent with varlist 0
varlist 3 is consistent with varlist 0
varlist 4 is consistent with varlist 0
varlist 5 is consistent with varlist 0
varlist 6 is consistent with varlist 0


In [6]:
# Load training data
def load_data(file_paths):
    data = []
    for file in file_paths:
        traces = read_traces(file)
        if isinstance(traces, list):
            id_sequence = [int(trace[0]) for trace in traces]
            data.append(id_sequence)
    return data

train_data = load_data(train_data_path)
print(train_data)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 51, 52, 53, 54, 50, 51, 52, 53, 54, 50, 55, 54, 56, 57, 58, 57, 59, 60, 56, 0, 38, 23, 28, 0, 57, 59, 60, 56, 0, 77, 78, 38, 0, 38, 79, 74, 80, 75, 81, 40, 82, 23, 83, 28, 0, 57, 59, 60, 56, 0, 77, 78, 38, 0, 38, 79, 30, 74, 75, 23, 83, 28, 0, 57, 59, 56, 0, 77, 78, 38, 0, 38, 79, 30, 74, 80, 75, 81, 40, 82, 23, 83, 28, 0, 57, 58, 57, 59, 56, 0, 77, 78, 38, 0, 38, 79, 30, 74, 75, 23, 83, 28, 0, 57, 58, 57, 59, 56, 0, 77, 78, 38, 0, 38, 79, 30, 74, 75, 23, 83, 28, 0, 57, 58, 57, 59, 56, 0, 61, 62, 63, 38, 64, 65, 66, 67, 34, 68, 69, 70, 71, 72, 73, 22, 74, 75, 61, 70, 71, 72, 73, 22, 74, 75, 61, 70, 71, 72, 73, 22, 74, 75, 61, 70, 71, 72, 73, 22, 74, 75, 61, 70, 71, 72, 73, 22, 74, 75, 61, 70, 71, 72, 73, 22, 74, 75, 61, 70, 71, 72, 73, 22, 74, 75, 61, 70, 71, 72, 73, 22, 74, 75, 61

In [7]:
for idx, td in enumerate(train_data):
    td_array = np.array(td)  
    print(f"Dataset {idx + 1}: shape = {td_array.shape}")

Dataset 1: shape = (1026,)
Dataset 2: shape = (1435,)
Dataset 3: shape = (1068,)
Dataset 4: shape = (1469,)
Dataset 5: shape = (1086,)
Dataset 6: shape = (1209,)
Dataset 7: shape = (1246,)
Dataset 8: shape = (1327,)
Dataset 9: shape = (1277,)
Dataset 10: shape = (1894,)
Dataset 11: shape = (1325,)


In [8]:
from sklearn.model_selection import train_test_split

# Prepare LSTM training data
sequence_length = 10       # Already tuned parameter value
X_train, y_train = [], []
for single_file_data in train_data:
    for i in range(len(single_file_data) - sequence_length):
        X_train.append(single_file_data[i:i + sequence_length])
        y_train.append(single_file_data[i + sequence_length])

X_train, y_train = np.array(X_train), np.array(y_train)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [9]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

X_train_new = X_train.reshape(-1, X_train.shape[-1])
X_val_new = X_val.reshape(-1, X_val.shape[-1])

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_new)
X_val_scaled = scaler.transform(X_val_new)

X_train = X_train_scaled.reshape(X_train.shape)
X_val = X_val_scaled.reshape(X_val.shape)


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

import psutil

# Define LSTM model
# Layers 128, 64 and 32 are chosen by parameter tuning
model = Sequential([
    LSTM(128, activation='relu', return_sequences=True, input_shape=(sequence_length, 1), kernel_regularizer=l2(0.001)),
    Dropout(0.1),
    LSTM(64, activation='relu', return_sequences=True, kernel_regularizer=l2(0.001)),
    Dropout(0.1),
    LSTM(32, activation='relu', return_sequences=False, kernel_regularizer=l2(0.001)),
    Dense(1, activation='linear')
])


  super().__init__(**kwargs)


In [12]:
model.compile(optimizer=Adam(),
              loss='mse',
              metrics=['mae'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

# Finding RAM usage
ram_usage = psutil.Process().memory_info().rss / (1024 ** 2)
print(f"Total RAM usage: {ram_usage:.2f} MB")


Epoch 1/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 67.5386 - mae: 4.0953 - val_loss: 51.4404 - val_mae: 3.2180
Epoch 2/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 62.1383 - mae: 3.8954 - val_loss: 52.1990 - val_mae: 3.1146
Epoch 3/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 58.4016 - mae: 3.6914 - val_loss: 64.9277 - val_mae: 4.1894
Epoch 4/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 72.3323 - mae: 4.2762 - val_loss: 51.4217 - val_mae: 3.3236
Epoch 5/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 56.4837 - mae: 3.6769 - val_loss: 58.6157 - val_mae: 3.6097
Epoch 6/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 63.9986 - mae: 3.8838 - val_loss: 52.6896 - val_mae: 3.3482
Epoch 7/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [13]:
# Calculating the MAE and Accuracy
from sklearn.metrics import mean_absolute_error

pred = model.predict(X_val)
mae = mean_absolute_error(y_val, pred)
print(f"Mean Absolute Error (MAE): {mae}")

correct = []
incorrect = []
pred = np.round(pred).astype(int)
for i in range(len(y_val)):
    yt_event = y_val[i]
    pred_event = pred[i]

    yt_event = yt_event.reshape(1,)
    pred_event = pred_event.reshape(1,)

    if np.abs(yt_event - pred_event) < 1:
        correct.append(y_val[i])
    else:
        incorrect.append(y_val[i])


accuracy = len(correct) / len(y_val)
print(f"Accuracy: {accuracy}")

[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
Mean Absolute Error (MAE): 2.526855230331421
Accuracy: 0.3861802876183795


In [14]:
# Saving the Model for future use
model_path = './trained_models'
if not os.path.exists(model_path):
    os.makedirs(model_path)
model.save(f'{model_path}/lstm_id_alone_model_lora_ducy.keras')

In [None]:
# Saving the scaler
import joblib

scaler_path = './scalers'
if not os.path.exists(scaler_path):
    os.makedirs(scaler_path)
joblib.dump(scaler, f'{scaler_path}/scaler_lstm_id_lora_ducy.pkl')

['./scalers/scaler_lstm_id_lora_ducy.pkl']

: 