In [1]:
# LSTM ID TIMESTAMP MODEL
# To predict the next (ID,Timestamp) of a given timestamp

In [1]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
from libraries.utils import get_paths, read_traces, read_json, mapint2var, is_consistent

In [2]:
# Configuration
CODE = 'lora_ducy'               ### application (code) theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data'        ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'             ### normal, faulty_data
THREAD = 'single'                       ### single, multi
VER = 3                                 ### format of data collection

base_dir = './trace_data'              ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print("Normal base path:", normalbase_path)
print("Faulty base path:", faultybase_path)

Normal base path: ./trace_data/lora_ducy/single_thread/version_3/normal
Faulty base path: ./trace_data/lora_ducy/single_thread/version_3/faulty_data


In [3]:
train_base_path = os.path.join(normalbase_path, 'train_data')
train_data_path = [os.path.join(train_base_path, x) for x in os.listdir(train_base_path)]
train_varlist_path = [os.path.join(normalbase_path, x) for x in os.listdir(normalbase_path) if 'varlist' in x]

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

train_data_path = [x for x in train_data_path if '.DS_Store' not in x]
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
paths_log = [x for x in paths_log if '.DS_Store' not in x]
paths_traces = [x for x in paths_traces if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]

paths_log.sort()
paths_traces.sort()
varlist_path.sort()
paths_label.sort()

test_data_path = paths_traces
test_label_path = paths_label

In [4]:
# Check consistency
if VER == 3:
    check_con, _ = is_consistent([train_varlist_path[0]] + varlist_path)
    if check_con:
        to_number = read_json(varlist_path[0])
        from_number = mapint2var(to_number)
    else:
        to_number = read_json(train_varlist_path[0])
        from_number = mapint2var(to_number)

sorted_keys = list(from_number.keys())
sorted_keys.sort()
var_list = [from_number[key] for key in sorted_keys]

varlist 1 is consistent with varlist 0
varlist 2 is consistent with varlist 0
varlist 3 is consistent with varlist 0
varlist 4 is consistent with varlist 0
varlist 5 is consistent with varlist 0
varlist 6 is consistent with varlist 0


In [5]:
# Load training data
def load_data(file_paths):
    data = []
    for file in file_paths:
        traces = read_traces(file)
        if isinstance(traces, list):
            data.append(traces)
    return data

train_data = load_data(train_data_path)
print(train_data)

[[[0, 44], [1, 47], [2, 52], [3, 333], [4, 542], [5, 549], [6, 553], [7, 555], [8, 557], [9, 559], [10, 561], [11, 562], [12, 564], [13, 566], [14, 568], [15, 570], [16, 572], [17, 573], [18, 575], [19, 577], [20, 579], [21, 581], [22, 583], [23, 585], [24, 587], [25, 589], [26, 591], [27, 592], [28, 594], [29, 596], [30, 598], [31, 600], [32, 602], [33, 604], [34, 606], [35, 608], [36, 610], [37, 612], [38, 614], [39, 616], [40, 618], [41, 620], [42, 622], [43, 624], [44, 626], [45, 628], [46, 630], [47, 632], [48, 634], [49, 637], [50, 639], [51, 641], [52, 644], [53, 646], [51, 648], [52, 650], [53, 652], [54, 656], [50, 658], [51, 660], [52, 662], [53, 664], [54, 667], [50, 669], [55, 672], [54, 674], [56, 741], [57, 746], [58, 1053], [57, 1907], [59, 2210], [60, 2212], [56, 2214], [0, 2219], [38, 2222], [23, 2224], [28, 2226], [0, 2232], [57, 20755], [59, 21056], [60, 21058], [56, 21061], [0, 21070], [77, 21074], [78, 21077], [38, 21083], [0, 22073], [38, 22075], [79, 22077], [74,

In [6]:
for idx, td in enumerate(train_data):
    td_array = np.array(td)  
    print(f"Dataset {idx + 1}: shape = {td_array.shape}")

Dataset 1: shape = (1026, 2)
Dataset 2: shape = (1435, 2)
Dataset 3: shape = (1068, 2)
Dataset 4: shape = (1469, 2)
Dataset 5: shape = (1086, 2)
Dataset 6: shape = (1209, 2)
Dataset 7: shape = (1246, 2)
Dataset 8: shape = (1327, 2)
Dataset 9: shape = (1277, 2)
Dataset 10: shape = (1894, 2)
Dataset 11: shape = (1325, 2)


In [7]:
from sklearn.model_selection import train_test_split

# Prepare LSTM training data
sequence_length = 10       # Already tuned parameter value
X_train, y_train = [], []
for single_file_data in train_data:
    for i in range(len(single_file_data) - sequence_length):
        X_train.append(single_file_data[i:i + sequence_length])
        y_train.append(single_file_data[i + sequence_length])

X_train, y_train = np.array(X_train), np.array(y_train)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [8]:
y_val.shape

(2851, 2)

In [9]:
y_train.shape

(11401, 2)

In [10]:
# Scaling the data
from sklearn.preprocessing import MinMaxScaler
import numpy as np

X_train_new = X_train.reshape(-1,2)
X_val_new = X_val.reshape(-1, 2)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_new)
X_val_scaled = scaler.transform(X_val_new)

X_train = X_train_scaled.reshape(X_train.shape)
X_val = X_val_scaled.reshape(X_val.shape)


# Scaling output
y_train_new = y_train.reshape(-1, 2)
y_val_new = y_val.reshape(-1, 2)

y_train_scaled = scaler.fit_transform(y_train_new)
y_val_scaled = scaler.transform(y_val_new)

y_train = y_train_scaled.reshape(y_train.shape)
y_val = y_val_scaled.reshape(y_val.shape)

In [11]:
y_train.shape

(11401, 2)

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

import psutil

# Define LSTM model
# Layers 128, 64 and 32 are chosen by parameter tuning
model = Sequential([
    LSTM(128, activation='relu', return_sequences=True, input_shape=(sequence_length, 2), kernel_regularizer=l2(0.001)),
    Dropout(0.1),
    LSTM(64, activation='relu', return_sequences=True, kernel_regularizer=l2(0.001)),
    Dropout(0.1),
    LSTM(32, activation='relu', return_sequences=False, kernel_regularizer=l2(0.001)),
    Dense(2, activation='linear')
])


  super().__init__(**kwargs)


In [15]:
model.compile(optimizer=Adam(),
              loss='mse',
              metrics=['mae'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

# Finding RAM usage
ram_usage = psutil.Process().memory_info().rss / (1024 ** 2)
print(f"Total RAM usage: {ram_usage:.2f} MB")


Epoch 1/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - loss: 0.0136 - mae: 0.0606 - val_loss: 0.0121 - val_mae: 0.0473
Epoch 2/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 0.0128 - mae: 0.0556 - val_loss: 0.0139 - val_mae: 0.0610
Epoch 3/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 0.0116 - mae: 0.0516 - val_loss: 0.0141 - val_mae: 0.0528
Epoch 4/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 0.0127 - mae: 0.0546 - val_loss: 0.0119 - val_mae: 0.0490
Epoch 5/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 0.0128 - mae: 0.0555 - val_loss: 0.0115 - val_mae: 0.0483
Epoch 6/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 0.0121 - mae: 0.0532 - val_loss: 0.0122 - val_mae: 0.0472
Epoch 7/100
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0

In [16]:
# Calculating the MAE and Accuracy
from sklearn.metrics import mean_absolute_error

pred = model.predict(X_val)

mae = mean_absolute_error(y_val, pred)
print(f"Mean Absolute Error (MAE): {mae}")


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step
Mean Absolute Error (MAE): 0.03323578180632207


In [17]:
pred = scaler.inverse_transform(pred)
pred = np.round(pred).astype(int)

y_val = scaler.inverse_transform(y_val)
y_val = np.round(y_val).astype(int)

In [19]:

correct = []
incorrect = []

for i in range(len(y_val)):
    yt_id, yt_timestamp = y_val[i]              # ID and timestamp from y_val
    pred_id, pred_timestamp = pred[i]           # ID and timestamp from pred

    if np.abs(yt_id - pred_id) < 1 and ((np.abs(yt_timestamp - pred_timestamp))/1000) < 5:
        correct.append(y_val[i])
    else:
        incorrect.append(y_val[i])

print("LEN OF CORRECT:", len(correct))
print("Output of y_val:", len(y_val))
accuracy = len(correct) / len(y_val)
print(f"Accuracy: {accuracy}")

LEN OF CORRECT: 267
Output of y_val: 2851
Accuracy: 0.09365135040336724


In [20]:
# Saving the Model for future use
model_path = './trained_models'
if not os.path.exists(model_path):
    os.makedirs(model_path)
model.save(f'{model_path}/lstm_id_timestamp_model_lora_ducy.keras')

In [None]:
# Saving the scaler
import joblib

scaler_path = './scalers'
if not os.path.exists(scaler_path):
    os.makedirs(scaler_path)
joblib.dump(scaler, f'{scaler_path}/scaler_lstm_id_timestamp_lora_ducy.pkl')

['./scalers/scaler_lstm_id_timestamp_lora_ducy.pkl']

: 