# TSFEDL Models

In [1]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from libraries.utils import *





## Load data

In [2]:
# ############ configuration - trace ################
# ############################################


CODE = 'theft_protection'       ### application (code)       ###  'theft_protection', 'mamba2', 'lora_ducy'
BEHAVIOUR_FAULTY = 'faulty_data'            ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'            ### normal, faulty_data
THREAD = 'single'           ### single, multi
VER = 3                     ### format of data collection

base_dir = '../../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print(normalbase_path)
print(faultybase_path)


################# configuration - diag ################
IS_VAR_WINDOW = False             ### True: varibale window size, False: fixed window size; wether to use variable window size or not

#####################################################


ref_samples_basepath = os.path.join(normalbase_path, 'diag_refsamples')
ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
diag_subseq_basepath = os.path.join(faultybase_path, 'diag_subseq')
subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')


print('ref_samples_path:\n', ref_samples_basepath)
print('ref_var_samples_path:\n', ref_var_samples_basepath)
print('diag_subseq_path:\n', diag_subseq_basepath)

######### get paths #######################
ref_samples_path = [os.path.join(ref_samples_basepath, x) for x in os.listdir(ref_samples_basepath)]
ref_var_samples_path = [os.path.join(ref_var_samples_basepath, x) for x in os.listdir(ref_var_samples_basepath)]   

train_varlist_path = os.listdir(normalbase_path)
train_varlist_path = [os.path.join(normalbase_path, x) for x in train_varlist_path if 'varlist' in x]

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

test_subseq_path = [os.path.join(diag_subseq_basepath, x) for x in os.listdir(diag_subseq_basepath)]
test_labels_path = [os.path.join(subseq_label_basepath, x) for x in os.listdir(subseq_label_basepath)]

# ### remove.Ds_store from all lists
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]
ref_samples_path = [x for x in ref_samples_path if '.DS_Store' not in x]
ref_var_samples_path = [x for x in ref_var_samples_path if '.DS_Store' not in x]
test_subseq_path = [x for x in test_subseq_path if '.DS_Store' not in x if '.json' in x]
test_labels_path = [x for x in test_labels_path if '.DS_Store' not in x]


varlist_path.sort()

# print(paths_log)
# print(paths_traces)
# print(varlist_path)
# print(paths_label)

if IS_VAR_WINDOW:
    train_data_path = ref_var_samples_path
else:
    train_data_path = ref_samples_path

test_data_path = test_subseq_path

print('train_data:\n', train_data_path)
print(len(train_data_path))
print('test_data:\n', test_data_path)
print(len(test_data_path))
print('test_labels:\n', test_labels_path)



../../trace_data/theft_protection/single_thread/version_3/normal
../../trace_data/theft_protection/single_thread/version_3/faulty_data
ref_samples_path:
 ../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples
ref_var_samples_path:
 ../../trace_data/theft_protection/single_thread/version_3/normal/diag_var_refsamples
diag_subseq_path:
 ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq
train_data:
 ['../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/379.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/396.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/115.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/400.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/142.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsam

In [3]:
############# check varlist is consistent ############
############# only for version 3 ######################

if VER == 3 or VER == 4:
    check_con, _ = is_consistent([train_varlist_path[0]]+ varlist_path) ### compare with train varlist

    if check_con != False:
        to_number = read_json(varlist_path[0])
        from_number = mapint2var(to_number)
    else:
        ### load normal varlist
        print('loading normal varlist')
        to_number = read_json(train_varlist_path[0])
        from_number = mapint2var(to_number)



varlist 1 is consistent with varlist 0
varlist 2 is consistent with varlist 0
varlist 3 is consistent with varlist 0


In [4]:
to_number = read_json(train_varlist_path[0])
from_number = mapint2var(to_number)

In [5]:
# #### key finder ####
# from_number[44]

In [6]:
############ Get variable list ######################
sorted_keys = list(from_number.keys())
sorted_keys.sort()
var_list = [from_number[key] for key in sorted_keys]   ### get the variable list
# print(var_list)

## Prepare Training Data

In [None]:
'''
TODO:
1. check the code for feature extraction in Approach 1
2. split the ref_samples in seq of 50 events (sliding interval of 1)
3. prepare y_train i.e. the expected output of the seq
4. train the model
'''

In [157]:
### load all the reference samples (fixed window size)
### the ref_samples: list of list of events, list of intervals for the subseq of size 500
ref_samples = []
for ref_sample_path in train_data_path:
    ref_trace = read_traces(ref_sample_path)
    ref_samples.append(ref_trace)
    # print(ref_trace)
    # break

In [158]:
np.array(ref_samples).shape

(438, 2, 500)

In [159]:
### make subseq of 50 events, with sliding interval of 1
WINDOW = 50
sub_seq_events = []
sub_seq_intervals = []
X_train = []
Y_train = []
for ref_trace in ref_samples:
    # print(ref_trace)
    ref_events = ref_trace[0]
    ref_intervals = ref_trace[1]
    # print(len(ref_events))
    # print(len(ref_intervals))
    ### we take one less event for training because we need last event as label
    for i in range(len(ref_events)-WINDOW): 
        # print(i)
        # print(ref_events[i])
        sub_seq_events.append(ref_events[i:i+WINDOW])
        sub_seq_intervals.append(ref_intervals[i:i+WINDOW])

        ### both events and intervals are taken as input
        # _x_train = [ref_events[i:i+WINDOW], ref_intervals[i:i+WINDOW]]
        # _x_train = np.array(_x_train)
        # _x_train = np.transpose(_x_train)
        # _y_train = np.array([ref_events[i+WINDOW], ref_intervals[i+WINDOW]])

        ### only events are taken as input
        _x_train = [ref_events[i:i+WINDOW]]
        _y_train = [ref_events[i+WINDOW]]

        X_train.append(_x_train)
        Y_train.append(_y_train)


    # break

In [160]:
print(np.array(X_train).shape)

(197100, 1, 50)


In [170]:
### preprocess training data
X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[2], X_train.shape[1])
Y_train = Y_train.reshape(Y_train.shape[0], Y_train.shape[1],1)

### shuffle the data
X_train, Y_train = shuffle(X_train, Y_train, random_state=0)

### split the data in train, validation and test sets
X_train, x_test, Y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)
# X_val, X_test, Y_val, Y_test = train_test_split(X_val, Y_val, test_size=0.5, random_state=0)

print(X_train.shape)
print(Y_train.shape)

# print(X_val.shape)
# print(Y_val.shape)

print(x_test.shape)
print(y_test.shape)


(100915, 50, 1)
(100915, 1, 1)
(25229, 50, 1)
(25229, 1, 1)


## Build and Train DL Models

In [171]:
import tensorflow as tf
import TSFEDL.models_keras as tsfedl

In [172]:
#### build model ####
input = tf.keras.Input(shape=(50,1))
model = tsfedl.HuangMeiLing(input_tensor=input, include_top=False)
x = model.output
x = tf.keras.layers.LSTM(units=20)(x)
### Add the top module
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(50)(x)
# x = tf.keras.layers.Dense(10)(x)
x = tf.keras.layers.Dense(1)(x)
out = tf.keras.layers.Reshape([1, 1])(x)

### create new model
forecaster = tf.keras.Model(inputs=input, outputs=out, name="forecaster")

# print(model.summary())
print(forecaster.summary())


None


In [173]:
### train
forecaster.compile(loss='mae', optimizer='adam', metrics=['mae', 'mse'])

# history = forecaster.fit(X_train, Y_train, epochs=100, batch_size=32, validation_split=0.2)
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)]
history = forecaster.fit(X_train, Y_train, epochs=100, batch_size=64, validation_split=0.2, callbacks=callbacks)

Epoch 1/100
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 1.6697 - mae: 1.6697 - mse: 7.1534 - val_loss: 0.0860 - val_mae: 0.0860 - val_mse: 0.1780
Epoch 2/100
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0746 - mae: 0.0746 - mse: 0.1678 - val_loss: 0.0606 - val_mae: 0.0606 - val_mse: 0.1766
Epoch 3/100
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0644 - mae: 0.0644 - mse: 0.1733 - val_loss: 0.0547 - val_mae: 0.0547 - val_mse: 0.1779
Epoch 4/100
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0629 - mae: 0.0629 - mse: 0.1775 - val_loss: 0.0852 - val_mae: 0.0852 - val_mse: 0.1768
Epoch 5/100
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0605 - mae: 0.0605 - mse: 0.1629 - val_loss: 0.0891 - val_mae: 0.0891 - val_mse: 0.1826
Epoch 6/100
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [197]:
history
### save the model
forecaster.save('./trained_models/forecaster_events.h5')
forecaster.save_weights('./trained_models/forecaster_events.weights.h5')




In [184]:
# print(x_test.shape)

y_pred = forecaster.predict(x_test)

print(y_pred.shape)
print(y_test.shape)

[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 884us/step
(25229, 1, 1)
(25229, 1, 1)


In [185]:
for i in range(10):
    print(y_test[i], y_pred[i])

[[7]] [[7.0197396]]
[[9]] [[9.029033]]
[[15]] [[15.028571]]
[[7]] [[7.019776]]
[[6]] [[6.018703]]
[[7]] [[7.0197744]]
[[9]] [[9.029016]]
[[8]] [[8.021746]]
[[15]] [[15.028571]]
[[6]] [[6.018719]]


In [None]:
### testing
correct = []
incorrect = []
for yt,yp in zip(y_test, y_pred):
    # print(yt.shape)
    # print(yp.shape)
    # print(yt, yp)
    yt = yt.reshape(1,)
    yp = yp.reshape(1,)

    # print(yt, yp)

    if np.abs(yt-yp) < 1:
        # print('correct prediction')
        correct.append(x)
    else:
        # print('incorrect prediction')
        incorrect.append(x)
        # print(yt, yp)

    
    # break

print('correct:', len(correct))
print('incorrect:', len(incorrect))

### accuracy
accuracy = len(correct)/len(y_test)
print('accuracy:', accuracy)

correct: 24927
incorrect: 302
accuracy: 0.9880296484204685
f1_score: 0.9880296484204685


In [None]:
incorrect