In [None]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt

############ configuration ################
############################################

CODE = 'mamba2'                     ### application (code): theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data'    ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'         ### normal, faulty_data
THREAD = 'single'                   ### single, multi
VER = 3                             ### format of data collection

base_dir = '../../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print(normalbase_path)
print(faultybase_path)


def _clean_paths(paths):
    """Return `paths` sorted, without entries whose path contains '.DS_Store'."""
    return sorted(p for p in paths if '.DS_Store' not in p)


######### training (normal behaviour) paths #######################
### os.listdir() returns entries in arbitrary order, so the lists are sorted here:
### later cells number the reference samples in iteration order and pick
### train_varlist_path[0], both of which must be reproducible between runs.
train_base_path = os.path.join(normalbase_path, 'train_data')
train_data_path = _clean_paths(
    os.path.join(train_base_path, x) for x in os.listdir(train_base_path)
)
train_varlist_path = _clean_paths(
    os.path.join(normalbase_path, x) for x in os.listdir(normalbase_path) if 'varlist' in x
)

######### test (faulty behaviour) paths ###########################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

### remove .DS_Store entries and sort every list
paths_log = _clean_paths(paths_log)
paths_traces = _clean_paths(paths_traces)
varlist_path = _clean_paths(varlist_path)
paths_label = _clean_paths(paths_label)

test_data_path = paths_traces
test_label_path = paths_label

print(train_data_path)
print(test_data_path)
print(test_label_path)


In [None]:
############# pick the varlist used to decode events ############
############# applies to data format versions 3 and 4 ###########

if VER in (3, 4):
    ### compare the training varlist against every test varlist
    check_con, _ = is_consistent([train_varlist_path[0]] + varlist_path)

    if check_con != False:
        selected_varlist = varlist_path[0]
    else:
        ### fall back to the varlist recorded with the normal (training) data
        print('loading normal varlist')
        selected_varlist = train_varlist_path[0]

    to_number = read_json(selected_varlist)
    from_number = mapint2var(to_number)

In [None]:
############ Build the variable list ######################
### values of from_number, ordered by ascending key
sorted_keys = sorted(from_number.keys())
var_list = [from_number[k] for k in sorted_keys]

## Generate reference subsequences (fixed window of 500 events)

In [None]:
SAMPLE_LEN = 500   ### number of consecutive trace entries covered by one window

### make sure the output folder for this window size exists
ref_samples_path = os.path.join(normalbase_path, f'diag_refsamples{SAMPLE_LEN}')
if not os.path.exists(ref_samples_path):
    os.makedirs(ref_samples_path)
    print(f'Folder does not exist. Creating folder {ref_samples_path}')
else:
    print('Folder exists')

### logic for creating reference samples
### - slide a window of SAMPLE_LEN trace entries over every training trace (step of 1)
### - turn each window into (events, intervals); the first interval is the time
###   between the 1st and 2nd entry, so both lists hold SAMPLE_LEN-1 items and the
###   last entry of the window only contributes its timestamp
### - keep only unique samples (uniqueness is based on the event sequence, intervals are ignored)
### - save every unique sample as '<ind_count>.json' in ref_samples_path
ind_count = 0           ### running window index across all files (used as file name)
unique_samples = []     ### unique (events, intervals) samples across all files
### event sequences already stored, kept as tuples in a set so the duplicate check
### is a hash lookup instead of a scan over every stored sample.
### assumes the event identifiers are hashable (e.g. ints/strings) -- TODO confirm
seen_event_seqs = set()
for train_data in train_data_path:
    print(train_data)
    trace_data = read_traces(train_data)

    for i in range(0, len(trace_data)-SAMPLE_LEN+1):
        window = trace_data[i:i+SAMPLE_LEN]

        ### entry[0] is the event, entry[1] its timestamp
        events = [entry[0] for entry in window[:-1]]
        intervals = [nxt[1] - cur[1] for cur, nxt in zip(window, window[1:])]
        ref_sample = (events, intervals)

        event_key = tuple(events)
        if event_key in seen_event_seqs:
            print(ind_count, 'duplicate')
        else:
            seen_event_seqs.add(event_key)
            unique_samples.append(ref_sample)
            ref_samples_name = os.path.join(ref_samples_path, str(ind_count)+'.json')
            save_json(ref_sample, ref_samples_name)
            print(f'Saved {ref_samples_name}')

        ind_count += 1

        print('---------------------------------\n')
    

In [None]:
# Planning notes for the reference-sample generation.
# NOTE(review): the fixed-window cell above already appears to cover these items
# (events and intervals stored as two lists, duplicate event sequences skipped,
# JSON output) -- confirm and drop this cell if it is stale.
'''
TODO:
- instead of storing the trace as it is, store the variable and difference of two consecutive TS
- store it as two lists instead of single list of tuples 
- logic for only unique samples
- store the reference samples as json files
'''



## Generate reference subsequences (variable window size)

In [None]:
MIN_WINDOW = 10          ### smallest window size (in trace entries)
MAX_WINDOW = 500         ### upper bound for the window size
SLIDING_INTERVAL = 50    ### step between consecutive window sizes (each window still advances one entry at a time)
### NOTE(review): with these values the window sizes are 10, 60, ..., 460, so
### MAX_WINDOW itself is never generated -- confirm this is intended.

### make sure the output folder exists
ref_samples_path = os.path.join(normalbase_path, 'diag_var_refsamples')
if not os.path.exists(ref_samples_path):
    os.makedirs(ref_samples_path)
    print(f'Folder does not exist. Creating folder {ref_samples_path}')
else:
    print('Folder exists')

ind_count = 0                 ### running window index across all files and window sizes (used as file name)
unique_samples = []           ### unique (events, intervals) samples across all files
map_len = defaultdict(list)   ### window size -> indices of the unique samples stored for that size
### event sequences already stored, kept as tuples in a set so the duplicate check
### is a hash lookup instead of a scan over every stored sample.
### assumes the event identifiers are hashable (e.g. ints/strings) -- TODO confirm
seen_event_seqs = set()
for train_data in train_data_path:
    print(train_data)
    trace_data = read_traces(train_data)
    print(len(trace_data))

    for sample_len in range(MIN_WINDOW, MAX_WINDOW+1, SLIDING_INTERVAL):
        print(f'Window size: {sample_len}')
        for i in range(0, len(trace_data)-sample_len+1):
            window = trace_data[i:i+sample_len]

            ### entry[0] is the event, entry[1] its timestamp. The first interval is
            ### the first timestamp minus itself, every following one is the time
            ### since the previous entry, so both lists hold sample_len items.
            ### NOTE(review): the fixed-window cell uses the revised format without
            ### the leading zero interval -- confirm which format the consumer expects.
            events = [entry[0] for entry in window]
            timestamps = [entry[1] for entry in window]
            intervals = [cur - prev for prev, cur in zip(timestamps[:1] + timestamps[:-1], timestamps)]
            ref_sample = (events, intervals)

            event_key = tuple(events)
            if event_key in seen_event_seqs:
                print(ind_count, 'duplicate')
            else:
                seen_event_seqs.add(event_key)
                unique_samples.append(ref_sample)
                map_len[sample_len].append(ind_count)
                ref_samples_name = os.path.join(ref_samples_path, str(ind_count)+'.json')
                save_json(ref_sample, ref_samples_name)
                print(f'Saved {ref_samples_name}')

            ind_count += 1

            print('---------------------------------\n')

    ### rewrite the index after every training file so partial runs keep a usable map
    save_json(map_len, os.path.join(ref_samples_path, 'map_len.json'))

In [None]:
# Display the (events, intervals) tuple left in `ref_sample` by the most recently run generation loop.
ref_sample

In [None]:
### report, per window size, how many unique reference samples were stored and their indices
total = 0
for window_len, sample_ids in map_len.items():
    n_samples = len(sample_ids)
    print(window_len, n_samples)
    print(sample_ids)
    total += n_samples
