In [None]:
################# state transition machine ##################
import json
import os
import numpy as np
import pandas as pd

def read_traces(log_path):
    '''
    Load one trace file from disk and hand back its decoded JSON content.

    log_path: path of the JSON trace file to open
    returns:  the parsed document; the notebook treats it as
              data = [ [event, timestamp], [], [],......,[] ]
    '''
    with open(log_path, 'r') as trace_file:
        return json.loads(trace_file.read())


# Subtrace Generation (faulty, 50)

In [None]:
############ configuration ################
### every path used further down is derived from the values in this cell
############################################

### which recording to work on
version = 2.2                     ### format of data collection
thread_typ = 'single'           ### single, multi
behaviour = 'faulty_data'            ### normal, faulty_data
code = 'theft_protection'       ### application (code)

### how the traces are cut into subtraces
sub_len = 'dynamic'

### folder holding the raw log/trace files
base_dir = '../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'
log_path = base_dir+f'/{code}/{thread_typ}_thread/version_{version}/{behaviour}'

### folder holding the generated subtraces
subtrace_path = f"data-subtraces/version_{version}/{behaviour}/subtraces/{sub_len}/"

print(log_path)

### Get paths to the files

In [None]:

###### get file paths #######

all_files = sorted(os.listdir(log_path))

### sort every directory entry into one of three buckets, judged by its name only
logs, traces, unknown = [], [], []
for entry in all_files:
    if entry.startswith('log'):
        logs.append(entry)
    elif entry.startswith('trace') and '.txt' not in entry:
        traces.append(entry)
    else:
        unknown.append(entry)

######### path to files
paths_log = sorted(os.path.join(log_path, name) for name in logs)
paths_traces = [os.path.join(log_path, name) for name in traces]
print(paths_log)

In [None]:
### display the list of trace file paths collected from log_path
paths_traces

### Generate data samples (size=50)

In [None]:
########## generate raw data from traces ###########

# col_data = []
# for (p,w) in zip(paths_traces, traces):
#     trace = read_traces(p)
#     print(p,w)

#     ### path to save data samples
#     write_path = subtrace_path
#     print(write_path)

#     counter = 0
#     for i in range(0,len(trace),50):
#         if i==0:
#             ### take samples from 0 to 50
#             sample = trace[i:i+51]
#             np.save(write_path+f'{w}_{i}_{i+50}', sample, allow_pickle=False)
#             # print(len(sample))
#         elif len(trace) - i >= 50:
#             ### take samples from 50 to 99
#             sample = trace[i:i+51]
#             np.save(write_path+f'{w}_{i}_{i+50}', sample, allow_pickle=False)
#             # print(len(sample))
#         else:
#             sample = trace[i:]
#             np.save(write_path+f'{w}_{i}_{len(trace)}', sample, allow_pickle=False)
#             # print(len(sample))
#         counter += 1
#         print(counter)

#     # break

# State Transition Labels - instances

In [None]:
### get files from subtraces
### '.DS_Store' is filtered out instead of removed with list.remove(), because
### remove() raises ValueError whenever the folder does not contain that entry.
### The names are sorted so the processing order does not depend on the file system.
all_subtraces = [name for name in sorted(os.listdir(subtrace_path)) if name != '.DS_Store']

In [None]:
### generate label files
###
### Every subtrace is turned into a table with one row per pair of consecutive
### events (a state transition):
###     ind   - position of the first event, offset by the start index of the subtrace
###     s1/s2 - first and second event of the pair
###     ts1/ts2 - their timestamps, cast to int
###     label - initialised to 0

### the output folder and the column layout are the same for every subtrace
label_path = 'state transition/data/unlabelled/'
columns = ['ind', 's1', 's2', 'ts1', 'ts2', 'label']
# isPath = os.path.exists(os.path.dirname(label_path)) ### check if the path exists
# ### create the folder if it does not exist
# if not isPath:
#     os.makedirs(os.path.dirname(label_path))

for sub in all_subtraces:
    sub_path = os.path.join(subtrace_path, sub)
    subtrace = np.load(sub_path)

    ### the (commented-out) generation cell names the files '<trace>_<start>_<end>';
    ### split from the right so underscores inside the trace name cannot shift
    ### the position of the start index
    start_count = int(sub.rsplit('_', 2)[1])

    ### pair every event with its successor
    all_rows = []
    for ind, (event1, event2) in enumerate(zip(subtrace[:-1], subtrace[1:])):
        all_rows.append([start_count + ind, event1[0], event2[0], int(event1[1]), int(event2[1]), 0])

    df_sub = pd.DataFrame(all_rows, columns=columns)
    excel_file_path = label_path + sub.replace('.npy', '.xlsx')

    ############# uncomment to save files
    # df_sub.to_excel(excel_file_path, index=False)



# Clustering - instances and labels

## labels for subtraces (len 50)

In [None]:
### get files from subtraces
### '.DS_Store' is filtered out instead of removed with list.remove(), because
### remove() raises ValueError whenever the folder does not contain that entry.
### The names are sorted so the processing order does not depend on the file system.
all_subtraces = [name for name in sorted(os.listdir(subtrace_path)) if name != '.DS_Store']

## labels for traces

In [None]:
### display the list of trace file paths collected from log_path
paths_traces

### substitute zero

In [None]:
### generate label files (only a single execution interval per instance, zero substituted for the other variables)
###
### For each trace the events are walked in order. Whenever a variable is seen
### again, the time since its previous occurrence (execution interval) is stored
### at the variable's position in exe_list, the instance
### (event index, exe_list, label=0) is recorded and exe_list is reset to zeros.
###
### NOTE(review): the 'last values' cell further down writes its workbook to the
### same path, so whichever of the two cells runs last overwrites the other's output.

### load var_list
_var_list = np.load('var_list.npy', allow_pickle=False)
_var_list=tuple(_var_list)

### output folder (same for every trace); create it so that to_excel() does not
### fail on a fresh checkout
label_path = subtrace_path.replace('/subtraces', '/clustering_instances_labels') + '/trace_labels/'
os.makedirs(label_path, exist_ok=True)

for tr in paths_traces:
    print(tr)

    ### load file
    trace = read_traces(tr)
    print(len(trace), 'events')   ### summary only; printing the whole trace floods the output

    exe_list = np.zeros(len(_var_list))  ### execution interval of each variable for the current instance
    prev_exe = np.zeros(len(_var_list))  ### timestamp of the previous occurrence of each variable
    ### a timestamp of 0 is a legitimate first observation, so "already seen" is
    ### tracked separately rather than inferred from prev_exe == 0
    seen_var = np.zeros(len(_var_list), dtype=bool)

    instances = []  ### list to store the instances
    for ind, event in enumerate(trace):
        var, ts = event[0], int(event[1])
        event_ind = _var_list.index(var)   ### raises ValueError for a variable missing from var_list

        if seen_var[event_ind]:
            ### calculate the execution interval
            exe_list[event_ind] = ts - prev_exe[event_ind]
        else:
            ### first occurrence of the variable: nothing to measure yet
            seen_var[event_ind] = True
        prev_exe[event_ind] = ts

        ### save the instance only if an interval was measured, to avoid instances with all parameters as 0
        if exe_list.any():
            instances.append((ind, tuple(exe_list), 0))     ### format of instance (index, [exe_inter], label)
            exe_list = np.zeros(len(_var_list))

    columns = ['ind', 'exe_inter', 'label']
    df_sub = pd.DataFrame(instances, columns=columns)
    excel_file_path = label_path + os.path.basename(tr) + '.xlsx'

    ### save the instances of this trace
    df_sub.to_excel(excel_file_path, index=False)
    
        


In [None]:
### `tr` is the loop variable left over from the cell above, i.e. the last trace path that was processed
os.path.basename(tr)


### last values

In [None]:
### generate label files (every instance keeps the last known execution interval of each variable)
###
### For each trace the events are walked in order. Whenever a variable is seen
### again, the time since its previous occurrence (execution interval) replaces
### the value at the variable's position in exe_list. exe_list is never reset,
### so from the first measured interval onwards one instance
### (event index, exe_list, label=0) is recorded for every event.
###
### NOTE(review): the 'substitute zero' cell further up writes its workbook to the
### same path, so whichever of the two cells runs last overwrites the other's output.

### load var_list
_var_list = np.load('var_list.npy', allow_pickle=False)
_var_list=tuple(_var_list)

### output folder (same for every trace); create it so that to_excel() does not
### fail on a fresh checkout
label_path = subtrace_path.replace('/subtraces', '/clustering_instances_labels') + '/trace_labels/'
os.makedirs(label_path, exist_ok=True)

for tr in paths_traces:
    print(tr)

    ### load file
    trace = read_traces(tr)
    print(len(trace), 'events')   ### summary only; printing the whole trace floods the output

    exe_list = np.zeros(len(_var_list))  ### latest execution interval of each variable
    prev_exe = np.zeros(len(_var_list))  ### timestamp of the previous occurrence of each variable
    ### a timestamp of 0 is a legitimate first observation, so "already seen" is
    ### tracked separately rather than inferred from prev_exe == 0
    seen_var = np.zeros(len(_var_list), dtype=bool)

    instances = []  ### list to store the instances
    create_instance = False  ### becomes (and stays) True once any interval has been measured
    for ind, event in enumerate(trace):
        var, ts = event[0], int(event[1])
        event_ind = _var_list.index(var)   ### raises ValueError for a variable missing from var_list

        if seen_var[event_ind]:
            ### calculate the execution interval
            exe_list[event_ind] = ts - prev_exe[event_ind]
        else:
            ### first occurrence of the variable: nothing to measure yet
            seen_var[event_ind] = True
        prev_exe[event_ind] = ts

        ### start recording once at least one interval is non-zero, to avoid instances with all parameters as 0
        if not create_instance and exe_list.any():
            create_instance = True

        if create_instance:
            instances.append((ind, tuple(exe_list), 0))     ### format of instance (index, [exe_inter], label)

    columns = ['ind', 'exe_inter', 'label']
    df_sub = pd.DataFrame(instances, columns=columns)
    excel_file_path = label_path + os.path.basename(tr) + '.xlsx'

    ### save the instances of this trace
    df_sub.to_excel(excel_file_path, index=False)
    
        


# Examine Subtraces 

In [None]:
############ configuration ################
### settings for inspecting the fixed-length subtraces
############################################

### which recording to work on
version = 2.2                     ### format of data collection
thread_typ = 'single'           ### single, multi
behaviour = 'faulty_data'            ### normal, faulty_data
code = 'theft_protection'       ### application (code)

### length of the subtraces to inspect
sub_len = 50

### folders with the normal and the anomalous subtraces
base_dir = 'data-subtraces' ### can be replaced with 'csv', 'exe_plot', 'histogram'
anomalies_path = base_dir+f'/version_{version}/{behaviour}/subtraces/{sub_len}/anomalies'
normal_path = base_dir+f'/version_{version}/{behaviour}/subtraces/{sub_len}/normal'

print(normal_path, anomalies_path)

In [None]:
### full paths of the normal subtraces, skipping the '.DS_Store' entry if there is one
normal_files = [
    os.path.join(normal_path, entry)
    for entry in os.listdir(normal_path)
    if entry != '.DS_Store'
]

### full paths of the anomalous subtraces, filtered the same way
anomalies_files = [
    os.path.join(anomalies_path, entry)
    for entry in os.listdir(anomalies_path)
    if entry != '.DS_Store'
]


# Dynamic Labelling

In [None]:
import os
import json
import numpy as np

def read_traces(log_path):
    '''
    Load one trace file from disk and hand back its decoded JSON content.

    log_path: path of the JSON trace file to open
    returns:  the parsed document; the notebook treats it as
              data = [ [event, timestamp], [], [],......,[] ]
    '''
    with open(log_path, 'r') as trace_file:
        return json.loads(trace_file.read())

In [None]:
############ configuration ################
### every path used further down is derived from the values in this cell
############################################

### which recording to work on
version = 2.2                     ### format of data collection
thread_typ = 'single'           ### single, multi
behaviour = 'faulty_data'            ### normal, faulty_data
code = 'theft_protection'       ### application (code)

### how the traces are cut into subtraces
sub_len = 'dynamic'

### folder holding the raw log/trace files
base_dir = '../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'
log_path = base_dir+f'/{code}/{thread_typ}_thread/version_{version}/{behaviour}'

### folder holding the generated subtraces
subtrace_path = f"data-subtraces/version_{version}/{behaviour}/subtraces/{sub_len}/"

print(log_path)

In [None]:
### check if paths exist
### os.path.dirname() strips the last path component; subtrace_path ends with '/',
### so the check is made on the subtrace folder itself
isPath = os.path.exists(os.path.dirname(subtrace_path))
print(isPath)

## Subtraces (based on indices in paper summaries)

### Get file paths

In [None]:
###### get file paths #######

all_files = sorted(os.listdir(log_path))

### sort every directory entry into one of three buckets, judged by its name only
logs, traces, unknown = [], [], []
for entry in all_files:
    if entry.startswith('log'):
        logs.append(entry)
    elif entry.startswith('trace') and '.txt' not in entry:
        traces.append(entry)
    else:
        unknown.append(entry)

######### path to files
paths_log = sorted(os.path.join(log_path, name) for name in logs)
paths_traces = [os.path.join(log_path, name) for name in traces]

In [None]:
### display the list of trace file paths collected from log_path
paths_traces

### Generate subtraces

In [None]:
#### Generate subtrace for test data (based on the labeling index from paper summaries excel sheet)
### one (start, end) slice per trace, matched to paths_traces by position
subtrace_ranges = ((0,5330), (0,3610), (0,6410))   ### trace1-comm, trace2-bit, trace3-sensor

### every trace needs its own range; fail before anything is written instead of
### hitting an IndexError after some of the files have already been saved
if len(paths_traces) > len(subtrace_ranges):
    raise ValueError(
        f'{len(paths_traces)} trace files found but only {len(subtrace_ranges)} subtrace ranges are defined'
    )

### read traces and save to the subtrace folder
for (i, tr) in enumerate(paths_traces):
    print(tr)
    ### write path
    write_path = subtrace_path + os.path.basename(tr)
    ### make sure the target folder exists before opening the file for writing
    write_dir = os.path.dirname(write_path)
    if write_dir:
        os.makedirs(write_dir, exist_ok=True)
    ### load file
    trace = read_traces(tr)
    ### get the subtrace range
    sub_range = subtrace_ranges[i]
    print(sub_range)
    ### save the subtrace in human readable format
    with open(write_path, 'w') as f:
        json.dump(trace[sub_range[0]:sub_range[1]], f)
    

    

### Generate label files (index of anomalies)

In [None]:
### read the excel file
import pandas as pd
import numpy as np
import os
import json

### locations of the raw label sheet and of the generated label files
label_file_path = subtrace_path + 'labels/raw/label_indices.xlsx'
write_path = subtrace_path + 'labels/'

### the column headers of the sheet are used as trace file names
df = pd.read_excel(label_file_path)
trace_file_names = df.columns

### save the labels for each subtrace
file_label = []
for tf in trace_file_names:
    ### drop the empty cells of the column, then convert float to int
    data = df[tf].dropna().values.astype(int)

    ### save the data as human readable file
    write_name = write_path + tf + '_labels.json'
    with open(write_name, 'w') as f:
        json.dump(data.tolist(), f)
    


## Labels for State Transition

In [None]:
'''
The label_indices.xlsx file contains the indices of the variables after which an anomaly occurs. For state transitions, the label is assigned to the transition.
The State Transition method detects the transition that is anomalous.
To evaluate the performance, we check whether the timestamp of the first variable of the transition exists in the ground truth.
'''

## Labels for Thresholding

## Labels for NN