In [None]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt

In [None]:
############ configuration ################
############################################
### Selects which application / behaviour / collection format is analysed and
### derives the directories holding the normal and the faulty traces.

CODE = 'lora_ducy'                  ### application (code): theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data'    ### behaviour folder of the faulty traces
BEHAVIOUR_NORMAL = 'normal'         ### behaviour folder of the normal traces
THREAD = 'single'                   ### single, multi
VER = 3                             ### format of data collection

base_dir = '../../trace_data'       ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print(normalbase_path, faultybase_path, sep='\n')

In [None]:
### Training traces live in <normalbase_path>/train_data; the training varlist
### files sit directly in <normalbase_path>.
train_base_path = os.path.join(normalbase_path, 'train_data')
train_data_path = [os.path.join(train_base_path, entry) for entry in os.listdir(train_base_path)]
train_varlist_path = [
    os.path.join(normalbase_path, entry)
    for entry in os.listdir(normalbase_path)
    if 'varlist' in entry
]

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

### drop every path that mentions .DS_Store, in all six lists
(train_data_path, train_varlist_path, paths_log,
 paths_traces, varlist_path, paths_label) = (
    [path for path in group if '.DS_Store' not in path]
    for group in (train_data_path, train_varlist_path, paths_log,
                  paths_traces, varlist_path, paths_label)
)

### sort the faulty-data lists in place; traces and labels are paired by
### position later on, so they must share the same ordering
for listing in (paths_log, paths_traces, varlist_path, paths_label):
    listing.sort()

### the faulty traces and their labels are the test set
test_data_path = paths_traces
test_label_path = paths_label

print(train_data_path, test_data_path, test_label_path, sep='\n')


In [None]:
### show the varlist paths returned by get_paths for the faulty data (filtered and sorted above)
varlist_path

In [None]:
############# check varlist is consistent ############
############# only for version 3 and 4 ###############
### Compare the first training varlist with the faulty-data varlists and load
### the variable <-> number mapping from whichever file should be trusted.

if VER in (3, 4):
    check_con, _ = is_consistent([train_varlist_path[0]] + varlist_path)

    if check_con == False:
        ### the check reported False -> fall back to the normal (training) varlist
        print('loading normal varlist')
        chosen_varlist = train_varlist_path[0]
    else:
        chosen_varlist = varlist_path[0]

    to_number = read_json(chosen_varlist)
    from_number = mapint2var(to_number)

In [None]:
############ Get variable list ######################
### variable names ordered by their key in from_number
sorted_keys = sorted(from_number.keys())
var_list = [from_number[number] for number in sorted_keys]
# print(var_list)

## EI Training

In [None]:
### initialize the exeInt object used for training and detection in the cells below
ei = exeInt()

In [None]:
### show the training trace files listed from <normalbase_path>/train_data
train_data_path

In [None]:
### get execution intervals for all variables from the training traces
### (judging by the names, the second return value is the same data split per file -- TODO confirm)

exe_list, filewise_exe_list = ei.get_exeint(train_data_path)

In [None]:
################## methods to detect outliers based on execution intervals ####################

### dynamic thresholds computed from the training execution intervals
thresholds = ei.get_dynamicthresh(exe_list)

### lof models trained on the same execution intervals
lof_models = ei.train_lof(exe_list)

### re-key the thresholds through from_number so they can be inspected per variable
thresholds_var = dict()
for key, limits in thresholds.items():
    print('key:', key)
    thresholds_var[from_number[key]] = limits

### fails if two keys were mapped onto the same entry of from_number
assert len(thresholds) == len(thresholds_var)

### only the thresholds are written to disk here; the lof models are not saved
save_json(thresholds_var, os.path.join(faultybase_path, 'thresholds.json'))

In [None]:
### show the thresholds after re-keying them through from_number
thresholds_var

## Replicate Runtime Detection

In [None]:
### show the mapping built by mapint2var from the loaded varlist
from_number

In [None]:
#### Detect anomalies in faulty traces
DIFF_VAL = 5    ### passed to ei.merge_detections when merging detections

### per-file results, each stored as a (trace_path, items, label_path) tuple
all_tp = []
all_fp = []
all_fn = []
all_gt = []
all_detections = []          ### merged detections of every file, used to plot detections
all_group_detections = []    ### reserved for grouped detections (not filled in this cell)
all_merged_detections = []   ### reserved for merged detections (not filled in this cell)

### flat prediction / truth labels accumulated over all files for the metrics
y_pred_all = []
y_true_all = []

for test_data, test_label in zip(test_data_path, test_label_path):
    print(test_data, test_label)

    ### threshold based detection; detection format: [var, (ts1, ts2), file_name]
    ### (lof based alternative: ei.test_single(test_data, lof_models=lof_models))
    detection = ei.test_single(test_data, thresholds=thresholds)
    before_merge = len(detection)

    ### merge detections for multiple variables
    ### (alternative: ei.remove_duplicates(detection, DIFF_VAL) removes multiple
    ###  detections for a single ground truth)
    merged_detection, grouped_det = ei.merge_detections(detection, DIFF_VAL)
    detection = merged_detection
    after_merge = len(detection)
    print('before merge:', before_merge, 'after merge:', after_merge)

    all_detections.append((test_data, detection, test_label))

    ### load ground truths: the labels of the first trace listed in the label file
    ground_truth_raw = read_traces(test_label)
    labels_by_trace = ground_truth_raw['labels']
    label_trace_name = list(labels_by_trace.keys())[0]
    ground_truth = labels_by_trace[label_trace_name]
    print('ground truths:', ground_truth)
    print(len(ground_truth))

    ### split the detections into correct and remaining ones and collect the missed ground truths
    correct_pred, rest_pred, y_pred, y_true, false_neg = ei.get_correct_detections(detection, ground_truth)

    ### every detection must land in exactly one of the two groups
    assert len(detection) == len(correct_pred) + len(rest_pred)

    all_tp.append((test_data, correct_pred, test_label))
    all_fp.append((test_data, rest_pred, test_label))
    all_fn.append((test_data, false_neg, test_label))
    all_gt.append((test_data, ground_truth, test_label))

    y_pred_all.extend(y_pred)
    y_true_all.extend(y_true)

In [None]:
### Evaluation metrics
### Precision, recall, F1 and the confusion matrix over the labels accumulated
### from all test files (y_true_all / y_pred_all).

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, average_precision_score, ConfusionMatrixDisplay


# Calculate precision
precision = precision_score(y_true_all, y_pred_all)
print(f'Precision: {precision:.4f}')

# Calculate recall
recall = recall_score(y_true_all, y_pred_all)
print(f'Recall: {recall:.4f}')

# # Calculate average precision
# average_precision = average_precision_score(y_true_all, y_pred_all)
# print(f'Average Precision: {average_precision:.4f}')

# Calculate F1 score
f1 = f1_score(y_true_all, y_pred_all)
print(f"F1 Score: {f1:.4f}")

# Calculate confusion matrix
# Passing labels=[0, 1] always yields a 2x2 matrix, even when only one class
# occurs in the data. The previous 1x1 -> 2x2 patch always put the count in the
# anomaly/anomaly cell, which is wrong when the single class present is 'normal'.
# NOTE(review): assumes the labels are 0 (normal) and 1 (anomaly), matching
# display_labels below -- confirm against ei.get_correct_detections.
conf_matrix = confusion_matrix(y_true_all, y_pred_all, labels=[0, 1])
print("Confusion Matrix:")
print(conf_matrix)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['normal', 'anomaly'])
disp.plot()

## Classwise Detections

In [None]:
### Split every ground-truth label into missed (fn) or detected (tp) and group
### both by class; label[4] is used as the class key.
classwise_fn = defaultdict(list)
classwise_tp = defaultdict(list)
gt_len = 0
for file_fn, file_gt in zip(all_fn, all_gt):
    fn, gt = file_fn[1], file_gt[1]
    for label in gt:
        ### a label listed among the file's false negatives was missed,
        ### every other label counts as detected
        bucket = classwise_fn if label in fn else classwise_tp
        bucket[label[4]].append(label)

    gt_len += len(gt)

### per-class summary
total_fn = 0
total_tp = 0
keys = set([*classwise_fn.keys(), *classwise_tp.keys()])
for key in keys:
    n_missed = len(classwise_fn[key])
    n_detected = len(classwise_tp[key])
    total_fn += n_missed
    total_tp += n_detected

    print('class:', key)
    print('detected:', n_detected)
    print('total anomalies:', n_missed + n_detected)
    print('\n')

### every ground-truth label must have been counted exactly once
assert total_fn+total_tp == gt_len, 'total fn+tp not equal to total gt'

In [None]:
### Crop one sub-sequence of the trace per detection, save it as JSON and record
### the classes of the ground-truth anomalies that fall inside it.

### number of events added in front of the detection when cropping a sub-sequence
### NOTE(review): only the lower bound is widened by the buffer; the upper bound
### is used as-is -- confirm whether the buffer should also extend after the detection.
BUFFER_EVENTS = 0

total_subseq_lens = []              ### length of every cropped sub-sequence
subseq_class = defaultdict(list)    ### sub-sequence name (without '.json') -> class labels
for test_data, detections, test_label in all_detections:
    print(test_data, test_label)

    ### read traces
    trace = read_traces(test_data)
    print('trace:', len(trace))

    ### path for sub-sequences: the label directory with 'labels' replaced by 'diag_subseq'
    subseq_path = os.path.dirname(test_label).replace('labels', 'diag_subseq')

    ### timestamps of all events (second item of every trace entry)
    timestamps = np.array([event[1] for event in trace])

    ### load ground truths: the labels of the first trace listed in the label file
    ground_truth_raw = read_traces(test_label)
    ground_truth = ground_truth_raw['labels']
    label_trace_name = list(ground_truth.keys())[0]
    ground_truth = ground_truth[label_trace_name]
    print(len(ground_truth))
    print(len(detections))

    all_subseq = []

    for det in detections:
        var, ts, det_file = det
        lb_det, ub_det = ts

        ### indices of the events closest in time to the detection bounds
        ### (vectorised; picks the same first-minimum index as a Python-level scan)
        lb_det_ind = int(np.argmin(np.abs(timestamps - lb_det)))
        ub_det_ind = int(np.argmin(np.abs(timestamps - ub_det)))

        lb_trace_ind = max(lb_det_ind - BUFFER_EVENTS, 0)
        ub_trace_ind = ub_det_ind

        ### the slice excludes the event at ub_trace_ind itself
        sub_seq = trace[lb_trace_ind:ub_trace_ind]

        all_subseq.append(sub_seq)

        ### save subsequence
        ### Build the extension-less name first. The previous
        ### sub_seq_name.strip('.json') removed any of the characters
        ### '.', 'j', 's', 'o', 'n' from BOTH ends of the name, which corrupts
        ### names that start or end with one of those letters.
        file_name = os.path.basename(test_data)+'_'+str(lb_trace_ind)+'-'+str(ub_trace_ind)
        sub_seq_name = os.path.join(subseq_path, file_name+'.json')
        os.makedirs(subseq_path, exist_ok=True)
        save_json(sub_seq, sub_seq_name)
        print('subseq:', sub_seq_name)

        ### get labels for subsequence: gt[0] / gt[1] are compared against the
        ### trace indices, gt[4] is stored as the class
        ### NOTE(review): a ground truth that fully encloses the window, or that
        ### only touches a boundary index, is not matched by this test -- confirm
        ### that this is intended.
        no_gt = True
        for gt in ground_truth:
            gt_ind1, gt_ind2 = gt[0], gt[1]
            if lb_trace_ind < gt_ind1 < ub_trace_ind or lb_trace_ind < gt_ind2 < ub_trace_ind:
                subseq_class[file_name] += [gt[4]]
                no_gt = False

        if no_gt:
            subseq_class[file_name] += [100]   ### 100 is the label for FP detections

        ### collect the length of the subsequence
        total_subseq_lens += [len(sub_seq)]
        ### NOTE(review): an empty sub-sequence stops processing of the remaining
        ### detections of this file -- looks like a debugging aid, confirm.
        if len(sub_seq) == 0:
            break

    print('')

### save the subsequence class labels
### (subseq_path is the one of the last processed file)
subseq_class_path = os.path.join(subseq_path, 'subseq_labels')
os.makedirs(subseq_class_path, exist_ok=True)
save_json(subseq_class, os.path.join(subseq_class_path, 'subseq_class.json'))

# avg_subseq_len = np.mean(total_subseq_lens)
# print('average subsequence length:', avg_subseq_len)
# median_subseq_len = np.median(total_subseq_lens)
# print('median subsequence length:', median_subseq_len)
# max_subseq_len = np.max(total_subseq_lens)
# print('max subsequence length:', max_subseq_len)

In [None]:
### show the class labels collected for every saved sub-sequence
subseq_class

## Trace Plot

In [None]:
### Split the (value, timestamp) pairs of sub_seq into two parallel lists.
### sub_seq is whatever the cropping loop assigned last.
num_trace, time_stamp = [], []
for t, ts in sub_seq:
    num_trace.append(t)
    time_stamp.append(ts)
    # ### take limited samples
    # if ts > 250000:
    #     break

In [None]:
########## process the traces ###########
### x_data: time stamps, y_data: traces
plot_data = {
    'time': time_stamp,
    'subseq': num_trace,
}
df_trace = pd.DataFrame(plot_data, columns=['time', 'subseq'])

In [None]:
### plot the sub-sequence held in df_trace and display the returned object
### (var_list is presumably used to label the plotted values -- TODO confirm)
trace_obj = plot_single_trace(df_trace, var_list, with_time=False, is_xticks=True)
trace_obj.show()

In [None]:
############ configuration ################
############################################
### Collect the data paths of the applications into app_paths[<application>].
### NOTE: this cell overwrites the notebook-level configuration and path
### variables (CODE, normalbase_path, train_data_path, ...).

all_applications = ['theft_protection', 'mamba2', 'lora_ducy']
app_paths = defaultdict(dict)

for CODE in all_applications:

    BEHAVIOUR_FAULTY = 'faulty_data'    ### behaviour folder of the faulty traces
    BEHAVIOUR_NORMAL = 'normal'         ### behaviour folder of the normal traces
    THREAD = 'single'                   ### single, multi
    VER = 3                             ### format of data collection

    base_dir = '../../trace_data'       ### can be replaced with 'csv', 'exe_plot', 'histogram'
    normalbase_path = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
    faultybase_path = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

    app_paths[CODE].update(
        base_dir=base_dir,
        normalbase_path=normalbase_path,
        faultybase_path=faultybase_path,
    )

    print(normalbase_path, faultybase_path, sep='\n')

    ### training traces and the training varlist files
    train_base_path = os.path.join(normalbase_path, 'train_data')
    train_data_path = [os.path.join(train_base_path, entry) for entry in os.listdir(train_base_path)]
    train_varlist_path = [
        os.path.join(normalbase_path, entry)
        for entry in os.listdir(normalbase_path)
        if 'varlist' in entry
    ]

    ######### get paths #######################
    paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

    ### drop every path that mentions .DS_Store, in all six lists
    (train_data_path, train_varlist_path, paths_log,
     paths_traces, varlist_path, paths_label) = (
        [path for path in group if '.DS_Store' not in path]
        for group in (train_data_path, train_varlist_path, paths_log,
                      paths_traces, varlist_path, paths_label)
    )

    ### sort the faulty-data lists in place
    for listing in (paths_log, paths_traces, varlist_path, paths_label):
        listing.sort()

    ### the faulty traces and their labels are the test set
    test_data_path = paths_traces
    test_label_path = paths_label

    app_paths[CODE].update(
        train_data_path=train_data_path,
        train_varlist_path=train_varlist_path,
        paths_log=paths_log,
        paths_traces=paths_traces,
        varlist_path=varlist_path,
        paths_label=paths_label,
        test_data_path=test_data_path,
        test_label_path=test_label_path,
    )

    print(train_data_path, test_data_path, test_label_path, sep='\n')

    ### NOTE(review): stops after the first application, so only
    ### all_applications[0] ends up in app_paths -- confirm this is intended.
    break