# Execution Interval Method

In [None]:
import json
import os
import numpy as np
import pandas as pd
from scipy.stats import t
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *



def calculate_confidence_interval(data, confidence=0.95):
    '''
    Calculate the two-sided Student-t confidence interval of the mean of data.

    data: a list of execution intervals -> [1,2,3,4,5,6,7,8,9,10]
    confidence: confidence level of the interval (default 0.95)
    return:
        (start, end) -> lower and upper bound of the interval.
        (nan, nan) is returned when data has fewer than two values, because
        the sample standard deviation (ddof=1) is undefined in that case.
    '''
    n = len(data)
    if n < 2:
        ### same result the plain formula would give (NaN), but without
        ### triggering numpy/scipy RuntimeWarnings on the way
        return np.nan, np.nan

    m = np.mean(data)
    ### standard error of the mean, using the sample standard deviation
    std_err = np.std(data, ddof=1) / np.sqrt(n)
    ### margin of error = t critical value (n-1 degrees of freedom) * standard error
    h = std_err * t.ppf((1 + confidence) / 2, n - 1)
    return m - h, m + h

def get_uniquevar(raw_trace):
    '''
    convert the v2.2 trace into list of unique variables
    raw_trace = data from read_traces, list( (var, ts),(var, ts),(var, ts),.... )
    return:
        unique_var = list(var1,var2,...) ## list of strings, in order of first appearance
    '''
    seen = set()        ### O(1) membership test instead of scanning the result list
    unique_var = []
    for var, _timestamp in raw_trace:
        if var not in seen:
            seen.add(var)
            unique_var.append(var)
    return unique_var


def generate_map(unique_events):
    '''
    Build the two lookup tables between variable names and event numbers.

    unique_events -> list of all the variables in the code (unique, and in order of logging)
    return:
        event_map -> variable name to event number (numbering starts at 1)
        event_remap -> event number to variable name
    '''
    numbered = list(enumerate(unique_events, start=1))
    event_remap = dict(numbered)
    event_map = {name: number for number, name in numbered}
    return (event_map, event_remap)


    

## Load Data

In [None]:
############ configuration ################
############################################

CODE = 'theft_protection'           ### application (code) whose traces are analysed
BEHAVIOUR_FAULTY = 'faulty_data'    ### sub-folder name of the faulty traces
BEHAVIOUR_NORMAL = 'normal'         ### sub-folder name of the normal traces
THREAD = 'single'                   ### single, multi
VER = 3                             ### format of data collection

### root folder of the traces (original note: can be replaced with 'csv', 'exe_plot', 'histogram')
base_dir = '../../trace_data'
normalbase_path = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print(normalbase_path, faultybase_path, sep='\n')

In [None]:

### training traces live in the 'train_data' folder of the normal-behaviour data
train_base_path = os.path.join(normalbase_path, 'train_data')
### os.listdir returns entries in arbitrary order; sort them so the file order
### (and everything printed from it) is reproducible, like the lists below
train_data_path = [os.path.join(train_base_path, x) for x in sorted(os.listdir(train_base_path))]
print(train_data_path)


######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

### sort every list in place so the four lists are in a deterministic order
paths_log.sort()
paths_traces.sort()
varlist_path.sort()
paths_label.sort()

print(paths_log)
print(paths_traces)
print(varlist_path)
print(paths_label)

### the faulty traces and their labels are used as the test set
test_data_path = paths_traces
test_label_path = paths_label


In [None]:
############# check varlist is consistent ############
############# only for version 3 ######################

if VER == 3:
    to_number = is_consistent(varlist_path)

    ### is_consistent is expected to return False when the check fails
    ### (TODO confirm against libraries.utils); any other result is handed to mapint2var
    if to_number != False:
        from_number = mapint2var(to_number)
    else:
        ### fail here with a clear message instead of leaving from_number
        ### undefined and hitting a NameError in a later cell
        raise ValueError(f'varlist files are not consistent: {varlist_path}')


In [None]:
############ Get variable list ######################
### the values of from_number, in its iteration order
var_list = list(from_number.values())

In [None]:
'''
Done till here
'''

## Confidence Interval

__Confidence Interval:__

A confidence interval is a range around the sample mean that is likely to contain the true population mean. It is computed as mean ± margin of error, where the margin of error depends on the desired confidence level and the standard error.

_Example:_

1. Choose a confidence level (e.g., 95%).
2. Calculate the standard error: standard deviation / sqrt(number of observations)
3. Calculate the margin of error: critical value × standard error
4. Determine the confidence interval: mean ± margin of error


### Path to Traces

### Data Processing

In [None]:
### get execution intervals for all variables
exe_list = {}   ### pooled over all files: {var1: [1,2,3,...], var2: [1,2,3,...], ....}
filewise_exe_list = {}   ### per file: {file1: {var1: [1,2,3,...], var2: [...], ....}, file2: {...}, ....}
for sample_path in train_data_path:
    print(sample_path)
    sample_data = read_traces(sample_path)
    ### file name without directory and extension; os.path.basename is used because
    ### the path was built with os.path.join and may not use '/' as separator
    filename = os.path.basename(sample_path).split('.')[0]

    ### collect timestamps for all variables, in trace order
    timestamps = {}
    for var, ts in sample_data:
        timestamps.setdefault(var, []).append(int(ts))

    print(timestamps.keys())
    ### calculate execution intervals for all variables:
    ### difference between consecutive timestamps of the same variable
    intervals = {}
    for key, ts_list in timestamps.items():
        for ts1, ts2 in zip(ts_list, ts_list[1:]):
            ### convert the interval from milliseconds to seconds, rounded to 2 decimals.
            ### NOTE(review): the validation cell rounds to 1 decimal - confirm which precision is intended
            exe_time = round((ts2 - ts1) / 1000, 2)
            print(key, ts1, ts2)

            ### a variable only gets an entry once it has at least one interval
            intervals.setdefault(key, []).append(exe_time)   ### filewise exe list
            exe_list.setdefault(key, []).append(exe_time)    ### overall exe list

    filewise_exe_list[filename] = intervals

In [None]:
### display the pooled execution intervals stored under key 13
exe_list[13]

In [None]:
### get the confidence intervals for all variables
### {var: [start, end]} computed from the pooled execution intervals of each variable
confidence_intervals = {
    key: list(calculate_confidence_interval(data))
    for key, data in exe_list.items()
}

In [None]:
############ calculate upper and lower thresholds for all variables ############
from collections import Counter

### unique_values[var] -> {interval: probability} for intervals with probability > 0.05
### outliers[var]      -> {interval: probability} for the remaining (rare) intervals
unique_values = {}
outliers = {}
for key, data in exe_list.items():
    ### probability of each distinct interval = occurrences / total, counted in one pass
    total = len(data)
    prob = {val: count / total for val, count in Counter(data).items()}

    print(key)
    print(prob.keys())
    ### consider values with probability > 0.05
    filtered_values = {}
    out = {}
    for val, p in prob.items():
        print(p)
        if p > 0.05:
            filtered_values[val] = p
        else:
            out[val] = p

    unique_values[key] = filtered_values
    outliers[key] = out


### get upper and lower bound by taking min and max from unique_values (can try some other approach)
thresholds = {}
for key, kept in unique_values.items():
    ### if no interval passes the 0.05 cutoff (many distinct, equally rare values),
    ### fall back to the full observed range instead of failing on an empty min()/max()
    values = list(kept) or list(outliers[key])
    ### widen the range by 0.1 s on both sides and keep 1 decimal
    thresholds[key] = [round(min(values) - 0.1, 1), round(max(values) + 0.1, 1)]


In [None]:
### display the [lower, upper] threshold computed for every variable
thresholds

In [None]:
### display the pooled execution intervals stored under key 9
exe_list[9]

### Visualising Thresholds

In [None]:
#### plot exe_list to visualize the distribution of execution intervals

import plotly.graph_objects as go

for key in exe_list.keys():
    ci_low, ci_high = confidence_intervals[key][0], confidence_intervals[key][1]
    thr_low, thr_high = min(thresholds[key]), max(thresholds[key])

    fig = go.Figure()

    # Histogram (normalised, so bar heights are probabilities)
    fig.add_trace(go.Histogram(x=exe_list[key], nbinsx=100, name='execution intervals', histnorm='probability', marker=dict(color='midnightblue')))

    # Vertical lines spanning the full plot height: red = confidence interval, green = thresholds
    for x_pos, colour in ((ci_low, "Red"), (ci_high, "Red"), (thr_low, "Green"), (thr_high, "Green")):
        fig.add_shape(type="line", x0=x_pos, x1=x_pos, y0=0, y1=1, yref='paper', line=dict(color=colour, dash="dash"))

    # Shapes do not show up in the legend, so add single-point line traces
    # whose only purpose is to create the legend entries
    fig.add_trace(go.Scatter(x=[ci_low], y=[0], mode='lines', name='Confidence Interval', line=dict(color="Red", dash="dash"), showlegend=True))
    fig.add_trace(go.Scatter(x=[thr_low], y=[0], mode='lines', name='Dynamic Threshold', line=dict(color="Green", dash="dash"), showlegend=True))

    # Layout
    # axis title fonts are set through title.font; the older `titlefont`
    # attribute is deprecated and rejected by newer plotly releases
    fig.update_layout(title=key, bargap=0.2, bargroupgap=0.1, title_font_size=20,
                        xaxis=dict(
                            title=dict(text="Value", font=dict(size=20)),
                            tickfont=dict(size=20),
                            color='black',
                        ),
                        yaxis=dict(
                            # histnorm='probability' -> the axis shows probabilities, not counts
                            title=dict(text="Probability", font=dict(size=20)),
                            tickfont=dict(size=20),
                            color='black',
                        ),
                        plot_bgcolor='rgba(0,0,0,0)',)

    # same frame styling for both axes
    axis_style = dict(
        mirror=True,
        ticks='outside',
        showline=True,
        linecolor='black',
        gridcolor='lightgrey',
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)

    fig.show()
    

### Validation

In [None]:
#### Detect anomalies in faulty traces
### each entry: [var, (previous_ts, current_ts), trace file name]
detected_anomalies = []
for sample_path in test_data_path:
    sample_data = read_traces(sample_path)
    ### file name without directory and extension (portable across path separators)
    filename = os.path.basename(sample_path).split('.')[0]

    ### iterate trace and make decision for each exe interval
    var_tracking = {}   ### {var: [ts, ts, ...]} timestamps seen so far in this trace
    for i, (var, ts) in enumerate(sample_data):
        history = var_tracking.setdefault(var, [])
        history.append(int(ts))

        ### an interval needs two occurrences of the same variable
        if len(history) < 2:
            continue

        ### variables without a learned threshold (no interval in the training
        ### traces) cannot be judged; skip them instead of failing with a KeyError
        bounds = thresholds.get(var)
        if bounds is None:
            continue

        ### calculate exe interval
        prev_ts, curr_ts = history[-2], history[-1]
        ### convert the interval from milliseconds to seconds, rounded to 1 decimal
        exe_time = round((curr_ts - prev_ts) / 1000, 1)

        ### check if exe_time is an outlier (outside [lower, upper] threshold)
        if exe_time < bounds[0] or exe_time > bounds[1]:
            print(f'Anomaly detected for {var} in {filename} at {i}th event')
            print(f'Execution interval: {exe_time}')
            detected_anomalies.append([var, (prev_ts, curr_ts), os.path.basename(sample_path)])

In [None]:
### display all detections: [var, (previous_ts, current_ts), trace file name]
detected_anomalies

In [None]:
'''
TODO:
- plot the detections
- group the detections for single anomaly that are detected in different variables
- since the detection in these methods are between two events, the gt will usually lie in between the detected timestamps
- once these issues are solved, evaluate the performance
'''