# State Transition - only states

In [None]:
import numpy as np
import os
from libraries.utility import load_sample
from libraries.state_transition import StateTransition as st
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## Instance

In [None]:
############ configuration ################
############################################

# Experiment selection: which application, behaviour and data format to use.
code = 'theft_protection'   # application (code)
behaviour = 'faulty_data'   # one of: normal, faulty_data
thread_typ = 'single'       # one of: single, multi
version = 2.2               # format of data collection

# Label folders live below <base_dir>/version_<version>/<behaviour>/.
base_dir = '../data-subtraces'  # can be replaced with 'csv', 'exe_plot', 'histogram'
train_label_path = f'{base_dir}/version_{version}/{behaviour}/train_label'
test_label_path = f'{base_dir}/version_{version}/{behaviour}/test_label'
print(train_label_path)

#### fetch files from labels


### Get file paths

In [None]:
### prepare train and test data
'''
train_data :
    absolute path to the sample files (.npy) containing event traces of length 50 (can be longer and shorter), these event traces will be a part of a bigger trace.
    event traces -> list( ['1_control_updatedata_cls.sensor_data' '997892'], ['1_control_readdata_0' '997896'], ['1_0_loracom_data' '997901'],..... )

test_data:
    same as above

labels:
    absolute path to .xlsx files containing transion between two event and its label. 
    labels ->   ind | s1 | s2 | ts1 | ts2 | label |   -> tha labels should be in given format and column heading
'''
# Collect the label workbooks of each split, skipping the macOS '.DS_Store'
# folder-metadata entry, and derive the sample path that belongs to each label
# file by swapping the label folder for 'subtraces' and '.xlsx' for '.npy'.
train_labels = [
    os.path.join(train_label_path, x)
    for x in os.listdir(train_label_path)
    if x != '.DS_Store'
]
train_data = [x.replace('train_label', 'subtraces').replace('.xlsx', '.npy') for x in train_labels]

# The '.DS_Store' filter has to be applied to the directory listing (a list of
# file names), not to the directory path string.
test_labels = [
    os.path.join(test_label_path, x)
    for x in os.listdir(test_label_path)
    if x != '.DS_Store'
]
test_data = [x.replace('test_label', 'subtraces').replace('.xlsx', '.npy') for x in test_labels]

### Training

In [None]:
### initialize
# `st` is the StateTransition class imported from libraries.state_transition.
model = st()
# Fit on the training sample paths (the .npy files derived from the train labels).
model.train(train_data)

In [None]:
# Keep a handle on the trained model's `transitions` attribute for inspection/export.
transitions = model.transitions

In [None]:
# Notebook display of the transitions held by the trained model.
transitions

In [None]:
import pandas as pd
from pptx import Presentation
from pptx.util import Inches
from pptx.util import Pt
from pptx.util import Inches
from pptx.dml.color import RGBColor

# Export the transition table to a single-slide PowerPoint file.

# One row per dictionary entry: the key and its associated value(s).
df = pd.DataFrame(list(transitions.items()), columns=['Key', 'Values'])

# Collapse everything stored under a key into one printable string per key
# (groupby also orders the rows by key).
df_combined = df.groupby('Key')['Values'].agg(lambda x: ', '.join(map(str, x))).reset_index()

# Create a PowerPoint presentation with one slide.
presentation = Presentation()
# NOTE(review): in python-pptx's default template, layout index 5 is
# "Title Only" and index 6 is "Blank" - confirm which one is wanted here.
slide_layout = presentation.slide_layouts[5]
slide = presentation.slides.add_slide(slide_layout)

# Position and size of the table on the slide.
left = Inches(1)
top = Inches(1)
width = Inches(6)
height = Inches(4)

# One extra row on top of the data rows for the column headings.
table = slide.shapes.add_table(
    rows=df_combined.shape[0] + 1,
    cols=df_combined.shape[1],
    left=left,
    top=top,
    width=width,
    height=height,
).table


def _fill_cell(cell, text, header=False):
    """Write `text` into a table cell at 10 pt; headers are bold on light gray."""
    # Assigning `cell.text` already replaces the text-frame content, so a
    # second write through `cell.text_frame.text` is not needed.
    cell.text = text
    font = cell.text_frame.paragraphs[0].font
    font.size = Pt(10)
    if header:
        font.bold = True
        cell.fill.solid()
        cell.fill.fore_color.rgb = RGBColor(240, 240, 240)  # Light gray background color


# Header row with the column names.
for col, col_name in enumerate(df_combined.columns):
    _fill_cell(table.cell(0, col), col_name, header=True)

# Data rows, shifted down by one because of the header row.
for row, record in enumerate(df_combined.itertuples(index=False), start=1):
    for col, value in enumerate(record):
        _fill_cell(table.cell(row, col), str(value))

# Save the PowerPoint presentation
presentation.save('table_presentation.pptx')



### Validation

In [None]:
#### Validate model
# Run the trained model on the test samples. The evaluation cell reads each
# entry of `result` as ((s1, ts1), (s2, ts2), file_name, ...).
result = model.test(test_data)

In [None]:
########################
#### Extract the ground truth
########################
# count=0
# ground_truth = defaultdict(list)  ### labels of the events that are anomalous
# for lab in test_labels:
#     # print(lab)
#     file_name = os.path.basename(lab).removesuffix('.xlsx')
#     labels = pd.read_excel(lab)
#     columns = labels.columns
#     # print(labels)
#     for index, row in labels.iterrows():
#         if row['label'] == 1:
#             count+=1
#             ground_truth[file_name] += [[(row['s1'],row['ts1']), (row['s2'],row['ts2']), row['ind']]]

count = 0            ### number of transitions labelled as anomalous
ground_truth = []    ### labels of the events that are anomalous
for label_file in test_labels:
    # Label workbook name without its extension identifies the trace file.
    file_name = os.path.basename(label_file).removesuffix('.xlsx')
    labels = pd.read_excel(label_file)
    columns = labels.columns
    for _, entry in labels.iterrows():
        # Only transitions marked with label 1 are anomalies.
        if entry['label'] != 1:
            continue
        count += 1
        ground_truth.append(
            [(entry['s1'], entry['ts1']), (entry['s2'], entry['ts2']), file_name, entry['ind']]
        )


In [None]:
#########################
########## Evaluate Results
#########################

# ### ground truth for metrics
# y_true = []
# detected = False
# for pred in result:
#     print(pred)
#     p_file = pred[2].removesuffix('.npy')
#     ps1, pts1 = pred[0]
#     ps2, pts2 = pred[1]
#     # print(p_file)
#     if p_file in ground_truth:
#         events = ground_truth[p_file]
#         for gt in events:
#             (gs1,gts1), (gs2,gts2), ind = gt[0], gt[1], gt[2]
#             # print(ps1, pts1,ps2, pts2, gs1,gts1, gs2,gts2)
#             # print( ps1==gs1 and str(pts1)==str(gts1) and ps2==gs2 and str(pts2)==str(gts2) )   
    
#             if ps1==gs1 and str(pts1)==str(gts1) and ps2==gs2 and str(pts2)==str(gts2):
#                 detected = True
#                 break ### not for testing, part of code

#     if detected==True:
#         y_true.append(1)
#         detected=False
#     else:
#         y_true.append(0)

# y_true = np.array(y_true)

# ### predictions for metrics
# y_pred = np.ones(len(result))

#############################################################################################

### ground truth for metrics
# Every entry of `ground_truth` is a labelled anomaly, so y_true is all ones.
# y_pred[i] is set to 1 when some prediction matches the i-th labelled anomaly.
n_labelled = len(ground_truth)
y_pred = np.zeros(n_labelled)
y_true = np.ones(n_labelled)

# Remember the original position of each label. y_pred is indexed by the
# position of the matched label (not by the position of the prediction), so it
# stays in bounds even when `result` holds more entries than `ground_truth`.
pending = list(enumerate(ground_truth))
for pred in result:
    p_file = pred[2].removesuffix('.npy')
    ps1, pts1 = pred[0]
    ps2, pts2 = pred[1]

    matched_at = None
    for pos, (_, gt) in enumerate(pending):
        (gs1, gts1), (gs2, gts2), g_file, ind = gt[0], gt[1], gt[2], gt[3]
        # Timestamps are compared as strings because predictions and labels
        # may carry them in different types.
        if ps1 == gs1 and str(pts1) == str(gts1) and ps2 == gs2 and str(pts2) == str(gts2) and p_file == g_file:
            matched_at = pos
            break

    if matched_at is None:
        # Prediction without a matching label.
        print(pred)
    else:
        # Each label can be matched once; drop it so only undetected ones remain.
        gt_index, _ = pending.pop(matched_at)
        y_pred[gt_index] = 1

### keep only the instances that were not detected (updated in place)
ground_truth[:] = [gt for _, gt in pending]

    

In [None]:
# Notebook display: number of entries in the prediction vector.
len(y_pred)

In [None]:
# (file name, label index) of every entry currently left in ground_truth.
trial = [(entry[2], entry[3]) for entry in ground_truth]

# Order the pairs by file name.
sorted_trial = sorted(trial, key=lambda pair: pair[0])

In [None]:
### Evaluation metrics

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

# Scores are computed from the module-level y_true / y_pred vectors.
# NOTE(review): when these come from the evaluation cell that sets
# y_true = np.ones(...), there are no negative samples, so precision and the
# confusion matrix carry little information - confirm this is intended.

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate precision
precision = precision_score(y_true, y_pred)
print(f'Precision: {precision:.4f}')

# Calculate recall
recall = recall_score(y_true, y_pred)
print(f'Recall: {recall:.4f}')

# Calculate F1 score
f1 = f1_score(y_true, y_pred)
print(f"F1 Score: {f1:.4f}")

## Subtrace

In [None]:
############ configuration ################
############################################

# Experiment selection for the fixed-length sub-trace evaluation.
code = 'theft_protection'   # application (code)
behaviour = 'faulty_data'   # one of: normal, faulty_data
thread_typ = 'single'       # one of: single, multi
version = 2.2               # format of data collection
sub_len = 50                # sub-trace length folder to read from

# Normal and anomalous sub-traces sit side by side below the sub_len folder.
base_dir = '../data-subtraces'  # can be replaced with 'csv', 'exe_plot', 'histogram'
normal_path = f'{base_dir}/version_{version}/{behaviour}/subtraces/{sub_len}/normal'
anomalies_path = f'{base_dir}/version_{version}/{behaviour}/subtraces/{sub_len}/anomalies'
print(normal_path, anomalies_path)


In [None]:
############ Get the Data ############
# Full paths of every sample, ignoring the macOS '.DS_Store' entry.
normal_files = [
    os.path.join(normal_path, name)
    for name in os.listdir(normal_path)
    if name != '.DS_Store'
]
anomalies_files = [
    os.path.join(anomalies_path, name)
    for name in os.listdir(anomalies_path)
    if name != '.DS_Store'
]

# Class labels: 0 for normal samples, 1 for anomalous samples.
normal_labels = [0 for _ in normal_files]
anomalies_labels = [1 for _ in anomalies_files]

#### split the normal data in 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(
    normal_files, normal_labels, test_size=0.2, random_state=42
)

#### add all anomalous samples to the held-out normal samples (test set only)
X_test.extend(anomalies_files)
y_test.extend(anomalies_labels)

#### shuffle test files and labels together, with a fixed seed
X_test, y_test = shuffle(X_test, y_test, random_state=42)
    

In [None]:
############ initialize and train ############
# A fresh StateTransition model, fitted on the training split of the normal samples only.

model = st()
model.train(X_train)

### Validation

In [None]:
############ prediction on test data ############
detected_anomalies = []   # non-empty results returned by the model
y_pred = []               # 1 when the model reported something for a sample, else 0
for i, test in enumerate(X_test):
    print(i, test)
    result = model.test_single(test)

    ### an empty list from the model means nothing was flagged in this sample
    nothing_flagged = (result == [])
    y_pred.append(0 if nothing_flagged else 1)
    if not nothing_flagged:
        detected_anomalies.append(result)


In [None]:
############ Evaluation ############
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

# Sample-level scores: y_test holds the labels assigned when the test set was
# built (0 normal, 1 anomaly) and y_pred the per-sample predictions.

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.4f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.4f}')





In [None]:
############ plot the confusion matrix ############
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Tick labels of the confusion matrix, in class order 0 (normal), 1 (anomaly).
labels = ['Normal', 'Anomaly']

# Convert the stored predictions and test labels to arrays.
y_pred = np.array(y_pred)
y_true = np.array(y_test)

# Calculate the confusion matrix. Passing both class ids explicitly keeps the
# result 2x2 even when only one class occurs in y_true / y_pred; otherwise the
# 2x2 labelled DataFrame below cannot be built.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Create a DataFrame for the Confusion Matrix (rows: true, columns: predicted)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

# Draw the Confusion Matrix as an annotated heat map
plt.figure(figsize=(10, 6))
sns.heatmap(df_cm, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 24})  # Change font size

# increase tick label font size
plt.yticks(fontsize=22)
plt.xticks(fontsize=22)

# title and axis labels, same font size as the ticks
plt.title('Confusion Matrix', fontsize=22)
plt.ylabel('True Label', fontsize=22)
plt.xlabel('Predicted Label', fontsize=22)
plt.tight_layout()
# plt.savefig('confusion_matrix.png', dpi=300)
plt.show()


# State Transition - with probability

## Complete Trace - Auto labelling

In [None]:
### init libraries
import numpy as np
import os
import json
from libraries.utility import load_sample
from libraries.state_transition import StateTransitionProb as stp
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

def read_traces(log_path):
    '''
    Load one trace file and return its parsed JSON content.

    log_path -> path to a JSON-encoded trace file
    return:
        data -> whatever the file contains; for trace files this is expected to be
                [ [event, timestamp], [], [],......,[] ]
    '''
    # JSON text is UTF-8; state it explicitly so the platform's default
    # encoding cannot change how the file is decoded.
    with open(log_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


def get_uniquevar(raw_trace):
    '''
    Collect the distinct variable names of a v2.2 trace, in order of first appearance.

    raw_trace = data from read_traces, list( (var, ts),(var, ts),(var, ts),.... )
                every entry must unpack into exactly two items; var must be hashable
    return:
        unique_var = list(var1,var2,...) ## list of strings
    '''
    # dict keys are unique and keep insertion order, which gives an ordered
    # de-duplication without rescanning a list for every event.
    return list(dict.fromkeys(var for var, _timestamp in raw_trace))


def generate_map(unique_events):
    '''
    Build the two lookup tables between variable names and event numbers.

    unique_events -> iterable of all the variables in the code (unique, and in order of logging)
    return:
        event_map -> takes the variable name and gives corresponding event number
        event_remap -> takes event number and gives associated variable name

    Event numbers start at 1 and follow the order of unique_events.
    '''
    # Number the events while iterating, so any iterable is accepted.
    event_remap = dict(enumerate(unique_events, start=1))
    # Invert number -> name; if a name repeats, its last number is kept.
    event_map = {name: number for number, name in event_remap.items()}

    return (event_map, event_remap)


In [None]:
############ configuration ################
############################################

# Experiment selection for the complete-trace (auto labelling) evaluation.
code = 'theft_protection'   # application (code)
behaviour = 'faulty_data'   # one of: normal, faulty_data
thread_typ = 'single'       # one of: single, multi
version = 2.2               # format of data collection
sub_len = 'dynamic'         # sub-trace folder to read from

# base_dir = '../data-subtraces' ### can be replaced with 'csv', 'exe_plot', 'histogram'
base_dir = '../../trace_data'  # can be replaced with 'csv', 'exe_plot', 'histogram'
log_path = f'{base_dir}/{code}/{thread_typ}_thread/version_{version}'
print(log_path)

# Raw traces of normal runs and of faulty runs.
# NOTE(review): the anomalies folder name is fixed to 'faulty_data' and does
# not follow `behaviour` - confirm that is intended.
normal_path = f'{log_path}/normal'
anomalies_path = f'{log_path}/faulty_data'
print(normal_path, anomalies_path)

#### subtraces
subtrace_path = f"../data-subtraces/version_{version}/{behaviour}/subtraces/{sub_len}/"


### Path to Traces

In [None]:
###### get file paths #######

def _classify_entries(names):
    """Split file names into (logs, traces, unknown) by their name prefix."""
    log_names, trace_names, other_names = [], [], []
    for name in names:
        if name.startswith('log'):
            log_names.append(name)
        elif name.startswith('trace') and '.txt' not in name:
            # trace files, excluding any name that contains '.txt'
            trace_names.append(name)
        else:
            other_names.append(name)
    return log_names, trace_names, other_names


### normal files
normal_files = sorted(os.listdir(normal_path))
logs, traces, unknown = _classify_entries(normal_files)

######### path to files
normal_logpaths = sorted(os.path.join(normal_path, name) for name in logs)
normal_tracespaths = sorted(os.path.join(normal_path, name) for name in traces)
print(normal_tracespaths)

### anomalies files
anomalies_files = sorted(os.listdir(anomalies_path))
logs, traces, unknown = _classify_entries(anomalies_files)

######### path to files
anomalies_logpaths = sorted(os.path.join(anomalies_path, name) for name in logs)
anomalies_tracespaths = sorted(os.path.join(anomalies_path, name) for name in traces)
print(anomalies_tracespaths)

#### get path to subtraces
files_in_subtrace = os.listdir(subtrace_path)
### keep only the entries whose name contains 'trace'
anomaly_subtraces = [subtrace_path + name for name in files_in_subtrace if 'trace' in name]
print(anomaly_subtraces)


### Training

In [None]:
############################### Training #######################################

############ logic for creating transition table with probabilities from normal data ############
'''
Here we want to go through the trace generated from normal data and create a transition table with probabilities. 
However, the probabilities should show which instances are anomalies and spot them for the developer to evaluate.

Logic:
- create transition table similar as before
- while creating this transition table, allocate a unique number to each transition
- for each transition that occurs store the number of occurences of that transition with reference to the unique number
- calculate probability for each occurence with respect to total events in the trace
- update this probability for multiple traces
'''

# `stp` is the StateTransitionProb class imported from libraries.state_transition;
# it is fitted on the trace files collected from the normal-run folder.
model = stp()
model.train(normal_tracespaths)


In [None]:
# Notebook display of the trained model's `transitions` attribute.
model.transitions

In [None]:
# Notebook display of the trained model's `invalid_transitions` attribute.
model.invalid_transitions

### Testing

In [None]:
############ prediction on test data ############
# Run the trained model over the sub-trace files found in subtrace_path.
# Later cells read each returned entry as ((s1, ts1), (s2, ts2), file_name, ...).
detected_anomalies = model.test(anomaly_subtraces)

In [None]:
# Notebook display of the first detection (raises IndexError if nothing was detected).
detected_anomalies[0]

### Visualization

In [None]:
### prepare the traces for plotting

# ### get variable list that is same for all log files ####
# raw_trace = read_traces(normal_tracespaths[0])
# _var_list = get_uniquevar(raw_trace)
# #np.save('var_list.npy', _var_list, allow_pickle=False)
# to_number, from_number = generate_map(_var_list)

### replace the variable names with event numbers based on var_list.npy
# NOTE: allow_pickle=True lets numpy unpickle stored objects; only load files from a trusted source.
_var_list = np.load('../../analysis scripts/var_list.npy', allow_pickle=True)
to_number, from_number = generate_map(_var_list)


########## process the traces ###########
col_data = []
for p in anomaly_subtraces:
    trace = read_traces(p)
    w = os.path.basename(p).removesuffix('.txt')
    ### every trace entry is an (event name, timestamp) pair
    num_trace = [to_number[event] for (event, _) in trace]
    time_stamp = [stamp for (_, stamp) in trace]
    ### in the format (trace_name, x_data, y_data, y_labels, trace_path)
    col_data.append((w, time_stamp, num_trace, _var_list, p))


### one data frame per trace: column 'time' (x_data) and a column named after the trace (y_data)
all_df = [
    pd.DataFrame({'time': x_data, trace_name: y_data})
    for (trace_name, x_data, y_data, _y_labels, _trace_path) in col_data
]

In [None]:
####### prepare anomalies for plotting ########

### separate the anomalies according to the trace files and store the timestamps in separate lists
### (both timestamps of every detected transition are kept)
anomalies = defaultdict(list)
for anomaly in detected_anomalies:
    file_name = anomaly[2]
    anomalies[file_name] += [anomaly[0][1], anomaly[1][1]]

### sort the anomalies according to the timestamp and remove duplicates
for k in anomalies.keys():
    anomalies[k] = sorted(set(anomalies[k]))

### get the index number for timestamps in the traces to plot
anomalies_df = []
for df in all_df:
    k = df.columns[1]   ### get name of the trace
    ### both columns always exist, so a trace without anomalies still yields
    ### a frame that has the 'index' and '<trace>-anomalies' columns
    index_col, value_col = 'index', k + '-anomalies'
    anomalies_plot = {index_col: [], value_col: []}
    for ts in anomalies[k]:
        ### every row of the trace that carries this timestamp (may be none or several)
        index = df.index[df['time'] == ts].tolist()
        ### store one trace value per matching row so both columns keep equal length
        anomalies_plot[index_col] += index
        anomalies_plot[value_col] += df[k].iloc[index].tolist()

    ### convert the dict to data frame and store it for plotting
    anomalies_df += [pd.DataFrame(anomalies_plot)]


In [None]:
##### plot the traces with anomalies #####

import plotly.graph_objects as go
import pandas as pd




### one figure per trace: the numbered events plus the detected anomalies
for df, dfa in zip(all_df, anomalies_df):

    # ### plot only one graph
    # trace_toplot = 0
    # df = all_df[trace_toplot]
    # dfa = anomalies_df[trace_toplot]
    # ############################

    # Create figure
    fig = go.Figure()

    ### df columns: [0] timestamps, [1] event numbers (column named after the trace)
    df_col = df.columns
    ### no x is given, so the events are plotted against their position in the trace
    fig.add_trace(
                go.Scatter(y=list(df[df_col[1]]), name=df_col[1], mode='markers', marker=dict(size=10, color='midnightblue')),   ### equivalent to: y=list(df['trace1'])
                )
    ### add anomalies to the graph, one single-point trace per anomaly
    for (ax, ay) in zip(dfa['index'], dfa[df_col[1]+'-anomalies']):
        fig.add_trace(
                go.Scatter(x=[ax], y=[ay], name=df_col[1]+'-anomalies', mode='markers', marker=dict(size=50, color='red', symbol='square'), showlegend=False, line=dict(color='black', width=2) ),
                )

    ### generate x ticks with timestamp and index num
    ### (only needed by the commented-out tickvals/ticktext options below)
    x_data = df[df_col[0]]
    x_ticks = [(i,x_data[i]) for i in range(0,len(x_data),10) ]

    # Add range slider, title, yticks, axes labels
    # Axis title fonts are set through title=dict(text=..., font=...); the
    # flat `titlefont` axis attribute is deprecated.
    fig.update_layout(
        title_text="Event Trace without Time",
        xaxis=dict(
            title=dict(text="Number of events", font=dict(size=20)),
            rangeslider=dict(visible=True),
            type='linear',
            # tickvals=[k for k in range(0,len(x_data),10)],
            # ticktext=x_ticks,
            tickfont = dict(size = 20),
            color='black',
        ),
        yaxis=dict(
            title=dict(text="Variables", font=dict(size=20)),
            ### event numbers start at 1, so tick k is labelled with the k-th variable name
            tickvals=[k for k in range(1,len(_var_list)+1)],
            ticktext=_var_list,
            tickfont = dict(size = 20),
            color='black',
        ),
        autosize=True,
        width=1200,
        height=800,
        plot_bgcolor='rgba(0,0,0,0)',
    )

    fig.update_xaxes(
        mirror=True,
        ticks='outside',
        showline=True,
        linecolor='black',
        gridcolor='lightgrey'
    )
    fig.update_yaxes(
        mirror=True,
        ticks='outside',
        showline=True,
        linecolor='black',
        gridcolor='lightgrey'
    )

    # style all the traces
    # NOTE(review): without a selector this applies to every trace, so it
    # replaces the marker sizes, modes and showlegend values given in the
    # add_trace calls above - confirm that is intended.
    fig.update_traces(
        #hoverinfo="name+x+text",
        line={"width": 0.5},
        marker={"size": 8},
        mode="lines+markers",
        showlegend=True,
    )

    fig.show()
    # break

### Evaluation