## Purpose

Test ahc method on the ROAD dataset.

In [1]:
import os 
import sys
import numpy as np
from collections import defaultdict
import CAN_objects.aid_message
import matplotlib.pyplot as plt


actt_path = os.path.join(os.path.join(os.path.expanduser("~"), "Projects", "CAN", "actt"))
os.chdir(actt_path)
sys.path.insert(0, "src") # add src folder to path so that files from this folder can be imported

from generalFunctions import unpickle
import subprocess

import importlib
importlib.reload(CAN_objects.aid_message)
from init_cancapture_from_canlog import init_cancap
import json
import seaborn as sns
import pandas as pd

from CAN_objects.capture import MappedCapture, MatchedCapture
import math
from scipy.cluster.hierarchy import single, complete, average, ward, dendrogram, linkage, fcluster

from pprint import pprint
from sklearn.metrics.cluster import normalized_mutual_info_score

from clusim.clustering import Clustering, remap2match
import clusim.sim as sim

import glob
from tqdm import tqdm
import itertools
from scipy.stats import shapiro, mannwhitneyu, ttest_ind, spearmanr
from sklearn.preprocessing import normalize, scale, MinMaxScaler, StandardScaler

from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.neighbors import NearestNeighbors

## Enable the Use of Functions From the Detect Repo

In [2]:
# sys.path.insert(0, "/home/cades/Projects/CAN/detect/") # add detect folder to path so that files from this folder can be imported
sys.path.insert(0, "/home/cloud/Projects/CAN/detect/") # add detect folder to path so that files from this folder can be imported
import signal_based_preprocess_functions
print(os.getcwd())

/home/cloud/Projects/CAN/actt


## Functions

In [3]:
def hierarchical_clustering(corr_matrix, method="complete"):
    
    if method == "complete":
        Z = complete(corr_matrix)
    if method == "single":
        Z = single(corr_matrix)
    if method == "average":
        Z = average(corr_matrix)
    if method == "ward":
        Z = ward(corr_matrix)
  
    # PLotting the dendrogram
    # fig = plt.figure(figsize=(16, 8))
    # dn = dendrogram(Z)
    # plt.title(f"Dendrogram for {method}-linkage with correlation distance")
    # plt.show()
    
    return Z


def from_capture_to_time_series(cap, ground_truth_dbc_path):
    
    signal_multivar_ts, timepts, aid_signal_tups = signal_based_preprocess_functions.capture_to_mv_signal_timeseries(cap, ground_truth_dbc_path)

    return signal_multivar_ts, timepts, aid_signal_tups


def from_captures_to_time_series(cap_1, cap_2, ground_truth_dbc_path):
        
    signal_multivar_ts_1, timepts_1, aid_signal_tups_1 = signal_based_preprocess_functions.capture_to_mv_signal_timeseries(cap_1, ground_truth_dbc_path)
    signal_multivar_ts_2, timepts_2, aid_signal_tups_2 = signal_based_preprocess_functions.capture_to_mv_signal_timeseries(cap_2, ground_truth_dbc_path)

    return signal_multivar_ts_1, timepts_1, aid_signal_tups_1, signal_multivar_ts_2, timepts_2, aid_signal_tups_2


def from_single_series_to_correlation_matrix(signal_multivar_ts_1, aid_signal_tups_1):
    
    # First dataframe
    # Convert matrix of time series into a dataframe
    df_1 = pd.DataFrame({f"{tup[0]}_{tup[1]}": signal_multivar_ts_1[:,index] for index, tup in enumerate(aid_signal_tups_1)})
    display(df_1)

    # Remove columns with constant values
    df_1 = df_1.loc[:, (df_1 != df_1.iloc[0]).any()] 
    # display(df_1)
    # df_1.dtypes
    
    df_1_scaled = normalize(df_1) # scale
    df_1_scaled = pd.DataFrame(df_1_scaled, columns=df_1.columns)
    df_1 = df_1_scaled
    display(df_1)

    # Compute correlation matrix
    corr_matrix_1 = df_1.corr(method="pearson")
    # display(corr_matrix_1)
    # display(corr_matrix_1.loc[["1760_0", "1760_1", "1760_2", "1760_3"], ["1760_0", "1760_1", "1760_2", "1760_3"]])
    signal_names_1 = corr_matrix_1.columns.values

    
    return corr_matrix_1, signal_names_1


def from_series_to_correlation_matrix(signal_multivar_ts_1, aid_signal_tups_1, signal_multivar_ts_2, aid_signal_tups_2):
    
    # First dataframe
    # Convert matrix of time series into a dataframe
    df_1 = pd.DataFrame({f"{tup[0]}_{tup[1]}": signal_multivar_ts_1[:,index] for index, tup in enumerate(aid_signal_tups_1)})
    display(df_1)

    # Remove columns with constant values
    df_1 = df_1.loc[:, (df_1 != df_1.iloc[0]).any()] 
    # display(df_1)
    # df_1.dtypes
    
    df_1_scaled = normalize(df_1) # scale
    df_1_scaled = pd.DataFrame(df_1_scaled, columns=df_1.columns)
    df_1 = df_1_scaled
    display(df_1)

    # Compute correlation matrix
    corr_matrix_1 = df_1.corr(method="pearson")
    # display(corr_matrix_1)
    # display(corr_matrix_1.loc[["1760_0", "1760_1", "1760_2", "1760_3"], ["1760_0", "1760_1", "1760_2", "1760_3"]])
    signal_names_1 = corr_matrix_1.columns.values
    # print(len(signal_names_training), signal_names_training)
    
    ##################
    # Second dataframe
    # Convert matrix of time series into a dataframe
    df_2 = pd.DataFrame({f"{tup[0]}_{tup[1]}": signal_multivar_ts_2[:,index] for index, tup in enumerate(aid_signal_tups_2)})
    # display(df)

    # Remove columns with constant values
    df_2 = df_2.loc[:, (df_2 != df_2.iloc[0]).any()] 
    # display(df_2)
    # df_2.dtypes
    
    df_2_scaled = normalize(df_2) # scale
    df_2_scaled = pd.DataFrame(df_2_scaled, columns=df_2.columns)
    df_2 = df_2_scaled
    # display(df_2)

    # Compute correlation matrix
    corr_matrix_2 = df_2.corr(method="pearson")
    # display(corr_matrix_2)
    # display(corr_matrix_2.loc[["1760_0", "1760_1", "1760_2", "1760_3"], ["1760_0", "1760_1", "1760_2", "1760_3"]])
    signal_names_2 = corr_matrix_2.columns.values
    # print(len(signal_names_training), signal_names_training)
    
    signal_names_intersection = list(set(signal_names_1).intersection(set(signal_names_2)))
    
    return corr_matrix_1, corr_matrix_2, signal_names_intersection


def compute_hierarchical_clustering(corr_matrix_1, corr_matrix_2, signal_names_intersection, method):
    
    # Filter correlation matrices by common names
    corr_matrix_1 = corr_matrix_1.loc[signal_names_intersection, signal_names_intersection]
    # display(corr_matrix_1)

    corr_matrix_2 = corr_matrix_2.loc[signal_names_intersection, signal_names_intersection]
    # display(corr_matrix_2)
    
    linkage_matrix_1 = hierarchical_clustering(corr_matrix_1, method=method)
    linkage_matrix_2 = hierarchical_clustering(corr_matrix_2, method=method)
    
    return linkage_matrix_1, linkage_matrix_2


def compute_hierarchical_clustering_training(corr_matrix, method):
       
    linkage_matrix = hierarchical_clustering(corr_matrix, method=method)
    
    return linkage_matrix


def compute_element_centric_similarity(linkage_matrix_1, linkage_matrix_2, r=1.0):
    
    c_1 = Clustering().from_scipy_linkage(linkage_matrix_1, dist_rescaled=True)
    c_2 = Clustering().from_scipy_linkage(linkage_matrix_2, dist_rescaled=True)
    
    return sim.element_sim(c_1, c_2, r=r, alpha=0.9)


def compute_distribution_training(training_captures, ground_truth_dbc_path):
    
    pairs = []
    
    for row in tqdm(range(len(training_captures))):
    
        for column in range(row, len(training_captures)):

            signal_multivar_ts_1, timepts_1, aid_signal_tups_1, signal_multivar_ts_2, timepts_2, aid_signal_tups_2 = from_captures_to_time_series(training_captures[row], training_captures[column], ground_truth_dbc_path)
            corr_matrix_1, corr_matrix_2, signal_names_intersection = from_series_to_correlation_matrix(signal_multivar_ts_1, aid_signal_tups_1, signal_multivar_ts_2, aid_signal_tups_2)
            linkage_matrix_1, linkage_matrix_2 = compute_hierarchical_clustering(corr_matrix_1, corr_matrix_2, signal_names_intersection, method="ward")
            similarity = compute_element_centric_similarity(linkage_matrix_1, linkage_matrix_2, r=-5)
            pairs.append([(training_captures[row], training_captures[column]), similarity])

    print(pairs)

    with open(f"./similarity_matrix_training.json", "w") as f:
        json.dump(pairs, f)


def compute_distribution_mixed(training_captures, testing_captures, ground_truth_dbc_path):
    
    pairs = []
    
    for interest_capture in tqdm(training_captures):
        for capture in testing_captures:
        
            signal_multivar_ts_1, timepts_1, aid_signal_tups_1, signal_multivar_ts_2, timepts_2, aid_signal_tups_2 = from_captures_to_time_series(interest_capture, capture, ground_truth_dbc_path)
            corr_matrix_1, corr_matrix_2, signal_names_intersection = from_series_to_correlation_matrix(signal_multivar_ts_1, aid_signal_tups_1, signal_multivar_ts_2, aid_signal_tups_2)
            linkage_matrix_1, linkage_matrix_2 = compute_hierarchical_clustering(corr_matrix_1, corr_matrix_2, signal_names_intersection, method="ward")
            similarity = compute_element_centric_similarity(linkage_matrix_1, linkage_matrix_2, r=-5)
            pairs.append([(interest_capture, capture), similarity])
        
    print(pairs)

    with open(f"./similarity_matrix_mixed.json", "w") as f:
        json.dump(pairs, f)


def compute_distribution_testing(testing_captures, ground_truth_dbc_path):
    
    pairs = []
    
    for row in tqdm(range(len(testing_captures))):
    
        for column in range(row, len(testing_captures)):

            signal_multivar_ts_1, timepts_1, aid_signal_tups_1, signal_multivar_ts_2, timepts_2, aid_signal_tups_2 = from_captures_to_time_series(testing_captures[row], testing_captures[column], ground_truth_dbc_path)
            corr_matrix_1, corr_matrix_2, signal_names_intersection = from_series_to_correlation_matrix(signal_multivar_ts_1, aid_signal_tups_1, signal_multivar_ts_2, aid_signal_tups_2)
            linkage_matrix_1, linkage_matrix_2 = compute_hierarchical_clustering(corr_matrix_1, corr_matrix_2, signal_names_intersection, method="ward")
            similarity = compute_element_centric_similarity(linkage_matrix_1, linkage_matrix_2, r=-5)
            pairs.append([(testing_captures[row], testing_captures[column]), similarity])

    print(pairs)

    with open(f"./similarity_matrix_testing.json", "w") as f:
        json.dump(pairs, f)


def remove_constant_signals(signal_multivar_ts):
    return signal_multivar_ts[:, ~np.all(signal_multivar_ts[1:] == signal_multivar_ts[:-1], axis=0)]


def partition_time_series(signal_multivar_ts, window_length, offset):
    
    n = signal_multivar_ts.shape[0]
    i = 0
    partition = []
    
    while (i + window_length) < n:
        partition.append(signal_multivar_ts[i: i + window_length,:])
        i = i + offset
        
    if i != n:
        partition.append(signal_multivar_ts[i:n,:])
        
    return partition
    
    
def process_multivariate_signals(signal_multivar_ts, aid_signal_tups, window_length, offset):
    
    # First dataframe
    # Convert matrix of time series into a dataframe
    df = pd.DataFrame({f"{tup[0]}_{tup[1]}": signal_multivar_ts[:,index] for index, tup in enumerate(aid_signal_tups)})
    # display(df)

    # Remove columns with constant values
    df = df.loc[:, (df != df.iloc[0]).any()] 
    # display(df)
    
    # Stadarization
    df_standardized = (df-df.mean())/df.std()
    # display(df_standardized)
    
    # Partition of data frames
    n = df_standardized.shape[0]
    i = 0
    partition = []
    
    while (i + window_length) < n:
        partition.append(df_standardized.iloc[i:i + window_length, :])
        i = i + offset
        
    if i != n:
        partition.append(df_standardized.iloc[i:n, :])
        
    return partition


def process_multiple_multivariate_signals(signal_multivar_ts_1, aid_signal_tups_1, signal_multivar_ts_2, aid_signal_tups_2, window_length, offset):
    
    # First dataframe
    # Convert matrix of time series into a dataframe
    df_1 = pd.DataFrame({f"{tup[0]}_{tup[1]}": signal_multivar_ts_1[:,index] for index, tup in enumerate(aid_signal_tups_1)})
    # display(df)
    print(df_1.shape)

    # Remove columns with constant values
    df_1 = df_1.loc[:, (df_1 != df_1.iloc[0]).any()] 
    # display(df)
    
    # Stadarization
    df_1_standardized = (df_1-df_1.mean())/df_1.std()
    # display(df_2_standardized)
    
    # Partition of data frames
    n = df_1_standardized.shape[0]
    i = 0
    partition_1 = []
    
    while (i + window_length) < n:
        partition_1.append(df_1_standardized.iloc[i:i + window_length, :])
        i = i + offset
        
    if i != n:
        partition_1.append(df_1_standardized.iloc[i:n, :])
        
        
    # Second dataframe
    # Convert matrix of time series into a dataframe
    df_2 = pd.DataFrame({f"{tup[0]}_{tup[1]}": signal_multivar_ts_2[:,index] for index, tup in enumerate(aid_signal_tups_2)})
    # display(df)
    print(df_2.shape)

    # Remove columns with constant values
    df_2 = df_2.loc[:, (df_2 != df_2.iloc[0]).any()] 
    # display(df)
    
    # Stadarization
    df_2_standardized = (df_2-df_2.mean())/df_2.std()
    # display(df_2_standardized)
    
    # Partition of data frames
    n = df_2_standardized.shape[0]
    i = 0
    partition_2 = []
    
    while (i + window_length) < n:
        partition_2.append(df_2_standardized.iloc[i:i + window_length, :])
        i = i + offset
        
    if i != n:
        partition_2.append(df_2_standardized.iloc[i:n, :])
        
    return partition_1, partition_2


def upper(df):
    '''Returns the upper triangle of a correlation matrix (excluding diagonal).
    You can use scipy.spatial.distance.squareform to recreate matrix from upper triangle.
    Args:
      df: pandas or numpy correlation matrix
    Returns:
      list of values from upper triangle
    '''
    try:
        assert(type(df) == np.ndarray)
    except:
        if type(df) == pd.DataFrame:
            df = df.values
        else:
            raise TypeError('Must be np.ndarray or pd.DataFrame')
    mask = np.triu_indices(df.shape[0], k=1)
    
    return df[mask]



def randomized_test_permutations(m1, m2):
    """Nonparametric permutation testing Monte Carlo"""
    np.random.seed(0)
    rhos = []
    n_iter = 100
    true_rho, _ = spearmanr(upper(m1), upper(m2))
    # matrix permutation, shuffle the groups
    m_ids = list(m1.columns)
    m2_v = upper(m2)
    for iter in range(n_iter):
        np.random.shuffle(m_ids) # shuffle list 
        r, _ = spearmanr(upper(m1.loc[m_ids, m_ids]), m2_v)  
        rhos.append(r)
    perm_p = ((np.sum(np.abs(true_rho) <= np.abs(rhos)))+1)/(n_iter+1) # two-tailed test

    return perm_p


def compute_correlation_matrices(partition):
    
    corr_matrices = []

    for df in partition:

        # Remove columns with constant values
        df = df.loc[:, (df != df.iloc[0]).any()] 

        # Compute correlation matrix
        corr_matrices.append(df.corr(method="pearson"))
        
    return corr_matrices


def compute_similarity_from_correlation_matrices(corr_matrices):
    
    similarities = []
    
    for i in range(len(corr_matrices)-1):

        # print("raw: ", corr_matrices[i].shape, corr_matrices[i+1].shape)

        signal_names_1 = corr_matrices[i].columns.values
        signal_names_2 = corr_matrices[i+1].columns.values
        signal_names_intersection = list(set(signal_names_1).intersection(set(signal_names_2)))

        df_1 = corr_matrices[i].loc[signal_names_intersection, signal_names_intersection] 
        df_2 = corr_matrices[i+1].loc[signal_names_intersection, signal_names_intersection]
  
        # print("pro: ", df_1.shape, df_2.shape, "\n")

        similarities.append((df_1.shape[0], spearmanr(upper(df_1), upper(df_2))[0], spearmanr(upper(df_1), upper(df_2))[1]))
        
    return similarities


def compute_similarity_from_multiple_correlation_matrices(corr_matrices_1, corr_matrices_2):
    
    similarities = []
    
    if len(corr_matrices_1) <= len(corr_matrices_2):
        corr_matrices_reference = corr_matrices_1
    else:
        corr_matrices_reference = corr_matrices_2
        
    print(len(corr_matrices_reference))
            
    for i in range(len(corr_matrices_reference)):

        # print("raw: ", corr_matrices[i].shape, corr_matrices[i+1].shape)

        signal_names_1 = corr_matrices_1[i].columns.values
        signal_names_2 = corr_matrices_2[i].columns.values
        signal_names_intersection = list(set(signal_names_1).intersection(set(signal_names_2)))

        df_1 = corr_matrices_1[i].loc[signal_names_intersection, signal_names_intersection] 
        df_2 = corr_matrices_2[i].loc[signal_names_intersection, signal_names_intersection]
  
        # print("pro: ", df_1.shape, df_2.shape, "\n")

        # similarities.append((df_1.shape[0], spearmanr(upper(df_1), upper(df_2))[0], spearmanr(upper(df_1), upper(df_2))[1]))
        
        correlation = spearmanr(upper(df_1), upper(df_2))[0]
        p_value = spearmanr(upper(df_1), upper(df_2))[1]
        
        if p_value > 0.05:
            similarities.append((i, correlation, p_value))
        else:
            similarities.append(i)
            
        
    return similarities


def create_time_intervals(total_length, window, offset):
    
    # Partition of data frames
    i = 0
    intervals = []
    
    while (i + window) < total_length:
        intervals.append((i, i + window))
        i = i + offset
        
    if i != total_length:
        intervals.append((i , total_length))
        
    return intervals


    # # Partition of data frames
    # n = df_standardized.shape[0]
    # i = 0
    # partition = []
    
    # while (i + window_length) < n:
    #     partition.append(df_standardized.iloc[i:i + window_length, :])
    #     i = i + offset
        
    # if i != n:
    #     partition.append(df_standardized.iloc[i:n, :])
        
    # return partition
    

    # intervals = []
    # # offset = 0.1*offset
    
    # for i in np.arange(0, total_length - window + 1, offset, dtype=float):
    #     intervals.append((i, i + window))

    # if i + window < total_length:
    #     intervals.append((i + offset, total_length))

    # return intervals 


def compute_distance_matrix(corr_matrix):

    signal_names = np.array(corr_matrix.columns)

    # display(corr_matrix)

    # compute distance matrix
    # distance_matrix = np.sqrt(2*(1 - corr_matrix.to_numpy())) 
    distance_matrix = 2*(1 - corr_matrix.to_numpy())
    distance_matrix[distance_matrix < 0] = 0
    # display(distance_matrix.shape)
    # display(distance_matrix)

    return signal_names, distance_matrix

## Loading the Data

In [4]:
ground_truth_dbc_path = os.path.join(actt_path, "metadata", "dbcs", "heuristic_labeled", "anonymized_020822_030640.dbc")

# training_captures = [directory for directory in os.listdir("/home/cades/Projects/CAN/actt/data-cancaptures/") if ("road_ambient_dyno" in directory) or ("road_ambient_highway" in directory)]
training_captures = [directory for directory in os.listdir("/home/cloud/Projects/CAN/actt/data-cancaptures/") if ("road_ambient_dyno" in directory) or ("road_ambient_highway" in directory)]
print(len(training_captures), training_captures)  

testing_captures = ["correlated_masquerade_1_030804_082640", "correlated_masquerade_2_031128_011320", "correlated_masquerade_3_040322_190000", 
                    "road_attack_max_speedometer_attack_1_masquerade_060215_054000", "road_attack_max_speedometer_attack_2_masquerade_060611_002640", 
                    "road_attack_max_speedometer_attack_3_masquerade_061004_181320", "road_attack_max_engine_coolant_temp_attack_masquerade_041109_063320",
                    "road_attack_reverse_light_on_attack_1_masquerade_091205_030000", "road_attack_reverse_light_on_attack_2_masquerade_100330_214640", 
                    "road_attack_reverse_light_on_attack_3_masquerade_100724_153320", "road_attack_reverse_light_off_attack_1_masquerade_080110_162000", 
                    "road_attack_reverse_light_off_attack_2_masquerade_080505_110640", "road_attack_reverse_light_off_attack_3_masquerade_080829_045320"]

print(len(testing_captures), testing_captures) 

12 ['road_ambient_dyno_drive_basic_short_020822_030640', 'road_ambient_dyno_idle_radio_infotainment_030410_144000', 'road_ambient_dyno_drive_winter_030410_144000', 'road_ambient_highway_street_driving_diagnostics_031128_011320', 'road_ambient_dyno_drive_extended_short_021215_195320', 'road_ambient_highway_street_driving_long_050305_002000', 'road_ambient_dyno_drive_extended_long_040716_134640', 'road_ambient_dyno_drive_benign_anomaly_030804_082640', 'road_ambient_dyno_exercise_all_bits_030410_144000', 'road_ambient_dyno_reverse_040322_190000', 'road_ambient_dyno_drive_radio_infotainment_041109_063320', 'road_ambient_dyno_drive_basic_long_050305_002000']
13 ['correlated_masquerade_1_030804_082640', 'correlated_masquerade_2_031128_011320', 'correlated_masquerade_3_040322_190000', 'road_attack_max_speedometer_attack_1_masquerade_060215_054000', 'road_attack_max_speedometer_attack_2_masquerade_060611_002640', 'road_attack_max_speedometer_attack_3_masquerade_061004_181320', 'road_attack_max

## Obtain Metadata

In [5]:
# with open("/home/cades/Projects/CAN/actt/data/capture_metadata.json") as f:
with open("/home/cloud/Projects/CAN/actt/data/capture_metadata.json") as f:
    attack_metadata = json.load(f)
    
# pprint(testing_captures)
# pprint(attack_metadata)

attack_metadata_keys = ["correlated_signal_attack_1_masquerade", "correlated_signal_attack_2_masquerade", "correlated_signal_attack_3_masquerade", 
                        "max_speedometer_attack_1_masquerade", "max_speedometer_attack_2_masquerade", "max_speedometer_attack_3_masquerade",
                        "max_engine_coolant_temp_attack_masquerade", "reverse_light_on_attack_1_masquerade", "reverse_light_on_attack_2_masquerade",
                        "reverse_light_on_attack_3_masquerade", "reverse_light_off_attack_1_masquerade", "reverse_light_off_attack_2_masquerade",
                        "reverse_light_off_attack_3_masquerade"]

print(len(attack_metadata_keys))

13


## Experiments on a Single Capture

In [6]:
index = -1
print(training_captures[index])
signal_multivar_ts, timepts, aid_signal_tups = from_capture_to_time_series(training_captures[index], ground_truth_dbc_path) # training_captures[0], testing_captures[0]

road_ambient_dyno_drive_basic_long_050305_002000


In [7]:
signal_multivar_ts.shape

(12510, 337)

In [8]:
signal_multivar_ts

array([[  148,     1, 15026, ...,     0,     0,     0],
       [  148,     1, 15023, ...,     0,     0,     0],
       [  148,     1, 15023, ...,     0,     0,     0],
       ...,
       [  133,     1, 15011, ...,     1,     0,     0],
       [  133,     1, 15014, ...,     1,     0,     0],
       [  133,     1, 15014, ...,     1,     0,     0]])

In [9]:
timepts[-1]/60

20.848333333333336

In [10]:
np.diff(timepts)

array([0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1])

In [11]:
corr_matrix_training, signal_names_training = from_single_series_to_correlation_matrix(signal_multivar_ts, aid_signal_tups)

Unnamed: 0,14_0,14_1,14_2,51_0,51_1,51_2,51_3,51_4,51_5,51_6,...,1788_1,1788_2,1788_3,1788_4,1788_5,1788_6,1788_7,1788_8,1788_9,1788_10
0,148,1,15026,0,-6,0,3,2,0,125,...,0,0,15026,1,1,1771,0,0,0,0
1,148,1,15023,0,-15,0,3,11,0,125,...,0,0,15026,1,1,1771,0,0,0,0
2,148,1,15023,0,-9,0,3,5,0,125,...,0,0,15025,1,1,1771,0,0,0,0
3,148,1,15026,0,-19,0,3,15,0,125,...,0,0,15026,1,1,1771,0,0,0,0
4,148,1,15023,0,-13,0,3,9,0,125,...,0,0,15026,1,1,1771,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12505,133,1,15008,0,-15,0,3,11,0,125,...,0,0,15011,1,1,1771,0,1,0,0
12506,133,1,15011,0,-9,0,3,5,0,125,...,0,0,15012,1,1,1771,0,1,0,0
12507,133,1,15011,0,-19,0,3,15,0,125,...,0,0,15009,1,1,1771,0,1,0,0
12508,133,1,15014,0,-13,0,3,9,0,125,...,0,0,15009,1,1,1771,0,1,0,0


Unnamed: 0,14_0,14_2,51_0,51_1,51_2,51_4,51_5,51_6,60_0,60_1,...,1694_3,1694_4,1760_0,1760_1,1760_2,1760_3,1788_3,1788_6,1788_8,1788_9
0,0.001849,0.187750,0.0,-0.000075,0.0,0.000025,0.0,0.001562,0.000012,0.000600,...,0.0,0.002124,0.0,0.0,0.0,0.0,0.187750,0.022129,0.000000,0.0
1,0.001847,0.187468,0.0,-0.000187,0.0,0.000137,0.0,0.001560,0.000012,0.000599,...,0.0,0.002121,0.0,0.0,0.0,0.0,0.187506,0.022100,0.000000,0.0
2,0.001857,0.188476,0.0,-0.000113,0.0,0.000063,0.0,0.001568,0.000013,0.000602,...,0.0,0.002133,0.0,0.0,0.0,0.0,0.188501,0.022219,0.000000,0.0
3,0.001856,0.188474,0.0,-0.000238,0.0,0.000188,0.0,0.001568,0.000013,0.000590,...,0.0,0.002120,0.0,0.0,0.0,0.0,0.188474,0.022214,0.000000,0.0
4,0.001856,0.188353,0.0,-0.000163,0.0,0.000113,0.0,0.001567,0.000013,0.000589,...,0.0,0.002119,0.0,0.0,0.0,0.0,0.188391,0.022204,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12505,0.001663,0.187700,0.0,-0.000188,0.0,0.000138,0.0,0.001563,0.000013,0.000588,...,0.0,0.002114,0.0,0.0,0.0,0.0,0.187737,0.022149,0.000013,0.0
12506,0.001661,0.187516,0.0,-0.000112,0.0,0.000062,0.0,0.001561,0.000012,0.000575,...,0.0,0.002111,0.0,0.0,0.0,0.0,0.187529,0.022123,0.000012,0.0
12507,0.001659,0.187252,0.0,-0.000237,0.0,0.000187,0.0,0.001559,0.000012,0.000586,...,0.0,0.002108,0.0,0.0,0.0,0.0,0.187227,0.022092,0.000012,0.0
12508,0.001656,0.186977,0.0,-0.000162,0.0,0.000112,0.0,0.001557,0.000012,0.000585,...,0.0,0.002105,0.0,0.0,0.0,0.0,0.186914,0.022055,0.000012,0.0


In [12]:
corr_matrix_training

Unnamed: 0,14_0,14_2,51_0,51_1,51_2,51_4,51_5,51_6,60_0,60_1,...,1694_3,1694_4,1760_0,1760_1,1760_2,1760_3,1788_3,1788_6,1788_8,1788_9
14_0,1.000000,-0.440874,-0.148724,-0.192036,-0.151097,0.016395,-0.627870,0.481696,0.245136,-0.248611,...,0.001980,0.453064,-0.384668,-0.384575,-0.386809,-0.386920,-0.439385,-0.367238,0.015436,0.027146
14_2,-0.440874,1.000000,-0.571431,0.030519,-0.568443,0.012463,0.132011,0.417016,0.211454,-0.167717,...,-0.015260,0.458294,-0.473274,-0.473456,-0.471735,-0.471659,0.997990,-0.390327,0.020665,-0.222581
51_0,-0.148724,-0.571431,1.000000,0.039721,0.999424,-0.021295,0.588958,-0.692876,-0.343280,0.312333,...,0.013816,-0.725942,0.600956,0.600997,0.602893,0.603037,-0.570774,0.521998,-0.054409,0.198841
51_1,-0.192036,0.030519,0.039721,1.000000,0.040216,-0.040989,0.208830,-0.156318,-0.061471,0.214957,...,-0.008567,-0.142070,0.195426,0.195416,0.195776,0.195778,0.030521,0.193163,-0.008535,0.014039
51_2,-0.151097,-0.568443,0.999424,0.040216,1.000000,-0.021033,0.591678,-0.692763,-0.343230,0.311783,...,0.012048,-0.726122,0.600124,0.600164,0.602076,0.602222,-0.568154,0.521322,-0.054352,0.198845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1760_3,-0.386920,-0.471659,0.603037,0.195778,0.602222,-0.027623,0.564311,-0.912901,-0.459226,0.638263,...,0.013869,-0.883371,0.999976,0.999976,0.999998,1.000000,-0.471525,0.938022,-0.135716,0.239066
1788_3,-0.439385,0.997990,-0.570774,0.030521,-0.568154,0.012532,0.129821,0.416754,0.210893,-0.167603,...,-0.015390,0.458315,-0.473115,-0.473295,-0.471603,-0.471525,1.000000,-0.390348,0.020806,-0.222540
1788_6,-0.367238,-0.390327,0.521998,0.193163,0.521322,-0.023677,0.550801,-0.790726,-0.389847,0.671403,...,0.000662,-0.748108,0.938121,0.938101,0.938022,0.938022,-0.390348,1.000000,-0.129478,0.216887
1788_8,0.015436,0.020665,-0.054409,-0.008535,-0.054352,0.003020,-0.098144,0.088930,0.048265,-0.235469,...,-0.002217,0.074726,-0.135790,-0.135749,-0.135729,-0.135716,0.020806,-0.129478,1.000000,-0.017181


## Compute Linkage Matrix

In [13]:
linkage_matrix_training = compute_hierarchical_clustering_training(corr_matrix_training, method="ward")

## Stream a Single Attack

In [19]:
window = 10
offset = 1

print("Processing: ", attack_metadata_keys[0])
signal_multivar_ts, timepts, aid_signal_tups = from_capture_to_time_series(testing_captures[0], ground_truth_dbc_path)
# print(signal_multivar_ts.shape)

partition_testing = process_multivariate_signals(signal_multivar_ts, aid_signal_tups, window, offset) # Partition time series
# print(len(partition_testing), partition_testing[0])

print("intervals: ", len(partition_testing))

# display(partition_testing[0])
# display(partition_testing[1])
# display(partition_testing[-1])

corr_matrices_testing = compute_correlation_matrices(partition_testing) # Compute correlation matrices
# display(corr_matrices_testing[0])

# total_length = int(np.ceil(timepts[-1]))  
total_length = timepts[-1] 
print("total length (s): ", total_length)
intervals_testing = create_time_intervals(total_length, window/10, offset/10)
#print(len(intervals_testing), intervals_testing)
print("attack interval (s): ", attack_metadata[attack_metadata_keys[0]]["injection_interval"][0], attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])

tp, fp, fn, tn = 0, 0, 0, 0

for index_interval in tqdm(range(len(intervals_testing))):

    # print("Interval: ", intervals_testing[index_interval])

    # print(np.isnan(corr_matrices_testing[index_interval]).any().any())
    # print((corr_matrices_testing[index_interval] < 0).any().any())

    signal_names_testing = corr_matrices_testing[index_interval].columns.values
    # print(type(signal_names_testing), signal_names_testing)

    signal_names_intersection = list(set(signal_names_training).intersection(set(signal_names_testing)))

    linkage_matrix_training, linkage_matrix_testing = compute_hierarchical_clustering(corr_matrix_training, corr_matrices_testing[index_interval], signal_names_intersection, "ward")

    similarity = compute_element_centric_similarity(linkage_matrix_training, linkage_matrix_testing, r=-5)

    # print("similarity: ", similarity)

    if similarity <= 0.9: # positive detection
        if ((intervals_testing[index_interval][1] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][0] and intervals_testing[index_interval][0] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][0])
               or (intervals_testing[index_interval][0] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][0] and intervals_testing[index_interval][1] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])
                   or (intervals_testing[index_interval][0] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][1] and intervals_testing[index_interval][1] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])):
            tp += 1
        else:
            fp += 1
    else: # negative detection
        if ((intervals_testing[index_interval][1] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][0] and intervals_testing[index_interval][0] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][0])
               or (intervals_testing[index_interval][0] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][0] and intervals_testing[index_interval][1] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])
                   or (intervals_testing[index_interval][0] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][1] and intervals_testing[index_interval][1] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])):
            fn += 1
        else:
            tn += 1
            
# precision
if tp + fp != 0:            
    precision = tp/(tp + fp)
else:
    precision = np.nan

# recall
if tp + fn != 0:
    recall = tp/(tp + fn)
else:
    recall = np.nan

# f1
if precision + recall != 0:
    f1 = 2*((precision*recall)/(precision + recall))

else:
    f1 = np.nan

# fpr
if fp + tn != 0:
    fpr = fp/(fp + tn)
else:
    fpr = np.nan

# fnr
if fn + tp != 0:
    fnr = fn/(fn + tp)
else:
    fnr = np.nan

# mcc
if (tp+fp == 0) or (tp+fn == 0) or (tn+fp == 0) or (tn+fn == 0):
    mcc = (tp*tn) - (fp*fn)
else:
    mcc = (tp*tn - fp*fn)/(math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))

print(f"tp: {tp}, tn: {tn}, fp: {fp}, fn: {fn}")
print(f"precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}, fpr: {fpr:.3f}, fnr: {fnr:.3f}, mcc: {mcc:.3f}")
print(f"positive_intervals: {tp+fn:.3f}, negative_intervals: {tn+fp:.3f}\n")

Processing:  correlated_signal_attack_1_masquerade
intervals:  331
total length (s):  33.9
attack interval (s):  9.191851 30.050109


100%|██████████| 330/330 [01:03<00:00,  5.16it/s]

tp: 208, tn: 0, fp: 111, fn: 11
precision: 0.652, recall: 0.950, f1: 0.773, fpr: 1.000, fnr: 0.050, mcc: -0.132
positive_intervals: 219.000, negative_intervals: 111.000






## Stream All Attacks

In [21]:
window = 10
offset = 1

for index_attack in range(len(attack_metadata_keys)):

    print("Processing: ", attack_metadata_keys[index_attack])
    signal_multivar_ts, timepts, aid_signal_tups = from_capture_to_time_series(testing_captures[index_attack], ground_truth_dbc_path)
    
    partition_testing = process_multivariate_signals(signal_multivar_ts, aid_signal_tups, window, offset) # Partition time series

    print("intervals: ", len(partition_testing))

    # display(partition_testing[0])
    # display(partition_testing[1])
    # display(partition_testing[-1])

    corr_matrices_testing = compute_correlation_matrices(partition_testing) # Compute correlation matrices
    # display(corr_matrices_testing[0])

    # total_length = int(np.ceil(timepts[-1])) 
    total_length = timepts[-1]
    print("total length (s): ", total_length) 
    intervals_testing = create_time_intervals(total_length, window/10, offset/10)
    # print(len(intervals_testing), intervals_testing)
    print("attack interval (s): ", attack_metadata[attack_metadata_keys[index_attack]]["injection_interval"][0], attack_metadata[attack_metadata_keys[index_attack]]["injection_interval"][1])

    tp, fp, fn, tn = 0, 0, 0, 0

    for index_interval in range(len(intervals_testing)):

        # print("Interval: ", intervals_testing[index_interval])

        # print(np.isnan(corr_matrices_testing[index_interval]).any().any())
        # print((corr_matrices_testing[index_interval] < 0).any().any())

        # print("Interval: ", intervals_testing[index_interval])

        # print(np.isnan(corr_matrices_testing[index_interval]).any().any())
        # print((corr_matrices_testing[index_interval] < 0).any().any())

        signal_names_testing = corr_matrices_testing[index_interval].columns.values
        # print(type(signal_names_testing), signal_names_testing)

        signal_names_intersection = list(set(signal_names_training).intersection(set(signal_names_testing)))

        linkage_matrix_training, linkage_matrix_testing = compute_hierarchical_clustering(corr_matrix_training, corr_matrices_testing[index_interval], signal_names_intersection, "ward")

        similarity = compute_element_centric_similarity(linkage_matrix_training, linkage_matrix_testing, r=-5)

        # print("similarity: ", similarity)

        if similarity <= 0.9: # positive detection
            if ((intervals_testing[index_interval][1] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][0] and intervals_testing[index_interval][0] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][0])
                or (intervals_testing[index_interval][0] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][0] and intervals_testing[index_interval][1] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])
                    or (intervals_testing[index_interval][0] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][1] and intervals_testing[index_interval][1] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])):
                tp += 1
            else:
                fp += 1
        else: # negative detection
            if ((intervals_testing[index_interval][1] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][0] and intervals_testing[index_interval][0] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][0])
                or (intervals_testing[index_interval][0] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][0] and intervals_testing[index_interval][1] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])
                    or (intervals_testing[index_interval][0] < attack_metadata[attack_metadata_keys[0]]["injection_interval"][1] and intervals_testing[index_interval][1] > attack_metadata[attack_metadata_keys[0]]["injection_interval"][1])):
                fn += 1
            else:
                tn += 1
                
    # precision
    if tp + fp != 0:            
        precision = tp/(tp + fp)
    else:
        precision = np.nan

    # recall
    if tp + fn != 0:
        recall = tp/(tp + fn)
    else:
        recall = np.nan

    # f1
    if precision + recall != 0:
        f1 = 2*((precision*recall)/(precision + recall))

    else:
        f1 = np.nan

    # fpr
    if fp + tn != 0:
        fpr = fp/(fp + tn)
    else:
        fpr = np.nan

    # fnr
    if fn + tp != 0:
        fnr = fn/(fn + tp)
    else:
        fnr = np.nan

    # mcc
    if (tp+fp == 0) or (tp+fn == 0) or (tn+fp == 0) or (tn+fn == 0):
        mcc = (tp*tn) - (fp*fn)
    else:
        mcc = (tp*tn - fp*fn)/(math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))

    print(f"tp: {tp}, tn: {tn}, fp: {fp}, fn: {fn}")
    print(f"precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}, fpr: {fpr:.3f}, fnr: {fnr:.3f}, mcc: {mcc:.3f}")
    print(f"positive_intervals: {tp+fn:.3f}, negative_intervals: {tn+fp:.3f}\n")

Processing:  correlated_signal_attack_1_masquerade
intervals:  331
total length (s):  33.9
attack interval (s):  9.191851 30.050109
tp: 208, tn: 0, fp: 111, fn: 11
precision: 0.652, recall: 0.950, f1: 0.773, fpr: 1.000, fnr: 0.050, mcc: -0.132
positive_intervals: 219.000, negative_intervals: 111.000

Processing:  correlated_signal_attack_2_masquerade
intervals:  281
total length (s):  28.9
attack interval (s):  6.830477 28.225908
tp: 197, tn: 4, fp: 78, fn: 1
precision: 0.716, recall: 0.995, f1: 0.833, fpr: 0.951, fnr: 0.005, mcc: 0.150
positive_intervals: 198.000, negative_intervals: 82.000

Processing:  correlated_signal_attack_3_masquerade
intervals:  161
total length (s):  16.9
attack interval (s):  4.318482 16.95706
tp: 79, tn: 0, fp: 82, fn: 0
precision: 0.491, recall: 1.000, f1: 0.658, fpr: 1.000, fnr: 0.000, mcc: 0.000
positive_intervals: 79.000, negative_intervals: 82.000

Processing:  max_speedometer_attack_1_masquerade
intervals:  881
total length (s):  88.9
attack interval 