In [51]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, MultiHeadAttention, LayerNormalization, RepeatVector, LeakyReLU, Flatten, TimeDistributed, Add, Conv1D, Concatenate, Lambda
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras import layers
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint, ReduceLROnPlateau
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import time
from datetime import datetime
import joblib

In [2]:
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


2023-08-04 17:14:20.128756: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-08-04 17:14:20.130760: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-08-04 17:14:22.060639: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:86:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-08-04 17:14:22.062011: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:d8:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-08-04 17:14:22.062051: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-

In [3]:
data_dir = '../car_hacking_data/'
os.listdir(data_dir)

['Fuzzy_dataset.csv',
 'normal_run_data.txt',
 'gear_dataset.csv',
 '.DS_Store',
 'RPM_dataset.csv',
 'DoS_dataset.csv']

In [4]:
benign_data_path = os.path.join(data_dir, "normal_run_data.txt")
dos_data_path = os.path.join(data_dir, 'DoS_dataset.csv')

In [5]:
def hex_to_dec(x):
    """Return the integer value of the hexadecimal string ``x`` (e.g. ``'ff'`` -> 255)."""
    return int(x, 16)

## CAN frames in this data carry a varying number of payload bytes (DLC of 2, 5, 6 or 8).
## To keep every row the same width, payloads with DLC < 8 are padded with '00' bytes.

def shift_columns(df, max_dlc=8):
    """Right-align the payload of every frame whose DLC is below ``max_dlc``.

    For each DLC value ``d`` in ``0 .. max_dlc - 1`` that occurs in ``df['dlc']``,
    all columns from the fourth one onward are shifted ``max_dlc - d`` positions
    to the right for the matching rows, and the vacated leading cells are filled
    with the string ``'00'``.

    With the column layout used by ``read_attack_data`` (``data0`` .. ``data7``
    followed by ``flag``) this moves the value sitting directly after a short
    payload into the ``flag`` column.
    NOTE(review): this presumes short rows in the CSV store the flag right
    after the last payload byte -- confirm against the raw file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dlc`` column; the first three columns are left untouched.
    max_dlc : int, default 8
        Full payload length. Rows with this DLC (or more) are not modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    payload_cols = df.columns[3:]

    # Handle every short DLC actually present instead of a hard-coded list,
    # so frames with e.g. DLC 3 or 4 are aligned as well.
    for dlc in range(max_dlc):
        short_rows = df['dlc'] == dlc
        if not short_rows.any():
            continue

        df.loc[short_rows, payload_cols] = (
            df.loc[short_rows, payload_cols]
            .shift(periods=max_dlc - dlc, axis='columns', fill_value='00')
        )

    return df

def pad_with_zeros(string, desired_length=16):
    """Left-pad ``string`` with ``'0'`` characters up to ``desired_length``.

    A string that is already at least ``desired_length`` characters long is
    returned unchanged.
    """
    needs_padding = len(string) < desired_length
    return string.zfill(desired_length) if needs_padding else string
    
def split_string_into_list(string):
    """Split ``string`` into consecutive two-character chunks.

    The chunks are returned in order; if the length is odd, the final chunk
    holds the single remaining character. An empty string gives an empty list.
    """
    return [string[pos:pos + 2] for pos in range(0, len(string), 2)]

In [6]:
def read_attack_data(data_path):
    """Load an attack CSV and return it as a numeric, time-ordered DataFrame.

    Processing steps:

    1. Read the header-less CSV with fixed column names.
    2. Right-align payloads shorter than 8 bytes with ``shift_columns``.
    3. Replace every remaining NaN with the string ``'00'``.
    4. Convert ``can_id`` and ``data0`` .. ``data7`` from hexadecimal strings
       to integers.
    5. Sort by ``timestamp``, add ``IAT`` (difference between consecutive
       timestamps, 0 for the first row) and drop ``timestamp``.

    Parameters
    ----------
    data_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``can_id``, ``dlc``, ``data0`` .. ``data7``, ``flag``, ``IAT``.
    """
    data_cols = ['data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']
    columns = ['timestamp', 'can_id', 'dlc'] + data_cols + ['flag']

    # Read the hexadecimal fields as text: if pandas inferred a numeric dtype
    # for a column that happens to contain only digits, leading zeros would be
    # lost and NaN-bearing columns would turn into floats such as '10.0',
    # which cannot be parsed as hexadecimal afterwards.
    hex_dtypes = {col: str for col in ['can_id'] + data_cols}
    data = pd.read_csv(data_path, names=columns, dtype=hex_dtypes)

    data = shift_columns(data)

    # Replace all remaining NaNs with '00'.
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    data = data.replace(np.nan, '00')

    # Hexadecimal -> decimal for the identifier
    data['can_id'] = data['can_id'].apply(hex_to_dec)
    data[data_cols] = data[data_cols].astype(str)

    # Inter-arrival time must be computed on time-ordered rows
    data = data.sort_values(by=['timestamp'])
    data = (
        data
        .assign(IAT=data['timestamp'].diff().fillna(0))
        .drop(columns=['timestamp'])
    )

    # Hexadecimal -> decimal for the payload bytes. A column-wise Series.map
    # is used instead of DataFrame.applymap, which is deprecated in recent
    # pandas releases; Series.map is available in old and new versions alike.
    data[data_cols] = data[data_cols].apply(lambda col: col.map(hex_to_dec))

    return data

In [7]:
data_cols = [f'data{i}' for i in range(8)]

# Per-frame fields collected while scanning the benign capture
timestamps, ids, dlcs, data = [], [], [], []

# Parse the benign log line by line
with open(benign_data_path, 'r') as file:
    for line in file:
        line = line.strip()

        # Each field is the first space-separated token that follows its label
        ts = line.split('Timestamp: ')[1].split(' ')[0]
        can_id = line.split('ID: ')[1].split(' ')[0]

        # Everything after 'DLC: ' is the DLC followed by the payload bytes
        dlc_and_payload = line.split('DLC: ')[1].split(' ')
        dlc = dlc_and_payload[0]
        can_data = pad_with_zeros(''.join(dlc_and_payload[1:]))
        data_split = split_string_into_list(can_data)

        # Store numeric values: hexadecimal entries are converted to decimal
        timestamps.append(float(ts))
        ids.append(hex_to_dec(can_id))
        dlcs.append(int(dlc))
        data.append(list(map(hex_to_dec, data_split)))

benign = pd.DataFrame({'timestamp': timestamps, 'can_id': ids, 'dlc': dlcs})
data = pd.DataFrame(data, columns=data_cols)

# Header fields and payload bytes side by side, ordered by time
benign_data = pd.concat([benign, data], axis=1).sort_values(by=['timestamp'])

# IAT = difference between consecutive timestamps (0 for the first frame);
# the absolute timestamp is dropped afterwards
benign_data = (
    benign_data
    .assign(IAT=benign_data['timestamp'].diff().fillna(0))
    .drop(columns=['timestamp'])
)

In [8]:
benign_data.head()

Unnamed: 0,can_id,dlc,data0,data1,data2,data3,data4,data5,data6,data7,IAT
0,848,8,5,40,132,102,109,0,0,162,0.0
1,704,8,20,0,0,0,0,0,0,0,0.000221
2,1072,8,0,0,0,0,0,0,0,0,0.000554
3,1201,8,0,0,0,0,0,0,0,0,0.000238
4,497,8,0,0,0,0,0,0,0,0,0.000248


In [9]:
dos_data = read_attack_data(dos_data_path)

In [10]:
dos_data.head()

Unnamed: 0,can_id,dlc,data0,data1,data2,data3,data4,data5,data6,data7,flag,IAT
0,790,8,5,33,104,9,33,33,0,111,R,0.0
1,399,8,254,91,0,0,0,60,0,0,R,0.000209
2,608,8,25,33,34,48,8,142,109,58,R,0.000228
3,672,8,100,0,154,29,151,2,189,0,R,0.000232
4,809,8,64,187,127,20,17,32,0,20,R,0.000237


In [11]:
# Features: every column except the label column
X_test = dos_data.drop(columns=['flag'])
# Labels: 'R' is encoded as 0 and 'T' as 1
# (presumably regular vs. injected frames -- confirm against the dataset docs)
y_test = dos_data['flag'].replace({'R': 0, 'T': 1})

In [12]:
np.unique(y_test, return_counts = True)

(array([0, 1]), array([3078250,  587521]))

In [13]:
# NOTE: joblib.load unpickles the file, so only load scalers from a trusted source.
scaler = joblib.load('first_run/scaler.sav')

# Benign traffic: every sample gets label 0
X_train = scaler.transform(benign_data.values)
y_train = np.zeros(X_train.shape[0])

# Build the test arrays directly from `dos_data` rather than from the previous
# values of `X_test` / `y_test`. Rebinding those names from pandas objects to
# ndarrays via `.values` made this cell fail with AttributeError when re-run.
X_test = scaler.transform(dos_data.drop(columns=['flag']).values)
y_test = dos_data['flag'].replace({'R': 0, 'T': 1}).values

In [14]:
def sequencify(X, target, start, end, window):
    """Cut ``X`` into overlapping windows and derive one label per window.

    Window ``k`` covers rows ``start + k`` .. ``start + k + window - 1`` of
    ``X``; consecutive windows are one row apart. A window is labelled 1 when
    at least one of its rows has ``target == 1`` and 0 otherwise.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis.
    target : array-like
        One label per row of ``X``.
    start : int
        Index of the first row of the first window.
    end : int or None
        Exclusive end index of the last window; ``None`` means ``len(X)``.
    window : int
        Number of consecutive rows per window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X_seq`` with shape ``(n_windows, window, *X.shape[1:])`` and the
        integer label vector ``y_seq`` with shape ``(n_windows,)``. When no
        window fits, both arrays have ``n_windows == 0``.
    """
    X = np.asarray(X)
    target = np.asarray(target)

    if end is None:
        end = len(X)

    # Exclusive end index of every window
    stops = np.arange(start + window, end + 1)

    # Row k holds the `window` consecutive row indices that make up window k.
    # Gathering with this index matrix replaces the per-window Python loop,
    # which is very slow on inputs with millions of rows.
    indices = stops[:, None] - window + np.arange(window)

    X_seq = X[indices]
    # 1 if at least one row in the window is labelled 1, else 0
    y_seq = (target[indices] == 1).any(axis=1).astype(int)

    return X_seq, y_seq

In [15]:
# Number of consecutive frames in each window
seq_size = 10

# Window the training and the test data with identical settings
X_train_seq, y_train_seq = sequencify(
    X_train, y_train, start=0, end=None, window=seq_size
)
X_test_seq, y_test_seq = sequencify(
    X_test, y_test, start=0, end=None, window=seq_size
)

In [16]:
# Distribution strategy under whose scope the model is created
strat = tf.distribute.MirroredStrategy()

# Load the saved model (presumably the trained autoencoder, given the file
# name and its later use for reconstructions) inside the strategy scope
with strat.scope():
    model = load_model('first_run/ae.h5')

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


2023-08-04 17:16:04.803388: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-04 17:16:04.877406: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-08-04 17:16:05.253655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:86:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-08-04 17:16:05.255016: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:d8:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz 

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


In [17]:
train_recon = model.predict(X_train_seq)

2023-08-04 17:16:29.409840: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:656] In AUTO-mode, and switching to DATA-based sharding, instead of FILE-based sharding as we cannot find appropriate reader dataset op(s) to shard. Error: Did not find a shardable source, walked to a node which is not a dataset: name: "FlatMapDataset/_9"
op: "FlatMapDataset"
input: "PrefetchDataset/_8"
attr {
  key: "Targuments"
  value {
    list {
    }
  }
}
attr {
  key: "f"
  value {
    func {
      name: "__inference_Dataset_flat_map_slice_batch_indices_10859"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
        dim {
          size: -1
        }
      }
    }
  }
}
attr {
  key: "output_types"
  value {
    list {
      type: DT_INT64
    }
  }
}
. Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.au

In [18]:
train_recon.shape

(988862, 10, 11)

In [19]:
X_train_seq.shape

(988862, 10, 11)

In [20]:
def calculate_mse(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    The squared element-wise differences are averaged over all elements, so
    the result is a single scalar whatever the shape of the inputs.
    """
    squared_error = np.square(y_true - y_pred)
    return np.mean(squared_error)

In [21]:
train_loss = calculate_mse(X_train_seq, train_recon)

In [22]:
train_loss

0.9252546720020174

In [23]:
test_recon = model.predict(X_test_seq)

2023-08-04 17:20:18.025211: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:656] In AUTO-mode, and switching to DATA-based sharding, instead of FILE-based sharding as we cannot find appropriate reader dataset op(s) to shard. Error: Did not find a shardable source, walked to a node which is not a dataset: name: "FlatMapDataset/_9"
op: "FlatMapDataset"
input: "PrefetchDataset/_8"
attr {
  key: "Targuments"
  value {
    list {
    }
  }
}
attr {
  key: "f"
  value {
    func {
      name: "__inference_Dataset_flat_map_slice_batch_indices_44163"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
        dim {
          size: -1
        }
      }
    }
  }
}
attr {
  key: "output_types"
  value {
    list {
      type: DT_INT64
    }
  }
}
. Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.au

In [66]:
# Candidate thresholds: 50%, 55%, ..., 100% of the overall training reconstruction loss
thresh_percentages = list(range(50, 105, 5))
thresh_vals = [(pct / 100) * train_loss for pct in thresh_percentages]

In [60]:
def classify_samples(sequencified_array, reconstructions, model, threshold, batch_size=65536):
    """Label each sample by comparing its reconstruction error to a threshold.

    The error of a sample is the mean of the squared element-wise differences
    between the sample and its reconstruction. A sample is labelled 1 when
    that error is strictly greater than ``threshold`` and 0 otherwise.

    Parameters
    ----------
    sequencified_array : array-like
        Original samples, indexed along the first axis.
    reconstructions : array-like
        Reconstructions of the samples; one label is produced per entry.
    model : object
        Not used by this function; kept so existing calls keep working.
    threshold : float
        Error above which a sample is labelled 1.
    batch_size : int, default 65536
        Number of samples whose errors are computed at once. Must be positive.
        Bounds the size of the temporary arrays on very large inputs.

    Returns
    -------
    numpy.ndarray
        Integer vector of 0/1 labels, one per reconstruction.
    """
    sequencified_array = np.asarray(sequencified_array)
    reconstructions = np.asarray(reconstructions)

    num_samples = len(reconstructions)
    losses = np.empty(num_samples, dtype=float)

    # Vectorised per-sample MSE, computed batch by batch. This replaces a
    # Python-level loop over every sample, which is very slow for millions
    # of windows.
    for lo in range(0, num_samples, batch_size):
        hi = min(lo + batch_size, num_samples)
        diff = sequencified_array[lo:hi] - reconstructions[lo:hi]
        # Flatten each sample so the mean runs over all of its elements
        losses[lo:hi] = np.mean(np.square(diff).reshape(hi - lo, -1), axis=1)

    return (losses > threshold).astype(int)

In [68]:
# Evaluate the detector once per candidate threshold
for pct, threshold in zip(thresh_percentages, thresh_vals):
    y_pred = classify_samples(X_test_seq, test_recon, model, threshold)

    print("Threshold:", pct)
    print("Classification Report:")
    print(classification_report(y_test_seq, y_pred))
    print("---------x-------------")

Threshold: 50
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00   2585834
           1       0.27      0.89      0.41   1079928

    accuracy                           0.26   3665762
   macro avg       0.14      0.44      0.21   3665762
weighted avg       0.08      0.26      0.12   3665762

---------x-------------
Threshold: 55
Classification Report:
              precision    recall  f1-score   support

           0       0.02      0.00      0.00   2585834
           1       0.25      0.81      0.39   1079928

    accuracy                           0.24   3665762
   macro avg       0.14      0.41      0.19   3665762
weighted avg       0.09      0.24      0.12   3665762

---------x-------------
Threshold: 60
Classification Report:
              precision    recall  f1-score   support

           0       0.09      0.01      0.02   2585834
           1       0.24      0.73      0.36   1079928

    accuracy           