In [12]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, MultiHeadAttention, LayerNormalization, RepeatVector, LeakyReLU, Flatten, TimeDistributed, Add, Conv1D, Concatenate, Lambda, GRU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint, ReduceLROnPlateau
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import time
from datetime import datetime
import joblib

# Sanity check: list the GPU devices TensorFlow can see before any model is built.
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [2]:
# Relative path to the directory holding the car-hacking capture files.
data_dir = '../car_hacking_data/'
# Display the directory contents to confirm which dataset files are available.
os.listdir(data_dir)

['Fuzzy_dataset.csv',
 'normal_run_data.txt',
 'gear_dataset.csv',
 '.DS_Store',
 'RPM_dataset.csv',
 'DoS_dataset.csv']

In [3]:
# Path to the "normal run" capture; it is parsed below to build the training/validation data.
benign_data_path = os.path.join(data_dir, "normal_run_data.txt")

def hex_to_dec(x):
    """Return the integer value of the hexadecimal string ``x`` (e.g. ``'ff'`` -> 255)."""
    return int(x, 16)

## CAN frames in this dataset come with varying DLCs (2, 5, 6 or 8 data bytes).
## To keep every row the same width, payloads with DLC < 8 are left-padded with '00' bytes.

def shift_columns(df, max_dlc=8):
    """Right-align short CAN payloads so every row fills all payload columns.

    For rows whose ``dlc`` is smaller than ``max_dlc``, the values in every
    column from the fourth one onward (in ``read_attack_data`` these are
    ``data0`` .. ``data7`` followed by ``flag``) are shifted right by
    ``max_dlc - dlc`` positions and the vacated leading cells are filled with
    the string ``'00'``.

    Every DLC in ``0 .. max_dlc - 1`` that occurs in the frame is handled, not
    only the values 2, 5 and 6.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dlc`` column; it is modified in place.
    max_dlc : int, default 8
        Number of payload columns a full-length frame occupies.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after shifting.
    """
    # Everything after the first three columns takes part in the shift.
    payload_cols = df.columns[3:]

    for dlc in range(max_dlc):
        short_rows = df['dlc'] == dlc
        if not short_rows.any():
            # No frame with this DLC: nothing to realign.
            continue

        df.loc[short_rows, payload_cols] = df.loc[short_rows, payload_cols].shift(
            periods=max_dlc - dlc, axis='columns', fill_value='00')

    return df

def pad_with_zeros(string, desired_length=16):
    """Left-pad ``string`` with ``'0'`` characters up to ``desired_length``.

    A string that is already ``desired_length`` characters or longer is
    returned unchanged.
    """
    missing = desired_length - len(string)
    return string if missing <= 0 else string.zfill(desired_length)
    
def split_string_into_list(string):
    """Cut ``string`` into consecutive two-character pieces.

    The pieces are returned in order; when the length is odd the final piece
    holds the single remaining character.
    """
    return [string[pos:pos + 2] for pos in range(0, len(string), 2)]


def read_attack_data(data_path):
    """Load an attack capture CSV and return it as a decimal feature frame.

    The file is read without a header using the column names ``timestamp``,
    ``can_id``, ``dlc``, ``data0`` .. ``data7`` and ``flag``. Short payloads are
    realigned with ``shift_columns``, missing cells become ``'00'``, the CAN id
    and the payload bytes are converted from hexadecimal to integers, and the
    ``timestamp`` column is replaced by ``IAT`` (difference between consecutive
    timestamps after sorting; 0 for the first row).

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``can_id``, ``dlc``, ``data0`` .. ``data7``, ``flag``, ``IAT``.
        Rows are ordered by timestamp and keep their original index labels.
    """
    columns = ['timestamp', 'can_id', 'dlc', 'data0', 'data1', 'data2', 'data3', 'data4',
               'data5', 'data6', 'data7', 'flag']
    data_cols = columns[3:-1]

    # Read the hexadecimal fields as text. With inferred dtypes an all-digit
    # id/byte can be parsed as a number, which int(x, 16) cannot handle.
    hex_dtypes = {col: str for col in ['can_id'] + data_cols}
    data = pd.read_csv(data_path, names=columns, dtype=hex_dtypes)

    data = shift_columns(data)

    # Replace every remaining NaN with '00'.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    data = data.replace(np.nan, '00')

    # Hexadecimal -> decimal for the CAN id.
    data['can_id'] = data['can_id'].apply(hex_to_dec)
    data[data_cols] = data[data_cols].astype(str)

    # Inter-arrival time replaces the absolute timestamp.
    data = data.sort_values(by=['timestamp'])
    data = data.assign(IAT=data['timestamp'].diff().fillna(0))
    data = data.drop(columns=['timestamp'])

    # Hexadecimal -> decimal for the payload bytes. A column-wise Series.map
    # avoids DataFrame.applymap, which is deprecated in recent pandas.
    data[data_cols] = data[data_cols].apply(lambda col: col.map(hex_to_dec))

    return data

In [4]:
timestamps = []
ids = []
dlcs = []
data = []
data_cols = ['data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']

# Parse the benign capture line by line. The fields are located through the
# literal markers 'Timestamp: ', 'ID: ' and 'DLC: ' and are space-separated.
with open(benign_data_path, 'r') as file:
    for line in file:
        line = line.strip()

        # A blank line has no markers and would raise IndexError below.
        if not line:
            continue

        ts = line.split('Timestamp: ')[1].split(' ')[0]
        can_id = line.split('ID: ')[1].split(' ')[0]

        # Everything after 'DLC: ': first token is the DLC, the rest are payload bytes.
        dlc_fields = line.split('DLC: ')[1].split(' ')
        dlc = dlc_fields[0]
        can_data = ''.join(dlc_fields[1:])

        # Left-pad short payloads to 16 hex characters, then cut into 2-character bytes.
        can_data = pad_with_zeros(can_data)
        data_split = split_string_into_list(can_data)

        # Converting hexadecimal entries to decimal format
        timestamps.append(float(ts))
        ids.append(hex_to_dec(can_id))
        dlcs.append(int(dlc))
        data.append([hex_to_dec(hex_str) for hex_str in data_split])

In [5]:
# Per-frame metadata parsed from the benign capture.
benign = pd.DataFrame({'timestamp': timestamps, 'can_id': ids, 'dlc': dlcs})

# One column per payload byte.
data = pd.DataFrame(data, columns = data_cols)

# Put metadata and payload side by side and order the frames by timestamp.
benign_data = pd.concat([benign, data], axis=1).sort_values(by = ['timestamp'])

# Creating IAT column (difference between consecutive timestamps, 0 for the
# first frame); the absolute timestamp is then dropped.
benign_data = benign_data.assign(IAT=benign_data['timestamp'].diff().fillna(0))
benign_data = benign_data.drop(columns = ['timestamp'])

X = benign_data.values

# test = read_attack_data(dos_data_path)
# x_test = test.drop(['flag'], axis = 1)
# y_test = test['flag'].replace({'R' : 0, 'T' : 1})

# x_test = x_test.values

# First 80% of the rows for training, the remainder for validation.
val_idx = int(0.8 * len(X))
X_train = X[:val_idx]
X_val = X[val_idx:]

# Fit the scaler on the training rows only and reuse it for validation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# scaler_filename = "second_run/scaler.sav"
# joblib.dump(scaler, scaler_filename) 
# X_test = scaler.transform(x_test)




## Function to build overlapping fixed-length windows (sequences) from a dataset for the time-series model
def sequencify(dataset, start, end, window):
    """Stack every length-``window`` run of consecutive rows of ``dataset``.

    The first run covers rows ``start .. start + window - 1``; the last one
    ends at row ``end - 1``. When ``end`` is ``None`` the length of
    ``dataset`` is used. Consecutive runs are offset by one row.

    Returns
    -------
    numpy.ndarray
        The runs stacked along a new leading axis (an empty array when no
        complete run fits).
    """
    stop = len(dataset) if end is None else end
    first_end = start + window

    # ``last`` is the exclusive end row of each run.
    windows = [dataset[range(last - window, last)]
               for last in range(first_end, stop + 1)]

    return np.array(windows)


# Number of consecutive frames per model input sequence.
seq_size = 20

# Window the scaled training and validation rows separately.
X_train_seq = sequencify(X_train, start=0, end=None, window=seq_size)
X_val_seq = sequencify(X_val, start=0, end=None, window=seq_size)

In [6]:
def TransformerBlock(inputs, num_heads, key_dim, ff_dim, dropout=0.3):
    """Apply self-attention and a feed-forward stage to ``inputs``.

    Each stage is followed by dropout, a residual addition with the stage's
    input, and layer normalization. The feed-forward stage projects to
    ``ff_dim`` units with ReLU and then back to the size of the last axis, so
    the output's last dimension matches the normalized attention output.

    Parameters
    ----------
    inputs : Keras tensor
        Used as query, value and key of the attention layer.
    num_heads, key_dim : int
        Passed to ``MultiHeadAttention``.
    ff_dim : int
        Width of the hidden feed-forward layer.
    dropout : float, default 0.3
        Dropout rate used after both stages.
    """
    # Self-attention stage.
    attended = MultiHeadAttention(key_dim=key_dim, num_heads=num_heads)(inputs, inputs, inputs)
    attended = Dropout(dropout)(attended)
    attn_residual = Add()([inputs, attended])
    attn_normed = LayerNormalization(epsilon=1e-6)(attn_residual)

    # Feed-forward stage.
    hidden = Dense(ff_dim, activation='relu')(attn_normed)
    projected = Dense(attn_normed.shape[-1])(hidden)
    projected = Dropout(dropout)(projected)
    ffn_residual = Add()([attn_normed, projected])
    return LayerNormalization(epsilon=1e-6)(ffn_residual)

## Change loss fn, figure out issue related to shape

In [7]:
def make_AE(latent_dim = 3, input_shape = (20, 11), num_heads = 32, key_dim = 64, num_blocks = 32, 
            seq_size = seq_size):
    """Build the (uncompiled) stacked-LSTM sequence autoencoder.

    The encoder is three LSTM layers (128 -> 64 -> 32 units); the last one
    returns only its final output, which ``RepeatVector`` repeats once per
    timestep. The decoder mirrors the encoder (32 -> 64 -> 128 units) and a
    time-distributed ``Dense`` layer maps each timestep back to the number of
    input features.

    Parameters
    ----------
    input_shape : tuple, default (20, 11)
        ``(timesteps, features)`` of one input sequence.
    latent_dim, num_heads, key_dim, num_blocks, seq_size
        Currently unused by this function; kept so existing calls keep working.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The assembled model; compiling is left to the caller.
    """
    timesteps = input_shape[0]
    # Feature count is the LAST entry of input_shape. The previous
    # ``input_shape[2]`` raised IndexError for a (timesteps, features) tuple.
    n_features = input_shape[-1]

    model = Sequential()
    model.add(Input(shape = input_shape))

    # Encoder: only the final layer collapses the time axis.
    model.add(LSTM(128, activation = 'relu', return_sequences = True))
    model.add(LSTM(64, activation = 'relu', return_sequences = True))
    model.add(LSTM(32, activation = 'relu'))

    # Repeat the encoded vector once per timestep for the decoder.
    model.add(RepeatVector(timesteps))

    # Decoder: mirror of the encoder, keeping the time axis throughout.
    model.add(LSTM(32, activation = 'relu', return_sequences = True))
    model.add(LSTM(64, activation = 'relu', return_sequences = True))
    model.add(LSTM(128, activation = 'relu', return_sequences = True))

    # Reconstruct every feature at every timestep.
    model.add(TimeDistributed(Dense(n_features)))

    return model

In [8]:
# Distribution strategy under which the autoencoder is created and compiled.
strat = tf.distribute.MirroredStrategy()

# Build and compile inside the strategy scope so the model's variables are
# created under the strategy.
# NOTE(review): the NotImplementedError recorded for this cell ("Cannot convert
# a symbolic Tensor ... to a numpy array") is presumably a TensorFlow/NumPy
# version incompatibility rather than a model bug - confirm the installed versions.
with strat.scope():
    ae = make_AE()
    ae.compile(loss =  'mae', optimizer = 'adam')



NotImplementedError: Cannot convert a symbolic Tensor (lstm/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

In [10]:
print(np.__version__)

1.21.5


In [14]:
# Stand-alone build of the same layer stack as make_AE, outside any
# distribution strategy, to reproduce the model-construction error.
# (timesteps, features) of one input sequence.
input_shape = (20, 11)

model = Sequential()

model.add(Input(shape = input_shape))

# Encoder: only the final LSTM collapses the time axis.
model.add(LSTM(128, activation = 'relu', return_sequences = True))

model.add(LSTM(64, activation = 'relu', return_sequences = True))

model.add(LSTM(32, activation = 'relu'))

# Repeat the encoded vector once per timestep for the decoder.
model.add(RepeatVector(input_shape[0]))

# Decoder: mirror of the encoder.
model.add(LSTM(32, activation = 'relu', return_sequences = True))

model.add(LSTM(64, activation = 'relu', return_sequences = True))

model.add(LSTM(128, activation = 'relu', return_sequences = True))

# Feature count is the last entry of input_shape; ``input_shape[2]`` does not
# exist for a 2-tuple and raised IndexError.
model.add(TimeDistributed(Dense(input_shape[-1])))



NotImplementedError: Cannot convert a symbolic Tensor (gru/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported