In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, save_model
from tensorflow.keras.layers import Dense, LSTM, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
import argparse
from joblib import dump
import matplotlib.pyplot as plt
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import eli5
from eli5.sklearn import PermutationImportance

In [2]:
# Directory that holds the Car Hacking Dataset files (relative path, resolved
# against the notebook's current working directory).
base_dir = '../data/Car Hacking Dataset/'

In [3]:
file_name = 'preprocessed_car_hacking.csv'

# Load the frames, derive the inter-arrival time (IAT) as the difference between
# consecutive timestamps (the first row has no predecessor, so it gets 0), and
# drop the raw Timestamp column once IAT has been computed from it.
data = (
    pd.read_csv(os.path.join(base_dir, file_name))
    .assign(IAT=lambda frames: frames['Timestamp'].diff().fillna(0))
    .drop(columns=['Timestamp'])
)

In [4]:
# Preview the first rows after loading (IAT assigned, Timestamp dropped).
data.head()

Unnamed: 0,ID,DLC,Payload,IAT,label
0,0316,8,052168092121006f,0.0,0
1,018f,8,fe5b0000003c0000,0.000209,0
2,0260,8,19212230088e6d3a,0.000228,0
3,02a0,8,64009a1d9702bd00,0.000232,0
4,0329,8,40bb7f1411200014,0.000237,0


In [5]:
def hex_to_bin(hex_num):
    """Return the binary digits of a hexadecimal value, without the ``0b`` prefix.

    Parameters
    ----------
    hex_num : str or object
        Hexadecimal text such as ``'018f'``. Non-string input is converted
        with ``str()`` first and the resulting text is parsed as base 16.

    Returns
    -------
    str
        Binary representation, e.g. ``'ff'`` -> ``'11111111'``. Negative
        values keep their leading ``-``.

    Raises
    ------
    ValueError
        If the text is not valid hexadecimal.
    """
    # Remove only the "0b" marker: slicing with [2:] would turn the
    # "-0b101" produced for negative values into the malformed "b101".
    return bin(int(str(hex_num), 16)).replace('0b', '', 1)

def int_to_bin(int_num):
    """Return the binary digits of an integer, without the ``0b`` prefix.

    Parameters
    ----------
    int_num : int
        Integer to convert.

    Returns
    -------
    str
        Binary representation, e.g. ``5`` -> ``'101'``. Negative values keep
        their leading ``-`` (``-5`` -> ``'-101'``).

    Raises
    ------
    TypeError
        If ``int_num`` is not an integer (raised by ``bin``).
    """
    # Remove only the "0b" marker: slicing with [2:] would turn the
    # "-0b101" produced for negative values into the malformed "b101".
    return bin(int_num).replace('0b', '', 1)

def pad(value, length):
    """Left-pad ``value`` with ``'0'`` characters up to ``length`` characters.

    Parameters
    ----------
    value : str or object
        Text to pad. Non-string input is converted with ``str()`` first
        (previously its length was measured via ``str()`` but the raw object
        was concatenated, which raised ``TypeError`` for non-strings).
    length : int
        Minimum width of the result.

    Returns
    -------
    str
        The padded text. Values already at least ``length`` characters long
        are returned unchanged (never truncated).
    """
    return str(value).rjust(length, '0')

def hex_to_dec(x):
    """Parse hexadecimal text (e.g. ``'018f'``) into a Python ``int``.

    Non-string input raises ``TypeError`` (``int`` rejects it when an explicit
    base is given), so already-converted integers are never silently
    re-interpreted as hexadecimal.
    """
    return int(x, 16)

def transform_data(data):
    """Return a copy of ``data`` with the hex ``ID`` and ``Payload`` columns as integers.

    The input frame is left untouched, so the result of an earlier cell that
    displayed it stays valid and the raw hex strings remain available.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``ID`` and ``Payload`` columns holding hexadecimal strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; ``ID`` and
        ``Payload`` hold the parsed integers.

    Raises
    ------
    KeyError
        If ``ID`` or ``Payload`` is missing.
    TypeError, ValueError
        If a cell is not a string or is not valid hexadecimal.
    """
    # Payload strings are 16 hex digits in the sample rows, so parsed values
    # can exceed the signed 64-bit range; the resulting column dtype is
    # whatever pandas infers for them.
    return data.assign(
        ID=data['ID'].apply(hex_to_dec),
        Payload=data['Payload'].apply(hex_to_dec),
    )

def sequencify_data(X, y, seq_size=10):
    """Cut rows into consecutive, non-overlapping windows of ``seq_size`` rows.

    Parameters
    ----------
    X : array-like, shape (n_rows, ...)
        Per-row features. Converted with ``np.asarray``.
    y : array-like, shape (n_rows,)
        Per-row labels. Converted with ``np.asarray`` so that a pandas Series
        is inspected by value (``1 in series`` would test its index instead).
    seq_size : int, default 10
        Number of rows per window.

    Returns
    -------
    X_seq : numpy.ndarray, shape (n_windows, seq_size, ...)
        The windows, where ``n_windows = n_rows // seq_size``. Trailing rows
        that do not fill a whole window are discarded.
    y_seq : numpy.ndarray, shape (n_windows,)
        ``1`` for a window containing at least one label equal to 1, else ``0``.

    Raises
    ------
    ValueError
        If ``seq_size`` is smaller than 1 or ``X`` and ``y`` differ in length.
    """
    if seq_size < 1:
        raise ValueError("seq_size must be a positive integer")

    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError("X and y must have the same number of rows")

    n_windows = len(X) // seq_size
    n_used = n_windows * seq_size  # rows that fit into complete windows

    # Reshaping keeps the (n_windows, seq_size, ...) layout even when there are
    # zero windows, so callers can always unpack X_seq.shape. The copy keeps the
    # result independent of the caller's array.
    X_seq = X[:n_used].reshape((n_windows, seq_size) + X.shape[1:]).copy()
    y_seq = (y[:n_used].reshape(n_windows, seq_size) == 1).any(axis=1).astype(int)

    return X_seq, y_seq

# Convert the hexadecimal ID and Payload columns to integers.
# Re-running this cell on already-converted data raises a TypeError, because
# hex_to_dec only accepts strings; reload the CSV first in that case.
data = transform_data(data)

# Preview the converted columns.
data.head()

Unnamed: 0,ID,DLC,Payload,IAT,label
0,790,8,369691032840896623,0.0,0
1,399,8,18328243108518297600,0.000209,0
2,608,8,1810766114876976442,0.000228,0
3,672,8,7205928855671061760,0.000232,0
4,809,8,4664461563235663892,0.000237,0


In [6]:
# One seed for both splits and the SMOTE resampling, so the reported numbers
# can be reproduced on a fresh kernel.
SEED = 42

X = data.drop('label', axis = 1)
y = data['label']

# Windowed view of the same rows (built with sequencify_data's default window size).
X_seq, y_seq = sequencify_data(X.values, y.values)

#Splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state=SEED)
X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(X_seq, y_seq, test_size = 0.2, shuffle= True, random_state=SEED)

#Standardization (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

seq_scaler = StandardScaler()
num_train_samples, seq_length, num_features = X_seq_train.shape
num_test_samples, _, _ = X_seq_test.shape

# StandardScaler works on 2-D input, so each window is flattened to one row of
# seq_length * num_features values; every (timestep, feature) position is
# therefore standardized with its own mean and variance.
X_train_seq_reshaped = X_seq_train.reshape(num_train_samples, -1)
X_test_seq_reshaped = X_seq_test.reshape(num_test_samples, -1)

X_train_seq_scaled = seq_scaler.fit_transform(X_train_seq_reshaped)
X_test_seq_scaled = seq_scaler.transform(X_test_seq_reshaped)

# Reshape the scaled data back to the original shape
X_seq_train = X_train_seq_scaled.reshape(num_train_samples, seq_length, num_features)
X_seq_test = X_test_seq_scaled.reshape(num_test_samples, seq_length, num_features)

# Oversample the training split only; the test split keeps its original class balance.
oversample = SMOTE(random_state=SEED)
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)

In [7]:
# Positional indices of the attack rows (label == 1) in the test split.
indexes = np.flatnonzero(y_test.to_numpy() == 1)

# X_test is a NumPy array after scaling while y_test is still a pandas Series,
# so both are selected by position rather than by index label.
X_test_attack = X_test[indexes]
y_test_attack = y_test.iloc[indexes]

In [8]:
# def build_mlp():
#     mlp = Sequential()
#     mlp.add(Input(shape = (4)))
#     mlp.add(Dense(128, activation = 'relu'))
#     mlp.add(Dense(128, activation = 'relu'))
#     mlp.add(Dense(1, activation = 'sigmoid'))

#     mlp.compile(optimizer='adam',
#                     loss='binary_crossentropy',
#                     metrics=['accuracy'])
    
#     return mlp


# es = EarlyStopping(monitor = 'val_loss', patience = 15, restore_best_weights = True)

# mlp = KerasClassifier(build_fn = build_mlp, epochs = 100, batch_size = 128, validation_split = 0.2, callbacks = [es])
# mlp.fit(X_train_smote, y_train_smote)

2024-02-22 14:00:13.616222: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2024-02-22 14:00:13.617913: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2024-02-22 14:00:13.641750: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:86:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 31.74GiB deviceMemoryBandwidth: 836.37GiB/s
2024-02-22 14:00:13.641788: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2024-02-22 14:00:16.877450: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2024-02-22 14:00:16.877603: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.10
2

Epoch 1/100


2024-02-22 14:00:28.834792: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100


<tensorflow.python.keras.callbacks.History at 0x2b00de70a750>

In [9]:
# perm_mlp = PermutationImportance(mlp, random_state=1).fit(X_test_attack, y_test_attack)
# eli5.show_weights(perm_mlp, feature_names = X.columns.tolist())



  


Weight,Feature
0  ± 0.0000,IAT
0  ± 0.0000,Payload
0  ± 0.0000,DLC
0  ± 0.0000,ID


In [None]:
# XGBoost with library-default hyper-parameters, fitted on the SMOTE-resampled
# training data.
xgb = XGBClassifier()
xgb.fit(X_train_smote, y_train_smote)

# Permutation importance is measured on the held-out test split; fit() returns
# the explainer itself, so building and fitting it in two steps is equivalent.
perm_xgb = PermutationImportance(xgb, random_state = 1)
perm_xgb.fit(X_test, y_test)
eli5.show_weights(perm_xgb, feature_names = list(X.columns))

In [10]:
# Inspect the standardized, SMOTE-resampled training features
# (NumPy abbreviates the repr of large arrays).
X_train_smote

array([[ 0.04215836,  0.09270541, -0.30081906, -0.36840644],
       [-0.66333764,  0.09270541, -0.23989316,  0.20275524],
       [ 1.15377871,  0.09270541, -0.57169923, -0.02776887],
       ...,
       [-1.36883364,  0.09270541, -0.57455211, -0.36292568],
       [-1.36883364,  0.09270541, -0.57455211,  3.24972594],
       [-1.36883364,  0.09270541, -0.57455211, -0.1900326 ]])

In [12]:
# Map a standardized point (first feature one standard deviation below its mean,
# the others at their means) back to raw units. Feature order follows X.columns.
scaler.inverse_transform([[-1.,0,0,0]])

array([[-5.02588125e-01,  7.94887363e+00,  3.64946309e+18,
         7.66801381e-04]])

In [16]:
# Shallow decision tree (depth 3) fitted on the SMOTE-resampled training data.
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be resolved differently per run.
dt = DecisionTreeClassifier(max_depth = 3, random_state = 1)
dt.fit(X_train_smote, y_train_smote)

# Permutation importance measured on the held-out test split.
perm_dt = PermutationImportance(dt, random_state = 1).fit(X_test, y_test)
eli5.show_weights(perm_dt, feature_names = X.columns.tolist())

Weight,Feature
0.5001  ± 0.0004,ID
0  ± 0.0000,IAT
0  ± 0.0000,Payload
0  ± 0.0000,DLC


In [None]:
# Random forest of 100 depth-2 trees fitted on the SMOTE-resampled training data.
# random_state is fixed so the bootstrap samples and per-split feature draws,
# and therefore the importances below, are reproducible.
rf = RandomForestClassifier(n_estimators = 100, max_depth = 2, random_state = 1)
rf.fit(X_train_smote, y_train_smote)

# Permutation importance measured on the held-out test split.
perm_rf = PermutationImportance(rf, random_state = 1).fit(X_test, y_test)
eli5.show_weights(perm_rf, feature_names = X.columns.tolist())

In [10]:
# Feature column order; the same list is passed as feature_names to the
# importance tables.
X.columns

Index(['ID', 'DLC', 'Payload', 'IAT'], dtype='object')