In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

2023-07-07 06:36:32.320570: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Folder holding the raw capture files; the path is relative to the notebook's working directory.
data_folder = 'car_hacking_data/'
# List the folder contents to confirm which capture files are available.
print(os.listdir(data_folder))

['normal_run_data.txt', 'DoS_dataset.csv', 'Fuzzy_dataset.csv', 'gear_dataset.csv', 'RPM_dataset.csv']


In [3]:
# Build the path of every attack capture from its file name inside data_folder.
rpm_data_path, gear_data_path, dos_data_path, fuzzy_data_path = (
    os.path.join(data_folder, file_name)
    for file_name in (
        'RPM_dataset.csv',
        'gear_dataset.csv',
        'DoS_dataset.csv',
        'Fuzzy_dataset.csv',
    )
)

In [4]:
def hex_to_bin(hex_num):
    """Convert a hexadecimal value to its binary digit string.

    Parameters
    ----------
    hex_num : object
        Value whose ``str()`` form is a base-16 literal accepted by
        ``int(..., 16)`` (e.g. ``'ff'``, ``'0x1f'``, ``10``).

    Returns
    -------
    str
        Binary digits without the ``0b`` prefix. Negative values keep a
        leading ``-``.

    Raises
    ------
    ValueError
        If the string form of ``hex_num`` is not a valid base-16 literal.
    """
    # format(..., 'b') keeps the sign intact; slicing bin()'s output with [2:]
    # would turn '-0b101' into the malformed string 'b101'.
    return format(int(str(hex_num), 16), 'b')

def int_to_bin(int_num):
    """Convert an integer to its binary digit string.

    Parameters
    ----------
    int_num : int
        Any value accepted by the built-in ``bin``.

    Returns
    -------
    str
        Binary digits without the ``0b`` prefix. Negative values keep a
        leading ``-``.
    """
    binary = bin(int_num)

    # bin(-5) is '-0b101': strip the '0b' marker but keep the sign, instead of
    # blindly dropping the first two characters (which would yield 'b101').
    if binary.startswith('-'):
        return '-' + binary[3:]

    return binary[2:]

def pad(value, length):
    """Left-pad ``value`` with ``'0'`` characters up to ``length`` characters.

    Parameters
    ----------
    value : object
        Value to pad; it is converted with ``str()`` first, so non-string
        input (e.g. an int) is accepted.
    length : int
        Minimum length of the result.

    Returns
    -------
    str
        The string form of ``value``, prefixed with zeros when it is shorter
        than ``length`` and returned unchanged otherwise.
    """
    # Convert once so that measuring and concatenating use the same string;
    # previously a non-str value was measured via str() but concatenated raw,
    # which raised TypeError.
    text = str(value)

    return text.rjust(length, '0')

def hex_to_dec(x):
    """Parse the base-16 string ``x`` and return it as an int."""
    return int(x, 16)

In [5]:
## Frames carry varying DLCs (2, 5, 6, 8). To keep the payload columns aligned,
## rows with DLC < 8 are shifted right and left-padded with '00'.

def shift_columns(df, dlcs=(2, 5, 6), max_dlc=8):
    """Right-align the trailing columns of short frames, in place.

    For every DLC value in ``dlcs``, the rows whose ``'dlc'`` column equals
    that value have all columns from the fourth one onward shifted to the
    right by ``max_dlc - dlc`` positions; the vacated cells on the left are
    filled with the string ``'00'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'dlc'`` column. Everything from column index 3 onward
        takes part in the shift.
        NOTE(review): presumably short rows arrive left-aligned from the CSV
        so their trailing fields sit too far left - verify against the raw
        files.
    dlcs : iterable of int, optional
        DLC values whose rows must be shifted. Defaults to ``(2, 5, 6)``.
    max_dlc : int, optional
        Payload width that rows are aligned to. Defaults to ``8``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    trailing_cols = df.columns[3:]

    for dlc in dlcs:
        # Build the row mask once and reuse it on both sides of the assignment.
        short_rows = df['dlc'] == dlc

        if not short_rows.any():
            continue  # nothing to realign for this DLC

        df.loc[short_rows, trailing_cols] = df.loc[short_rows, trailing_cols].shift(
            periods=max_dlc - dlc, axis='columns', fill_value='00'
        )

    return df
    

In [6]:
def read_attack_data(data_path):
    """Load one attack capture CSV into a per-frame DataFrame.

    The CSV is read without a header row using the column names
    ``timestamp, can_id, dlc, data0..data7, flag``. Short frames are
    realigned with ``shift_columns``, remaining NaNs are replaced by ``'00'``,
    the eight byte columns are concatenated into one hexadecimal string and
    then converted to an integer, and ``can_id`` is converted from
    hexadecimal to an integer as well.

    Parameters
    ----------
    data_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp, can_id, dlc, flag, data, IAT`` where ``IAT`` is
        the difference between consecutive ``timestamp`` values in file order
        (0 for the first row).
    """
    columns = ['timestamp', 'can_id', 'dlc', 'data0', 'data1', 'data2', 'data3', 'data4',
               'data5', 'data6', 'data7', 'flag']

    ## The byte columns that get merged into a single 'data' column
    data_cols = ['data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']

    data = pd.read_csv(data_path, names = columns)

    data = shift_columns(data)

    ## Replacing all NaNs with '00'
    ## (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0)
    data = data.replace(np.nan, '00')

    ## Joining all data columns to put all data in one column (still hexadecimal)
    data['data'] = data[data_cols].apply(''.join, axis=1)
    data.drop(columns = data_cols, inplace = True)

    ## Converting columns to decimal
    data['can_id'] = data['can_id'].apply(hex_to_dec)
    data['data'] = data['data'].apply(hex_to_dec)

    ## Inter-arrival time: delta to the previous row's timestamp
    data = data.assign(IAT=data['timestamp'].diff().fillna(0))

    return data

    

In [7]:
normal_data_path = os.path.join(data_folder, 'normal_run_data.txt')

# Initialize empty lists to store data
timestamps = []
ids = []
dlcs = []
data = []

# Read the data from the file
with open(normal_data_path, 'r') as file:
    for line in file:
        line = line.strip()

        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # the field splits below would otherwise raise IndexError.
        if not line:
            continue

        # Extract information from each line: every field is the first
        # space-delimited token after its 'Name: ' marker.
        ts = line.split('Timestamp: ')[1].split(' ')[0]
        can_id = line.split('ID: ')[1].split(' ')[0]

        # Everything after 'DLC: ' is the DLC followed by the payload bytes;
        # split it once and reuse the pieces.
        dlc_fields = line.split('DLC: ')[1].split(' ')
        dlc = dlc_fields[0]
        can_data = ''.join(dlc_fields[1:])

        # Converting hexadecimal entries to decimal format
        timestamps.append(float(ts))
        ids.append(hex_to_dec(can_id))
        dlcs.append(int(dlc))
        data.append(hex_to_dec(can_data))

normal_data = pd.DataFrame({
    'timestamp': timestamps,
    'can_id': ids,
    'dlc': dlcs,
    'data': data
})

normal_data.sort_values(by = ['timestamp'], inplace = True)

# Creating IAT column: delta to the previous (time-sorted) row's timestamp
normal_data = normal_data.assign(IAT=normal_data['timestamp'].diff().fillna(0))

In [8]:
# Parse the four attack captures, in the same order as before: RPM, gear, DoS, fuzzy.
rpm_data, gear_data, dos_data, fuzzy_data = [
    read_attack_data(capture_path)
    for capture_path in (rpm_data_path, gear_data_path, dos_data_path, fuzzy_data_path)
]

In [18]:
# Preview the first rows of the parsed RPM capture to sanity-check the columns.
rpm_data.head()

Unnamed: 0,timestamp,can_id,dlc,flag,data,IAT
0,1478191000.0,790,8,R,369972507834318965,0.0
1,1478191000.0,399,8,R,18319235909263556608,0.000239
2,1478191000.0,608,8,R,1811047593997725247,0.000227
3,1478191000.0,672,8,R,6917673190735133952,0.000235
4,1478191000.0,809,8,R,15904600708710662164,0.000228


In [10]:
## Defining labels

# Normal -> 0
# DOS -> 1
# Fuzzy -> 2
# Gear -> 3

# Assign the result back to the column instead of calling
# `df['flag'].replace(..., inplace=True)`: the chained in-place form operates
# on a temporary Series and does not update the DataFrame under pandas
# Copy-on-Write.
gear_data['flag'] = gear_data['flag'].replace({'R' : 0, 'T' : 3})
dos_data['flag'] = dos_data['flag'].replace({'R' : 0, 'T' : 1})
fuzzy_data['flag'] = fuzzy_data['flag'].replace({'R' : 0, 'T' : 2})

In [11]:
# Every row of the normal-run capture gets label 0.
normal_data['flag'] = 0

In [12]:
# Combining datasets: stack the gear, DoS, fuzzy and normal frames row-wise
# with a fresh 0..n-1 index.
# NOTE(review): rpm_data is not part of this concat - confirm that excluding
# the RPM capture is intentional.
merged_df = pd.concat([gear_data, dos_data, fuzzy_data, normal_data], axis=0, ignore_index=True)

In [13]:
# Order all rows by timestamp, in place; the original index labels are kept.
merged_df.sort_values(by = ['timestamp'], inplace = True)

In [14]:
# The timestamp column is no longer needed once the rows are in order.
merged_df = merged_df.drop(columns = 'timestamp')

In [15]:
## Function to create a sequencified (sliding-window) dataset for the LSTM model
def sequencify(dataset, target, start, end, window):
    """Cut ``dataset`` into overlapping windows of ``window`` consecutive rows.

    Window number ``k`` covers rows ``[stop - window, stop)`` where ``stop``
    runs from ``start + window`` up to and including ``end``. Each window is
    paired with the target of its last row, ``target[stop - 1]``.

    Parameters
    ----------
    dataset : numpy.ndarray
        Rows to window; indexed along its first axis.
    target : sequence
        Per-row targets, indexed with the same row numbers as ``dataset``.
    start : int
        Row index at which the first window begins.
    end : int or None
        Exclusive end row of the last window; ``None`` means ``len(dataset)``.
    window : int
        Number of rows per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` - the stacked windows and the matching targets.
    """
    last_stop = len(dataset) if end is None else end
    stops = range(start + window, last_stop + 1)

    windows = [dataset[range(stop - window, stop)] for stop in stops]
    labels = [target[stop - 1] for stop in stops]

    return np.array(windows), np.array(labels)

In [38]:
# Feature matrix (one row per frame) and the matching label vector.
X = merged_df.loc[:, ['can_id', 'dlc', 'data', 'IAT']].to_numpy()
y = merged_df.loc[:, 'flag'].to_numpy()

In [39]:
# Build sliding windows of 10 consecutive rows over the whole dataset; each
# window is labelled with the flag of its last row (see sequencify).
X_seq, y_seq = sequencify(dataset = X, target = y, window = 10, start = 0, end = None)

In [40]:
## Each sequence is labelled with the flag of its last (current) message, so the sequences can be shuffled during the train/test split

In [41]:
# Per-frame split used by the MLP and the tree-based models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

# Windowed split used by the LSTM. The seed is fixed (as for the per-frame
# split) so that re-running the notebook reproduces the same partition.
# NOTE(review): consecutive windows overlap in 9 of their 10 rows, so a
# shuffled split places near-duplicate windows in both train and test -
# confirm this is acceptable for the reported scores.
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(X_seq, y_seq, test_size = 0.3, shuffle = True, random_state = 42)

In [42]:
# Distinct labels in the windowed training targets and how often each occurs.
np.unique(y_train_seq, return_counts = True)

(array([0, 1, 2, 3]), array([7881764,  411890,  343787,  418203]))

In [43]:
## Normalizing dataset

# Per-frame features: fit the scaler on the training rows only and apply the
# same transform to the test rows.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Windowed features: statistics are taken over the sample axis of the
# training windows only, i.e. one mean/std per (window position, feature).
mean = X_train_seq.mean(axis=0)
std = X_train_seq.std(axis=0)

# Guard against constant features: dividing by a zero std would fill the
# arrays with NaN/inf. A scale of 1 leaves such (already centred) entries at 0.
std[std == 0] = 1.0

# In-place arithmetic avoids allocating second copies of the window arrays.
X_train_seq -= mean
X_train_seq /= std
X_test_seq -= mean
X_test_seq /= std

In [44]:
print(X_train.shape)
print(X_train_seq.shape)

(9055650, 4)
(9055644, 10, 4)


In [48]:
##MLP

print("-----MLP-------")

# Two hidden ReLU layers followed by a 4-unit layer that outputs raw logits
# (one per class); the loss below applies the softmax itself.
mlp = Sequential()
# Derive the input shape from the data, as the LSTM cell does. The previous
# literal `(4)` is the int 4 rather than a 1-tuple.
mlp.add(Input(shape = X_train.shape[1:]))
mlp.add(Dense(128, activation = 'relu'))
mlp.add(Dense(128, activation = 'relu'))
mlp.add(Dense(4))

mlp.compile(optimizer='adam',
                loss=SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

mlp_hist = mlp.fit(X_train, y_train, epochs=100, callbacks = [es], validation_split=0.2, batch_size = 8192)

-----MLP-------
Epoch 1/100


2023-07-07 06:54:43.145452: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-07-07 06:54:43.148557: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7ef538c09ee0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-07-07 06:54:43.148583: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2023-07-07 06:54:43.154711: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-07-07 06:54:43.350242: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-07-07 06:54:43.518255: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the p

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


In [49]:
##MLP
print("-----MLP-------")

# Score the held-out rows and keep the index of the highest-scoring class per row.
mlp_preds = mlp.predict(X_test, batch_size = 8196).argmax(axis = 1)

mlp_accuracy = accuracy_score(y_test, mlp_preds)
mlp_report = classification_report(y_test, mlp_preds)

print("ACCURACY: ", mlp_accuracy)
print("CLASSIFICATION REPORT:\n", mlp_report)

-----MLP-------
ACCURACY:  0.9933694821481301
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00   3378118
           1       1.00      1.00      1.00    176044
           2       0.99      0.87      0.93    147797
           3       0.98      1.00      0.99    179035

    accuracy                           0.99   3880994
   macro avg       0.99      0.97      0.98   3880994
weighted avg       0.99      0.99      0.99   3880994



In [50]:
##LSTM

print("-----LSTM-------")

# Single LSTM layer over each window, followed by a softmax over the 4 classes.
lstm = Sequential()

lstm.add(Input(shape = X_train_seq.shape[1:]))
lstm.add(LSTM(128, activation = 'relu'))
lstm.add(Dense(4, activation = 'softmax'))

# The output layer already applies softmax, hence from_logits = False.
lstm.compile(
    loss = SparseCategoricalCrossentropy(from_logits = False),
    optimizer = Adam(learning_rate = 0.001),
    metrics = ['accuracy'])

# Use a dedicated early-stopping callback (same settings as the MLP's) so this
# cell does not depend on a callback object created in another cell.
lstm_es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

lstm_hist = lstm.fit(X_train_seq, y_train_seq, batch_size = 8196, validation_split = 0.2,
        callbacks = [lstm_es], epochs = 1000)

-----LSTM-------
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000


In [51]:
print("-----LSTM-------")

# Score the held-out windows and keep the most probable class per window.
lstm_preds = lstm.predict(X_test_seq, batch_size=4096).argmax(axis = 1)

lstm_accuracy = accuracy_score(y_test_seq, lstm_preds)
lstm_report = classification_report(y_test_seq, lstm_preds)

print("ACCURACY: ", lstm_accuracy)
print("CLASSIFICATION REPORT:\n", lstm_report)

-----LSTM-------
ACCURACY:  0.9904449662470229
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99   3378251
           1       0.97      0.99      0.98    175631
           2       0.93      0.88      0.91    148060
           3       0.99      0.99      0.99    179049

    accuracy                           0.99   3880991
   macro avg       0.97      0.97      0.97   3880991
weighted avg       0.99      0.99      0.99   3880991



In [52]:
## XGBOOST

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

print("-------XGBOOST-------")
print("ACCURACY: ", accuracy_score(y_test, xgb_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test, xgb_preds))


## DECISION TREE

# random_state is pinned (same seed as the train/test split) so that
# re-running the notebook reproduces the same tree.
dt = DecisionTreeClassifier(max_depth = 4, random_state = 42)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)

print("-------DECISION TREE--------")
print("ACCURACY: ", accuracy_score(y_test, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test, dt_preds))


## RANDOM FOREST

# Bootstrap sampling and feature sub-sampling are random; pin the seed for
# reproducible scores.
rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state = 42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print("-------RANDOM FOREST-------")
print("ACCURACY: ", accuracy_score(y_test, rf_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test, rf_preds))

-------XGBOOST-------
ACCURACY:  0.998768614432282
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   3378118
           1       1.00      1.00      1.00    176044
           2       1.00      1.00      1.00    147797
           3       0.98      1.00      0.99    179035

    accuracy                           1.00   3880994
   macro avg       0.99      1.00      1.00   3880994
weighted avg       1.00      1.00      1.00   3880994

-------DECISION TREE--------
ACCURACY:  0.9546075051906805
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97   3378118
           1       1.00      1.00      1.00    176044
           2       0.69      0.49      0.57    147797
           3       0.72      1.00      0.84    179035

    accuracy                           0.95   3880994
   macro avg       0.85      0.86      0.85   3880994
weighted avg       0.96    