In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from copy import deepcopy
from sklearn.utils import class_weight

2024-10-15 10:00:38.818905: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_folder = 'Car-Hacking/'
print(os.listdir(data_folder))

smart_attack_path = 'attack_10_10.csv'

['Fuzzy_dataset.csv', 'normal_run_data.7z', 'normal_run_data', 'DoS_dataset.csv', 'RPM_dataset.csv', 'gear_dataset.csv']


In [3]:
rpm_data_path = os.path.join(data_folder, 'RPM_dataset.csv')
gear_data_path = os.path.join(data_folder, 'gear_dataset.csv')
dos_data_path = os.path.join(data_folder, 'DoS_dataset.csv')

In [4]:
def hex_to_bin(hex_num):
    
    binary_value = bin(int(str(hex_num), 16))[2:]
    
    return binary_value

def int_to_bin(int_num):
    
    binary_value = bin(int_num)[2:]
    
    return binary_value

def pad(value, length):
    
    curr_length = len(str(value))
    
    zeros = '0' * (length - curr_length)
    
    return zeros + value

hex_to_dec = lambda x: int(x, 16)

def transform_data(data):

    data['ID'] = data['ID'].apply(hex_to_dec)
    data['Payload'] = data['Payload'].apply(hex_to_dec)

    return data

In [5]:
def shift_columns(df):
    
    for dlc in [2,5,6]:

        df.loc[df['dlc'] == dlc, df.columns[3:]] = df.loc[df['dlc'] == dlc, df.columns[3:]].shift(periods=8-dlc, axis='columns', fill_value='00')

    return df

In [6]:
def read_attack_data(data_path):
    
    columns = ['timestamp','can_id', 'dlc', 'data0', 'data1', 'data2', 'data3', 'data4', 
           'data5', 'data6', 'data7', 'flag']
    
    data = pd.read_csv(data_path, names = columns)

    data = shift_columns(data)
    
    ##Replacing all NaNs with '00' 
    data = data.replace(np.NaN, '00')
    
    ##Joining all data columns to put all data in one column
    data_cols = ['data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']
    
    ##The data column is in hexadecimal
    data['data'] = data[data_cols].apply(''.join, axis=1)
    data.drop(columns = data_cols, inplace = True, axis = 1)
    
    ##Converting columns to decimal
    data['can_id'] = data['can_id'].apply(hex_to_dec)
    data['data'] = data['data'].apply(hex_to_dec)

    data = data.assign(IAT=data['timestamp'].diff().fillna(0))
    
    return data[:50_000]

    

In [7]:
rpm_data = read_attack_data(rpm_data_path)
gear_data = read_attack_data(gear_data_path)
dos_data = read_attack_data(dos_data_path)
smart_attack = pd.read_csv(smart_attack_path)

In [8]:
gear_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
dos_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
rpm_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)

impersonation_data = pd.concat([gear_data,rpm_data], axis=0, ignore_index=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gear_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
  gear_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dos_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
  dos_data['flag'].replace({'R' : 0, 'T' : 1}

In [9]:
def sequencify_data(X, y, seq_size=10):
    max_index = len(X) - seq_size + 1

    X_seq = []
    y_seq = []

    for i in range(0, max_index, seq_size):
        X_seq.append(X[i:i+seq_size])  # Append the sequence from DataFrame 'X'
        try:
            y_seq.append(1 if 1 in y[i:i+seq_size].values else 0)  # Check for '1' in 'y' values
        except:
             y_seq.append(1 if 1 in y[i:i+seq_size] else 0)

    return np.array(X_seq), np.array(y_seq)

In [10]:
dos_data.drop(columns = ['timestamp'], inplace = True)
impersonation_data.drop(columns = ['timestamp'], inplace = True)
smart_attack.drop(columns = ['Timestamp'], inplace = True)

In [11]:
print("DOS:",dos_data['flag'].value_counts())
print()
print("Impersonation Combined:",impersonation_data['flag'].value_counts())
print()
print("Smart:",smart_attack['label'].value_counts())

DOS: flag
0    38580
1    11420
Name: count, dtype: int64

Impersonation Combined: flag
0    81402
1    18598
Name: count, dtype: int64

Smart: label
0.0    100000
2.0     29271
1.0     18191
Name: count, dtype: int64


In [12]:
X_dos = dos_data[['can_id', 'dlc', 'data', 'IAT']].values
y_dos = dos_data['flag'].values

X_imp = impersonation_data[['can_id', 'dlc', 'data', 'IAT']].values
y_imp = impersonation_data['flag'].values

X_smart = smart_attack.drop(['label'], axis = 1).values
y_smart = smart_attack['label']
y_tri = y_smart.copy(deep = True)
y_smart = y_smart.replace(2,1)

In [13]:
X_seq_dos, y_seq_dos = sequencify_data(X_dos, y_dos)
X_seq_imp, y_seq_imp = sequencify_data(X_imp, y_imp)
X_seq_smart, y_seq_smart = sequencify_data(X_smart, y_smart)

In [14]:
print(len(X_seq_dos))
print(len(X_seq_imp))
print(len(X_seq_smart))

5000
10000
14746


In [15]:
print(np.unique(y_seq_dos, return_counts=True))
print(np.unique(y_seq_imp, return_counts=True))
print(np.unique(y_seq_smart, return_counts=True))

(array([0, 1]), array([2881, 2119]))
(array([0, 1]), array([4186, 5814]))
(array([0, 1]), array([ 1350, 13396]))


In [16]:
def balance_data(X_seq, y_seq):
    # Get indices for label 0 and label 1
    zero_indices = np.where(y_seq == 0)[0]
    one_indices = np.where(y_seq == 1)[0]

    # Find the number of samples for label 0
    num_zeros = len(zero_indices)

    # Randomly sample an equal number of samples from label 1
    np.random.seed(42)  # Set seed for reproducibility
    sampled_one_indices = np.random.choice(one_indices, num_zeros, replace=False)

    # Combine the indices of label 0 and sampled label 1
    balanced_indices = np.concatenate([zero_indices, sampled_one_indices])

    # Shuffle the balanced indices to avoid any ordering issues
    np.random.shuffle(balanced_indices)

    # Subset X_seq and y_seq based on the balanced indices
    X_seq_balanced = X_seq[balanced_indices]
    y_seq_balanced = y_seq[balanced_indices]

    return X_seq_balanced, y_seq_balanced


In [17]:
X_seq_smart, y_seq_smart = balance_data(X_seq_smart, y_seq_smart)

In [18]:
print(np.unique(y_seq_smart, return_counts=True))

(array([0, 1]), array([1350, 1350]))


In [19]:
# def sequencify_data_test(X, y, seq_size=10):
#     max_index = len(X) - seq_size + 1

#     X_seq = []
#     y_seq = []

#     for i in range(1000, 10000, seq_size):
#         # print(X[i:i+seq_size])  # Append the sequence from DataFrame 'X'
#         print(y[i:i+seq_size])
#         print(1 if 1 in y[i:i+seq_size] else 0)  # Check for '1' in 'y' values
    
# sequencify_data_test(X_smart, y_smart)

In [20]:
X_train_dos, X_test_dos, y_train_dos, y_test_dos = train_test_split(X_dos, y_dos, test_size=0.3, random_state = 42)
X_train_seq_dos, X_test_seq_dos, y_train_seq_dos, y_test_seq_dos = train_test_split(X_seq_dos, y_seq_dos, test_size = 0.3, shuffle = True)

X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(X_imp, y_imp, test_size=0.3, random_state = 42)
X_train_seq_imp, X_test_seq_imp, y_train_seq_imp, y_test_seq_imp = train_test_split(X_seq_imp, y_seq_imp, test_size = 0.3, shuffle = True)

X_train_smart, X_test_smart, y_train_smart, y_test_smart = train_test_split(X_smart, y_smart, test_size=0.3, random_state = 42)
X_train_seq_smart, X_test_seq_smart, y_train_seq_smart, y_test_seq_smart = train_test_split(X_seq_smart, y_seq_smart, test_size = 0.3, shuffle = True)

In [21]:
X_train_seq_dos = X_train_seq_dos[:2000]
X_train_seq_imp = X_train_seq_imp[:2000] 
X_train_seq_smart = X_train_seq_smart[:2000]

y_train_seq_dos = y_train_seq_dos[:2000]
y_train_seq_imp = y_train_seq_imp[:2000] 
y_train_seq_smart = y_train_seq_smart[:2000]

In [22]:
# print(len(X_train_seq_dos))
# print(len(X_train_seq_imp))
# print(len(X_train_seq_smart))

In [23]:
## Normalizing dataset
scaler = StandardScaler()

scaler.fit(X_train_dos)
scaler.fit(X_train_imp)
scaler.fit(X_train_smart)

X_train = np.concatenate((X_train_dos, X_train_imp, X_train_smart), axis = 0)
y_train = np.concatenate((y_train_dos, y_train_imp, y_train_smart), axis = 0)
X_train = scaler.transform(X_train)

X_test_dos = scaler.transform(X_test_dos)
X_test_imp = scaler.transform(X_test_imp)
X_test_smart = scaler.transform(X_test_smart)

mean = np.mean(np.concatenate((X_train_seq_dos, X_train_seq_imp, X_train_seq_smart), axis = 0), axis=(0,1))
std = np.mean(np.concatenate((X_train_seq_dos, X_train_seq_imp, X_train_seq_smart), axis = 0), axis=(0,1))

X_train_seq = np.concatenate((X_train_seq_dos, X_train_seq_imp, X_train_seq_smart), axis = 0)
y_train_seq = np.concatenate((y_train_seq_dos, y_train_seq_imp, y_train_seq_smart), axis = 0)

X_train_seq -= mean
X_train_seq /= std

X_test_seq_dos -= mean
X_test_seq_dos /= std

X_test_seq_imp -= mean
X_test_seq_imp /= std

X_test_seq_smart -= mean
X_test_seq_smart /= std

oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train) 

# seq_class_weights = class_weight.compute_class_weight('balanced',
#                                                  classes = np.unique(y_train_seq),
#                                                  y = y_train_seq)

In [24]:
print(np.unique(y_train_seq_dos, return_counts=True))
print(np.unique(y_test_seq_dos, return_counts=True))

(array([0, 1]), array([1146,  854]))
(array([0, 1]), array([867, 633]))


In [25]:
print(np.unique(y_train_seq_imp, return_counts=True))
print(np.unique(y_test_seq_imp, return_counts=True))

(array([0, 1]), array([ 809, 1191]))
(array([0, 1]), array([1293, 1707]))


In [26]:
print(np.unique(y_train_seq_smart, return_counts=True))
print(np.unique(y_test_seq_smart, return_counts=True))

(array([0, 1]), array([943, 947]))
(array([0, 1]), array([407, 403]))


In [27]:
##MLP

print("-----MLP-------")

mlp = Sequential()
mlp.add(Input(shape = (4)))
mlp.add(Dense(128, activation = 'relu'))
mlp.add(Dense(128, activation = 'relu'))
mlp.add(Dense(1, activation = 'sigmoid'))

mlp.compile(optimizer='adam',
                loss=BinaryCrossentropy(from_logits=False),
                metrics=['accuracy'])

es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

mlp_hist = mlp.fit(X_train_smote, y_train_smote, epochs=100, callbacks = [es], validation_split=0.2, batch_size = 32)

-----MLP-------
Epoch 1/100


2024-10-15 10:02:03.493485: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-15 10:02:03.498165: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [28]:
##MLP
print("-----MLP-------")
threshold = 0.5
mlp_preds = mlp.predict(X_test_dos, batch_size = 8196)
mlp_preds = (mlp_preds >= threshold).astype(int)

print("--------DOS--------")
print("ACCURACY: ", accuracy_score(y_test_dos, mlp_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_dos, mlp_preds))

mlp_preds = mlp.predict(X_test_imp, batch_size = 8196)
mlp_preds = (mlp_preds >= threshold).astype(int)

print("--------Impersonation--------")
print("ACCURACY: ", accuracy_score(y_test_imp, mlp_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_imp, mlp_preds))


mlp_preds = mlp.predict(X_test_smart, batch_size = 8196)
mlp_preds = (mlp_preds >= threshold).astype(int)

print("--------Smart--------")
print("ACCURACY: ", accuracy_score(y_test_smart, mlp_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_smart, mlp_preds))


-----MLP-------
--------DOS--------
ACCURACY:  0.8924
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      0.86      0.93     11569
           1       0.68      0.99      0.81      3431

    accuracy                           0.89     15000
   macro avg       0.84      0.93      0.87     15000
weighted avg       0.92      0.89      0.90     15000

--------Impersonation--------
ACCURACY:  0.8888
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.99      0.87      0.93     24475
           1       0.63      0.97      0.76      5525

    accuracy                           0.89     30000
   macro avg       0.81      0.92      0.85     30000
weighted avg       0.93      0.89      0.90     30000

--------Smart--------
ACCURACY:  0.7538823210289564
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       0.81      0.83      0.82     29873
         1.

In [29]:
# seq_class_weights = dict(enumerate(seq_class_weights))
# print(seq_class_weights)

In [30]:
# print(dict(enumerate(class_weight.compute_class_weight('balanced',
#                                   classes = np.unique(y_train_seq_smart),
#                                 y = y_train_seq_smart))))

In [31]:
##LSTM

print("-----LSTM-------")

lstm = Sequential()

lstm.add(Input(shape = X_train_seq.shape[1:]))
lstm.add(LSTM(128, activation = 'relu'))
lstm.add(Dense(64, activation = 'relu'))
lstm.add(Dense(1, activation = 'sigmoid'))

lstm.compile(
    loss = BinaryCrossentropy(from_logits = False),
    optimizer = Adam(learning_rate = 0.001),
    metrics = ['accuracy'])

es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)
lstm_hist = lstm.fit(X_train_seq, y_train_seq, batch_size = 32, validation_split = 0.2,
        callbacks = [es], epochs = 1000)


-----LSTM-------
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000


In [32]:
print("-----LSTM-------")

lstm_preds = lstm.predict(X_test_seq_dos, batch_size=4096)
lstm_preds = (lstm_preds >= threshold).astype(int)

print("--------DOS--------")
print("ACCURACY: ", accuracy_score(y_test_seq_dos, lstm_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_seq_dos, lstm_preds))


lstm_preds = lstm.predict(X_test_seq_imp, batch_size=4096)
lstm_preds = (lstm_preds >= threshold).astype(int)

print("--------Impersonation--------")
print("ACCURACY: ", accuracy_score(y_test_seq_imp, lstm_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_seq_imp, lstm_preds))


lstm_preds = lstm.predict(X_test_seq_smart, batch_size=4096)
lstm_preds = (lstm_preds >= threshold).astype(int)

print("--------Smart--------")
print("ACCURACY: ", accuracy_score(y_test_seq_smart, lstm_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_seq_smart, lstm_preds))

-----LSTM-------


--------DOS--------
ACCURACY:  0.97
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97       867
           1       0.93      1.00      0.97       633

    accuracy                           0.97      1500
   macro avg       0.97      0.97      0.97      1500
weighted avg       0.97      0.97      0.97      1500

--------Impersonation--------
ACCURACY:  0.9463333333333334
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.96      0.92      0.94      1293
           1       0.94      0.97      0.95      1707

    accuracy                           0.95      3000
   macro avg       0.95      0.94      0.94      3000
weighted avg       0.95      0.95      0.95      3000

--------Smart--------
ACCURACY:  0.8345679012345679
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.88      0.77      0.82       407
           1     

In [33]:
dt = DecisionTreeClassifier(max_depth = 4)
dt.fit(X_train_smote, y_train_smote)

dt_preds = dt.predict(X_test_dos)

print("-------DECISION TREE--------")

print("--------DOS--------")
print("ACCURACY: ", accuracy_score(y_test_dos, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_dos, dt_preds))
    

dt_preds = dt.predict(X_test_imp)

print("--------Impersonation--------")
print("ACCURACY: ", accuracy_score(y_test_imp, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_imp, dt_preds))

dt_preds = dt.predict(X_test_smart)

print("--------Smart--------")
print("ACCURACY: ", accuracy_score(y_test_smart, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_smart, dt_preds))


-------DECISION TREE--------
--------DOS--------
ACCURACY:  0.6616
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      0.56      0.72     11569
           1       0.40      1.00      0.57      3431

    accuracy                           0.66     15000
   macro avg       0.70      0.78      0.65     15000
weighted avg       0.86      0.66      0.69     15000

--------Impersonation--------
ACCURACY:  0.6415666666666666
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.99      0.57      0.72     24475
           1       0.34      0.97      0.50      5525

    accuracy                           0.64     30000
   macro avg       0.66      0.77      0.61     30000
weighted avg       0.87      0.64      0.68     30000

--------Smart--------
ACCURACY:  0.6186396618368408
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       0.80      0.58      0