In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
import joblib

2024-10-15 08:35:01.673023: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_folder = 'Car-Hacking/'
print(os.listdir(data_folder))
smart_attack_path = 'attack_10_10.csv'

['Fuzzy_dataset.csv', 'normal_run_data.7z', 'normal_run_data', 'DoS_dataset.csv', 'RPM_dataset.csv', 'gear_dataset.csv']


In [3]:
rpm_data_path = os.path.join(data_folder, 'RPM_dataset.csv')
gear_data_path = os.path.join(data_folder, 'gear_dataset.csv')
dos_data_path = os.path.join(data_folder, 'DoS_dataset.csv')
# fuzzy_data_path = os.path.join(data_folder, 'Fuzzy_dataset.csv')

In [4]:
def hex_to_bin(hex_num):
    
    binary_value = bin(int(str(hex_num), 16))[2:]
    
    return binary_value

def int_to_bin(int_num):
    
    binary_value = bin(int_num)[2:]
    
    return binary_value

def pad(value, length):
    
    curr_length = len(str(value))
    
    zeros = '0' * (length - curr_length)
    
    return zeros + value

hex_to_dec = lambda x: int(x, 16)

def transform_data(data):

    data['ID'] = data['ID'].apply(hex_to_dec)
    data['Payload'] = data['Payload'].apply(hex_to_dec)

    return data

In [5]:
def shift_columns(df):
    
    for dlc in [2,5,6]:

        df.loc[df['dlc'] == dlc, df.columns[3:]] = df.loc[df['dlc'] == dlc, df.columns[3:]].shift(periods=8-dlc, axis='columns', fill_value='00')

    return df

In [6]:
def read_attack_data(data_path):
    
    columns = ['timestamp','can_id', 'dlc', 'data0', 'data1', 'data2', 'data3', 'data4', 
           'data5', 'data6', 'data7', 'flag']
    
    data = pd.read_csv(data_path, names = columns)

    data = shift_columns(data)
    
    ##Replacing all NaNs with '00' 
    data = data.replace(np.NaN, '00')
    
    ##Joining all data columns to put all data in one column
    data_cols = ['data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']
    
    ##The data column is in hexadecimal
    data['data'] = data[data_cols].apply(''.join, axis=1)
    data.drop(columns = data_cols, inplace = True, axis = 1)
    
    ##Converting columns to decimal
    data['can_id'] = data['can_id'].apply(hex_to_dec)
    data['data'] = data['data'].apply(hex_to_dec)

    data = data.assign(IAT=data['timestamp'].diff().fillna(0))
    
    return data[:50_000]

    

In [7]:
rpm_data = read_attack_data(rpm_data_path)
gear_data = read_attack_data(gear_data_path)
dos_data = read_attack_data(dos_data_path)
smart_attack = pd.read_csv(smart_attack_path)

In [8]:
gear_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
dos_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
rpm_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)

impersonation_data = pd.concat([gear_data,rpm_data], axis=0, ignore_index=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gear_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
  gear_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dos_data['flag'].replace({'R' : 0, 'T' : 1}, inplace = True)
  dos_data['flag'].replace({'R' : 0, 'T' : 1}

In [9]:
print("DOS:",dos_data['flag'].value_counts())
print()
print("Gear:",gear_data['flag'].value_counts())
print()
print("RPM:",rpm_data['flag'].value_counts())
print()
print("Impersonation Combined:",impersonation_data['flag'].value_counts())

DOS: flag
0    38580
1    11420
Name: count, dtype: int64

Gear: flag
0    40848
1     9152
Name: count, dtype: int64

RPM: flag
0    40554
1     9446
Name: count, dtype: int64

Impersonation Combined: flag
0    81402
1    18598
Name: count, dtype: int64


In [10]:
def sequencify_data(X, y, seq_size=10):
    max_index = len(X) - seq_size + 1

    X_seq = []
    y_seq = []

    for i in range(0, max_index, seq_size):
        X_seq.append(X[i:i+seq_size])  # Append the sequence from DataFrame 'X'
        try:
            y_seq.append(1 if 1 in y[i:i+seq_size].values else 0)  # Check for '1' in 'y' values
        except:
             y_seq.append(1 if 1 in y[i:i+seq_size] else 0)

    return np.array(X_seq), np.array(y_seq)

In [11]:
smart_attack.drop(columns = ['Timestamp'], inplace = True)

In [12]:
X_dos = dos_data[['can_id', 'dlc', 'data', 'IAT']].values
y_dos = dos_data['flag'].values

X_imp = impersonation_data[['can_id', 'dlc', 'data', 'IAT']].values
y_imp = impersonation_data['flag'].values


X_smart = smart_attack.drop(['label'], axis = 1).values
y_smart = smart_attack['label']
y_tri = y_smart.copy(deep = True)
y_smart = y_smart.replace(2,1)

In [13]:
X_seq_dos, y_seq_dos = sequencify_data(X_dos, y_dos)
X_seq_imp, y_seq_imp = sequencify_data(X_imp, y_imp)
X_seq_smart, y_seq_smart = sequencify_data(X_smart, y_smart)

In [14]:
def balance_data(X_seq, y_seq):
    # Get indices for label 0 and label 1
    zero_indices = np.where(y_seq == 0)[0]
    one_indices = np.where(y_seq == 1)[0]

    # Find the number of samples for label 0
    num_zeros = len(zero_indices)

    # Randomly sample an equal number of samples from label 1
    np.random.seed(42)  # Set seed for reproducibility
    sampled_one_indices = np.random.choice(one_indices, num_zeros, replace=False)

    # Combine the indices of label 0 and sampled label 1
    balanced_indices = np.concatenate([zero_indices, sampled_one_indices])

    # Shuffle the balanced indices to avoid any ordering issues
    np.random.shuffle(balanced_indices)

    # Subset X_seq and y_seq based on the balanced indices
    X_seq_balanced = X_seq[balanced_indices]
    y_seq_balanced = y_seq[balanced_indices]

    return X_seq_balanced, y_seq_balanced


In [15]:
X_seq_smart, y_seq_smart = balance_data(X_seq_smart, y_seq_smart)

In [16]:
X_train_dos, X_test_dos, y_train_dos, y_test_dos = train_test_split(X_dos, y_dos, test_size=0.3, random_state = 42)
X_train_seq_dos, X_test_seq_dos, y_train_seq_dos, y_test_seq_dos = train_test_split(X_seq_dos, y_seq_dos, test_size = 0.3, shuffle = True)

X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(X_imp, y_imp, test_size=0.3, random_state = 42)
X_train_seq_imp, X_test_seq_imp, y_train_seq_imp, y_test_seq_imp = train_test_split(X_seq_imp, y_seq_imp, test_size = 0.3, shuffle = True)

In [17]:
print("DOS Train:", np.unique(y_train_dos, return_counts = True))
print()
print("Impersonation Train:", np.unique(y_train_imp, return_counts = True))

print()
print()

print("DOS Train Sequencified:", np.unique(y_train_seq_dos, return_counts = True))
print()
print("Impersonation Train Sequencified:", np.unique(y_train_seq_imp, return_counts = True))

DOS Train: (array([0, 1]), array([27011,  7989]))

Impersonation Train: (array([0, 1]), array([56927, 13073]))


DOS Train Sequencified: (array([0, 1]), array([2014, 1486]))

Impersonation Train Sequencified: (array([0, 1]), array([2893, 4107]))


In [18]:
## Normalizing dataset
scaler = StandardScaler()

scaler.fit(X_train_dos)
scaler.fit(X_train_imp)

X_train = np.concatenate((X_train_dos, X_train_imp), axis = 0)
y_train = np.concatenate((y_train_dos, y_train_imp), axis = 0)
X_train = scaler.transform(X_train)

X_test_dos = scaler.transform(X_test_dos)
X_test_imp = scaler.transform(X_test_imp)


mean = np.mean(np.concatenate((X_train_seq_dos, X_train_seq_imp), axis = 0),axis=(0,1))
std = np.mean(np.concatenate((X_train_seq_dos, X_train_seq_imp), axis = 0), axis=(0,1))

X_train_seq = np.concatenate((X_train_seq_dos, X_train_seq_imp), axis = 0)
y_train_seq = np.concatenate((y_train_seq_dos, y_train_seq_imp), axis = 0)

X_train_seq -= mean
X_train_seq /= std

X_test_seq_dos -= mean
X_test_seq_dos /= std

X_test_seq_imp -= mean
X_test_seq_imp /= std

oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train) 

seq_class_weights = class_weight.compute_class_weight('balanced',
                                                 classes = np.unique(y_train_seq),
                                                 y = y_train_seq)

In [19]:
joblib.dump(scaler, 'scaler_m0.sav')

['scaler_m0.sav']

In [20]:
X_train_seq, y_train_seq = balance_data(X_train_seq, y_train_seq)

In [21]:
print(np.unique(y_train_smote, return_counts = True))

(array([0, 1]), array([83938, 83938]))


In [22]:
# seq_class_weights = dict(enumerate(seq_class_weights))
# print(seq_class_weights)

In [23]:
##MLP

print("-----MLP-------")

mlp = Sequential()
mlp.add(Input(shape = (4)))
mlp.add(Dense(128, activation = 'relu'))
mlp.add(Dense(64, activation = 'relu'))
mlp.add(Dense(1, activation = 'sigmoid'))

mlp.compile(optimizer='adam',
                loss=BinaryCrossentropy(from_logits=False),
                metrics=['accuracy'])

es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

mlp_hist = mlp.fit(X_train_smote, y_train_smote, epochs=100, callbacks = [es], validation_split=0.2, batch_size = 32)

-----MLP-------


Epoch 1/100


2024-10-15 08:36:12.753412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-15 08:36:12.758271: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [24]:
mlp.save('mlp_m0.h5')

In [25]:
##MLP
print("-----MLP-------")
threshold = 0.5
mlp_preds = mlp.predict(X_test_dos, batch_size = 8196)
mlp_preds = (mlp_preds >= threshold).astype(int)

print("--------DOS--------")
print("ACCURACY: ", accuracy_score(y_test_dos, mlp_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_dos, mlp_preds))

mlp_preds = mlp.predict(X_test_imp, batch_size = 8196)
mlp_preds = (mlp_preds >= threshold).astype(int)

print("--------Impersonation--------")
print("ACCURACY: ", accuracy_score(y_test_imp, mlp_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_imp, mlp_preds))

-----MLP-------
--------DOS--------
ACCURACY:  0.9395333333333333
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96     11569
           1       0.79      0.99      0.88      3431

    accuracy                           0.94     15000
   macro avg       0.90      0.96      0.92     15000
weighted avg       0.95      0.94      0.94     15000

--------Impersonation--------
ACCURACY:  0.9453333333333334
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97     24475
           1       0.78      0.98      0.87      5525

    accuracy                           0.95     30000
   macro avg       0.89      0.96      0.92     30000
weighted avg       0.96      0.95      0.95     30000



In [26]:
##LSTM

print("-----LSTM-------")

lstm = Sequential()

lstm.add(Input(shape = X_train_seq.shape[1:]))
lstm.add(LSTM(128, activation = 'relu'))
lstm.add(Dense(1, activation = 'sigmoid'))

lstm.compile(
    loss = BinaryCrossentropy(from_logits = False),
    optimizer = Adam(learning_rate = 0.001),
    metrics = ['accuracy'])

es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)
lstm_hist = lstm.fit(X_train_seq, y_train_seq, batch_size = 32, validation_split = 0.2,
        callbacks = [es], epochs = 1000)


-----LSTM-------
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000


In [27]:
lstm.save('lstm_m0.h5')

In [28]:
print("-----LSTM-------")

lstm_preds = lstm.predict(X_test_seq_dos, batch_size=4096)
lstm_preds = (lstm_preds >= threshold).astype(int)

print("--------DOS--------")
print("ACCURACY: ", accuracy_score(y_test_seq_dos, lstm_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_seq_dos, lstm_preds))


lstm_preds = lstm.predict(X_test_seq_imp, batch_size=4096)
lstm_preds = (lstm_preds >= threshold).astype(int)

print("--------Impersonation--------")
print("ACCURACY: ", accuracy_score(y_test_seq_imp, lstm_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_seq_imp, lstm_preds))

-----LSTM-------
--------DOS--------
ACCURACY:  0.9866666666666667
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       867
           1       0.98      0.99      0.98       633

    accuracy                           0.99      1500
   macro avg       0.99      0.99      0.99      1500
weighted avg       0.99      0.99      0.99      1500

--------Impersonation--------
ACCURACY:  0.971
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97      1293
           1       0.98      0.96      0.97      1707

    accuracy                           0.97      3000
   macro avg       0.97      0.97      0.97      3000
weighted avg       0.97      0.97      0.97      3000



In [29]:
dt = DecisionTreeClassifier(max_depth = 4)
dt.fit(X_train_smote, y_train_smote)

dt_preds = dt.predict(X_test_dos)

print("-------DECISION TREE--------")

print("--------DOS--------")
print("ACCURACY: ", accuracy_score(y_test_dos, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_dos, dt_preds))
    

dt_preds = dt.predict(X_test_imp)

print("--------Impersonation--------")
print("ACCURACY: ", accuracy_score(y_test_imp, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_imp, dt_preds))


-------DECISION TREE--------
--------DOS--------
ACCURACY:  0.8732666666666666
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      0.84      0.91     11569
           1       0.64      1.00      0.78      3431

    accuracy                           0.87     15000
   macro avg       0.82      0.92      0.85     15000
weighted avg       0.92      0.87      0.88     15000

--------Impersonation--------
ACCURACY:  0.8664333333333334
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      0.84      0.91     24475
           1       0.58      1.00      0.73      5525

    accuracy                           0.87     30000
   macro avg       0.79      0.92      0.82     30000
weighted avg       0.92      0.87      0.88     30000



In [30]:
joblib.dump(dt, 'dt_m0.pkl')

['dt_m0.pkl']

In [31]:
smart_attack = pd.read_csv('aux_attacks_new_new.csv')

In [32]:
smart_X = smart_attack.drop('label', axis = 1)
smart_y = smart_attack['label']

In [33]:
smart_y_tri = smart_y.copy(deep=True)
smart_y = smart_y.replace(2,1)

In [34]:
smart_X.drop(['Timestamp'], inplace = True, axis = 1)

In [35]:
smart_X

Unnamed: 0,ID,DLC,Payload,IAT
0,848.0,8.0,3.716925e+17,0.000000
1,704.0,8.0,1.441152e+18,0.000221
2,1072.0,8.0,0.000000e+00,0.000554
3,1201.0,8.0,0.000000e+00,0.000238
4,497.0,8.0,0.000000e+00,0.000248
...,...,...,...,...
152069,608.0,8.0,3.202177e+18,0.000238
152070,672.0,8.0,1.177723e+14,0.000236
152071,809.0,8.0,9.779705e+18,0.000230
152072,880.0,8.0,9.851624e+15,0.000247


In [36]:
def sequencify_data(X, y, seq_size=10):
    max_index = len(X) - seq_size + 1

    X_seq = []
    y_seq = []

    for i in range(0, max_index, seq_size):
        X_seq.append(X[i:i+seq_size])  # Append the sequence from DataFrame 'X'
        y_seq.append(1 if 1 in y[i:i+seq_size].values else 0)  # Check for '1' in 'y' values

    return np.array(X_seq), np.array(y_seq)

smart_X_seq, smart_y_seq_dos = sequencify_data(smart_X, smart_y)

smart_X_seq -= mean
smart_X_seq /= std


In [37]:
smart_X = scaler.transform(smart_X)



In [38]:
mlp_preds = mlp.predict(smart_X)
mlp_preds = (mlp_preds >= threshold).astype(int)

print("SMART ATTACK EVAL")

print("ACCURACY: ", accuracy_score(smart_y, mlp_preds))
print("CLASSIFICATION REPORT:\n", classification_report(smart_y, mlp_preds))


SMART ATTACK EVAL
ACCURACY:  0.6447716243407815
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       0.67      0.91      0.77    100000
         1.0       0.44      0.14      0.21     52074

    accuracy                           0.64    152074
   macro avg       0.55      0.52      0.49    152074
weighted avg       0.59      0.64      0.58    152074



In [39]:
print("-----LSTM-------")
print("SMART ATTACK EVAL")
lstm_preds = lstm.predict(smart_X_seq, batch_size=4096)
lstm_preds = (lstm_preds >= threshold).astype(int)

print("ACCURACY: ", accuracy_score(smart_y_seq_dos, lstm_preds))
print("CLASSIFICATION REPORT:\n", classification_report(smart_y_seq_dos, lstm_preds))


-----LSTM-------
SMART ATTACK EVAL
ACCURACY:  0.3623331360557638
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.11      0.89      0.19      1320
           1       0.97      0.31      0.47     13887

    accuracy                           0.36     15207
   macro avg       0.54      0.60      0.33     15207
weighted avg       0.89      0.36      0.45     15207



In [40]:
dt_preds = dt.predict(smart_X)

print("-------DECISION TREE--------")

print("SMART ATTACK EVAL")
print("ACCURACY: ", accuracy_score(smart_y, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(smart_y, dt_preds))
    

-------DECISION TREE--------
SMART ATTACK EVAL
ACCURACY:  0.5969791022791535
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       0.66      0.81      0.73    100000
         1.0       0.34      0.19      0.24     52074

    accuracy                           0.60    152074
   macro avg       0.50      0.50      0.48    152074
weighted avg       0.55      0.60      0.56    152074

