In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.losses import SparseCategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from copy import deepcopy
from sklearn.utils import class_weight
import joblib
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

2024-10-15 09:49:17.281833: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
smart_attack_path = 'attack_10_10.csv'

In [3]:
def hex_to_bin(hex_num):
    
    binary_value = bin(int(str(hex_num), 16))[2:]
    
    return binary_value

def int_to_bin(int_num):
    
    binary_value = bin(int_num)[2:]
    
    return binary_value

def pad(value, length):
    
    curr_length = len(str(value))
    
    zeros = '0' * (length - curr_length)
    
    return zeros + value

hex_to_dec = lambda x: int(x, 16)

def transform_data(data):

    data['ID'] = data['ID'].apply(hex_to_dec)
    data['Payload'] = data['Payload'].apply(hex_to_dec)

    return data

In [4]:
def shift_columns(df):
    
    for dlc in [2,5,6]:

        df.loc[df['dlc'] == dlc, df.columns[3:]] = df.loc[df['dlc'] == dlc, df.columns[3:]].shift(periods=8-dlc, axis='columns', fill_value='00')

    return df

In [5]:
def read_attack_data(data_path):
    
    columns = ['timestamp','can_id', 'dlc', 'data0', 'data1', 'data2', 'data3', 'data4', 
           'data5', 'data6', 'data7', 'flag']
    
    data = pd.read_csv(data_path, names = columns)

    data = shift_columns(data)
    
    ##Replacing all NaNs with '00' 
    data = data.replace(np.NaN, '00')
    
    ##Joining all data columns to put all data in one column
    data_cols = ['data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']
    
    ##The data column is in hexadecimal
    data['data'] = data[data_cols].apply(''.join, axis=1)
    data.drop(columns = data_cols, inplace = True, axis = 1)
    
    ##Converting columns to decimal
    data['can_id'] = data['can_id'].apply(hex_to_dec)
    data['data'] = data['data'].apply(hex_to_dec)

    data = data.assign(IAT=data['timestamp'].diff().fillna(0))
    
    return data[:150_000]

    

In [6]:
smart_attack = pd.read_csv(smart_attack_path)

In [7]:
def sequencify_data(X, y, seq_size=10):
    max_index = len(X) - seq_size + 1

    X_seq = []
    y_seq = []

    for i in range(0, max_index, seq_size):
        X_seq.append(X[i:i+seq_size])  # Append the sequence from DataFrame 'X'
        try:
            y_seq.append(1 if 1 in y[i:i+seq_size].values else 0)  # Check for '1' in 'y' values
        except:
             y_seq.append(1 if 1 in y[i:i+seq_size] else 0)

    return np.array(X_seq), np.array(y_seq)

In [8]:
smart_attack.drop(columns = ['Timestamp'], inplace = True)

In [9]:
X_smart = smart_attack.drop(['label'], axis = 1).values
y_smart = smart_attack['label']
y_tri = y_smart.copy(deep = True)
y_smart = y_smart.replace(2,1)

In [10]:
np.unique(y_smart, return_counts = True)

(array([0., 1.]), array([100000,  47462]))

In [11]:
X_seq_smart, y_seq_smart = sequencify_data(X_smart, y_smart)

In [12]:
print(np.unique(y_seq_smart, return_counts=True))

(array([0, 1]), array([ 1350, 13396]))


In [13]:
def balance_data(X_seq, y_seq):
    # Get indices for label 0 and label 1
    zero_indices = np.where(y_seq == 0)[0]
    one_indices = np.where(y_seq == 1)[0]

    # Find the number of samples for label 0
    num_zeros = len(zero_indices)

    # Randomly sample an equal number of samples from label 1
    np.random.seed(42)  # Set seed for reproducibility
    sampled_one_indices = np.random.choice(one_indices, num_zeros, replace=False)

    # Combine the indices of label 0 and sampled label 1
    balanced_indices = np.concatenate([zero_indices, sampled_one_indices])

    # Shuffle the balanced indices to avoid any ordering issues
    np.random.shuffle(balanced_indices)

    # Subset X_seq and y_seq based on the balanced indices
    X_seq_balanced = X_seq[balanced_indices]
    y_seq_balanced = y_seq[balanced_indices]

    return X_seq_balanced, y_seq_balanced


In [14]:
# X_seq_smart, y_seq_smart = balance_data(X_seq_smart, y_seq_smart)

In [15]:
print(np.unique(y_seq_smart, return_counts=True))

(array([0, 1]), array([ 1350, 13396]))


In [16]:
X_train_smart, X_test_smart, y_train_smart, y_test_smart = train_test_split(X_smart, y_smart, test_size=0.3, random_state = 42)
X_train_seq_smart, X_test_seq_smart, y_train_seq_smart, y_test_seq_smart = train_test_split(X_seq_smart, y_seq_smart, test_size = 0.3, shuffle = True)

In [17]:
X_train_seq_smart.shape

(10322, 10, 4)

In [18]:
scaler = joblib.load('scaler_m0.sav')

X_train_smart = scaler.fit_transform(X_train_smart)
X_test_smart = scaler.transform(X_test_smart)

In [19]:
train_means = np.mean(X_train_seq_smart, axis=(0, 1))  # Mean of each feature across training samples and timesteps
train_stds = np.std(X_train_seq_smart, axis=(0, 1))    # Standard deviation of each feature across training samples and timesteps

# Handle case where std is zero (to avoid division by zero)
train_stds[train_stds == 0] = 1e-8

# Standardize the training set
X_train_seq_smart = (X_train_seq_smart - train_means) / train_stds

# Standardize the test set using the training set's mean and std
X_test_seq_smart = (X_test_seq_smart - train_means) / train_stds

In [20]:
np.shape(X_train_seq_smart)

(10322, 10, 4)

In [21]:
np.shape(X_test_seq_smart)

(4424, 10, 4)

In [22]:
np.shape(X_train_smart)

(103223, 4)

In [23]:
# class_weights = class_weight.compute_class_weight('balanced',
#                                                  classes = np.unique(y_train_smart),
#                                                  y = y_train_smart)
# class_weights = dict(enumerate(class_weights))


seq_class_weights = class_weight.compute_class_weight('balanced',
                                                 classes = np.unique(y_train_seq_smart),
                                                 y = y_train_seq_smart)
seq_class_weights = dict(enumerate(seq_class_weights))

In [24]:
print(seq_class_weights)

{0: 5.508004268943436, 1: 0.5499200852424081}


In [25]:
oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train_smart, y_train_smart) 
# X_train_smote, y_train_smote = X_train, y_train_smart

In [26]:
np.unique(y_train_smote, return_counts = True)

(array([0., 1.]), array([70127, 70127]))

In [27]:
##MLP

print("-----MLP-------")

mlp = load_model('mlp_m0.h5')

mlp.compile(optimizer='adam',
                loss=BinaryCrossentropy(from_logits=False),
                metrics=['accuracy'])

es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

mlp_hist = mlp.fit(X_train_smote, y_train_smote, epochs=100, callbacks = [es], validation_split=0.2, batch_size = 32)

-----MLP-------
Epoch 1/100


2024-10-15 09:49:19.395060: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-15 09:49:19.399757: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


In [28]:
##MLP
print("-----MLP-------")
threshold = 0.5
mlp_preds = mlp.predict(X_test_smart, batch_size = 32)
mlp_preds = (mlp_preds >= threshold).astype(int)

print("--------Smart--------")
print("ACCURACY: ", accuracy_score(y_test_smart, mlp_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_smart, mlp_preds))


-----MLP-------
 195/1383 [===>..........................] - ETA: 2s

--------Smart--------
ACCURACY:  0.6421935396369719
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       0.78      0.65      0.71     29873
         1.0       0.46      0.62      0.53     14366

    accuracy                           0.64     44239
   macro avg       0.62      0.64      0.62     44239
weighted avg       0.68      0.64      0.65     44239



In [29]:
##LSTM

print("-----LSTM-------")

lstm = load_model('lstm_m0_weighted.h5')

lstm.compile(
    loss = BinaryCrossentropy(from_logits = False),
    optimizer = Adam(learning_rate = 0.001),
    metrics = ['accuracy'])

es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)
lstm_hist = lstm.fit(X_train_seq_smart, y_train_seq_smart, batch_size = 32, validation_split = 0.2,
        callbacks = [es], epochs = 1000, class_weight=seq_class_weights)


-----LSTM-------
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000


In [None]:
print("-----LSTM-------")

lstm_preds = lstm.predict(X_test_seq_smart, batch_size=32)
lstm_preds = (lstm_preds >= threshold).astype(int)

print("--------Smart--------")
print("ACCURACY: ", accuracy_score(y_test_seq_smart, lstm_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_seq_smart, lstm_preds))

-----LSTM-------
--------Smart--------
ACCURACY:  0.9579566003616636
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.74      0.85      0.79       413
           1       0.98      0.97      0.98      4011

    accuracy                           0.96      4424
   macro avg       0.86      0.91      0.88      4424
weighted avg       0.96      0.96      0.96      4424



In [31]:
dt = joblib.load('dt_m0.pkl')
dt.fit(X_train_smote, y_train_smote)

dt_preds = dt.predict(X_test_smart)

print("--------Smart--------")
print("ACCURACY: ", accuracy_score(y_test_smart, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test_smart, dt_preds))


--------Smart--------
ACCURACY:  0.7346685051651258
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       0.77      0.87      0.82     29873
         1.0       0.63      0.45      0.52     14366

    accuracy                           0.73     44239
   macro avg       0.70      0.66      0.67     44239
weighted avg       0.72      0.73      0.72     44239

