In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, save_model
from tensorflow.keras.layers import Dense, LSTM, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
import argparse
from joblib import dump
import matplotlib.pyplot as plt


parser = argparse.ArgumentParser()

# parser.add_argument('directory', 
#                     type=str, 
#                     help = 'directory where files are stored')
# args = parser.parse_args()

base_dir = '../data/Car Hacking Dataset/'


file_name = 'smart_output.csv'
data = pd.read_csv(os.path.join(base_dir, file_name))

# save_loc = os.path.join(base_dir, 'Adversarial Training Evaluation')
# os.makedirs(save_loc)

data = data.assign(IAT=data['Timestamp'].diff().fillna(0))
data.drop(['Timestamp'], axis = 1, inplace = True)

def hex_to_bin(hex_num):
    
    binary_value = bin(int(str(hex_num), 16))[2:]
    
    return binary_value

def int_to_bin(int_num):
    
    binary_value = bin(int_num)[2:]
    
    return binary_value

def pad(value, length):
    
    curr_length = len(str(value))
    
    zeros = '0' * (length - curr_length)
    
    return zeros + value

hex_to_dec = lambda x: int(x, 16)

def transform_data(data):

    data['ID'] = data['ID'].apply(hex_to_dec)
    data['Payload'] = data['Payload'].apply(hex_to_dec)

    return data

def sequencify_data(X, y, seq_size=10):
    max_index = len(X) - seq_size + 1

    X_seq = []
    y_seq = []

    for i in range(0, max_index, seq_size):
        X_seq.append(X[i:i+seq_size])  # Append the sequence from DataFrame 'X'
        y_seq.append(1 if 1 in y[i:i+seq_size] else 0)  # Check for '1' in 'y' values

    return np.array(X_seq), np.array(y_seq)

# data = transform_data(data)

X = data.drop('label', axis = 1)
y = data['label']

X_seq, y_seq = sequencify_data(X.values, y.values)

#Splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True)
X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(X_seq, y_seq, test_size = 0.2, shuffle= True)

#Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

seq_scaler = StandardScaler()
num_train_samples, seq_length, num_features = X_seq_train.shape
num_test_samples, _, _ = X_seq_test.shape

X_train_seq_reshaped = X_seq_train.reshape(num_train_samples, -1)
X_test_seq_reshaped = X_seq_test.reshape(num_test_samples, -1)

X_train_seq_scaled = seq_scaler.fit_transform(X_train_seq_reshaped)
X_test_seq_scaled = seq_scaler.transform(X_test_seq_reshaped)

# Reshape the scaled data back to the original shape
X_seq_train = X_train_seq_scaled.reshape(num_train_samples, seq_length, num_features)
X_seq_test = X_test_seq_scaled.reshape(num_test_samples, seq_length, num_features)

oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train) 

In [5]:
np.unique(y_seq, return_counts=True)

(array([0, 1]), array([93783,  8220]))

In [2]:
print(np.unique(y_train_smote, return_counts = True))

(array([0., 1.]), array([791201, 791201]))


In [4]:
##Models

print("-----MLP-------")

mlp = Sequential()
mlp.add(Input(shape = (4)))
mlp.add(Dense(128, activation = 'relu'))
mlp.add(Dense(128, activation = 'relu'))
mlp.add(Dense(1, activation = 'sigmoid'))

mlp.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

mlp_hist = mlp.fit(X_train_smote, y_train_smote, epochs=100, callbacks = [es], validation_split=0.2, batch_size = 128)

##MLP
print("-----MLP-------")

threshold = 0.5
mlp_preds = mlp.predict(X_test)
mlp_preds = (mlp_preds >= threshold).astype(int)

print("ACCURACY: ", accuracy_score(y_test, mlp_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test, mlp_preds))

with open(os.path.join(save_loc,'train_and_eval_results.txt'),'w') as file:
    file.write("-------MLP-------\n")
    file.write(f"Accuracy Score: ")
    file.write(str(accuracy_score(y_test, mlp_preds)))
    file.write("\n")
    file.write('Classification Report:\n')
    file.write(str(classification_report(y_test, mlp_preds)))
    file.write("\n\n\n\n")

mlp.save(os.path.join(save_loc, 'mlp.h5'))

plt.figure(figsize=(10, 10))
plt.plot(mlp_hist.history['loss'])
plt.plot(mlp_hist.history['val_loss'])
plt.title('MLP Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.savefig(os.path.join(save_loc,'mlp_training_history.png'))

-----MLP-------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
-----MLP-------
ACCURACY:  0.9997843221834438
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    197670
         1.0       0.99      1.00      1.00      6338

    accuracy                           1.00    204008
   macro avg       1.00      1.00      1.00    204008
weighted avg       1.00      1.00      1.00    204008



In [5]:
##LSTM

print("-----LSTM-------")

lstm = Sequential()

lstm.add(Input(shape = X_seq_train.shape[1:]))
lstm.add(LSTM(128, activation = 'relu'))
lstm.add(Dense(1, activation = 'sigmoid'))

lstm.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'])

es = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

lstm_hist = lstm.fit(X_seq_train, y_seq_train, batch_size = 128, validation_split = 0.2,
        callbacks = [es], epochs = 1000)

print("-----LSTM-------")

lstm_preds = lstm.predict(X_seq_test, batch_size=4096)
lstm_preds = (lstm_preds >= threshold).astype(int)

print("ACCURACY: ", accuracy_score(y_seq_test, lstm_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_seq_test, lstm_preds))

with open(os.path.join(save_loc,'train_and_eval_results.txt'),'a') as file:
    file.write("-------LSTM-------\n")
    file.write(f"Accuracy Score: ")
    file.write(str(accuracy_score(y_seq_test, lstm_preds)))
    file.write("\n")
    file.write('Classification Report:\n')
    file.write(str(classification_report(y_seq_test, lstm_preds)))
    file.write("\n\n\n\n")

lstm.save(os.path.join(save_loc, 'lstm.h5'))

plt.figure(figsize=(10, 10))
plt.plot(lstm_hist.history['loss'])
plt.plot(lstm_hist.history['val_loss'])
plt.title('LSTM Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.savefig(os.path.join(save_loc,'lstm_training_history.png'))

-----LSTM-------
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
-----LSTM-------
ACCURACY:  0.9991667075143376
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18771
           1       0.99      1.00      0.99      1630

    accuracy                           1.00     20401
   macro avg       1.00      1.00      1.00     20401
weighted avg       

In [6]:
## XGBOOST
xgb = XGBClassifier()
xgb.fit(X_train_smote, y_train_smote)
xgb_preds = xgb.predict(X_test)

print("-------XGBOOST-------")
print("ACCURACY: ", accuracy_score(y_test, xgb_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test, xgb_preds))
xgb.save_model(os.path.join(save_loc, 'xgb.json'))

with open(os.path.join(save_loc,'train_and_eval_results.txt'),'a') as file:
    file.write("-------XGB-------\n")
    file.write(f"Accuracy Score: ")
    file.write(str(accuracy_score(y_test, xgb_preds)))
    file.write("\n")
    file.write('Classification Report:\n')
    file.write(str(classification_report(y_test, xgb_preds)))
    file.write("\n\n\n\n")

## DECISION TREE
dt = DecisionTreeClassifier(max_depth = 4)
dt.fit(X_train_smote, y_train_smote)
dt_preds = dt.predict(X_test)

print("-------DECISION TREE--------")
print("ACCURACY: ", accuracy_score(y_test, dt_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test, dt_preds))
dump(dt, os.path.join(save_loc, 'dt.pkl'))

with open(os.path.join(save_loc,'train_and_eval_results.txt'),'a') as file:
    file.write("-------Decision Tree-------\n")
    file.write(f"Accuracy Score: ")
    file.write(str(accuracy_score(y_test, dt_preds)))
    file.write("\n")
    file.write('Classification Report:\n')
    file.write(str(classification_report(y_test, dt_preds)))
    file.write("\n\n\n\n")

## RANDOM FOREST

rf = RandomForestClassifier(n_estimators=100, max_depth=4)
rf.fit(X_train_smote, y_train_smote)
rf_preds = rf.predict(X_test)

print("-------RANDOM FOREST-------\n")
print("ACCURACY: ", accuracy_score(y_test, rf_preds))
print("CLASSIFICATION REPORT:\n", classification_report(y_test, rf_preds))
# dump(rf, os.path.join(save_loc, 'rf.pkl'))

with open(os.path.join(save_loc,'train_and_eval_results.txt'),'a') as file:
    file.write("-------Random Forest-------")
    file.write(f"Accuracy Score: ")
    file.write(str(accuracy_score(y_test, rf_preds)))
    file.write("\n")
    file.write('Classification Report:\n')
    file.write(str(classification_report(y_test, rf_preds)))
    file.write("\n\n\n\n")



-------XGBOOST-------
ACCURACY:  0.999995098231442
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    197670
         1.0       1.00      1.00      1.00      6338

    accuracy                           1.00    204008
   macro avg       1.00      1.00      1.00    204008
weighted avg       1.00      1.00      1.00    204008

-------DECISION TREE--------
ACCURACY:  0.9648690247441277
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       1.00      0.96      0.98    197670
         1.0       0.47      1.00      0.64      6338

    accuracy                           0.96    204008
   macro avg       0.73      0.98      0.81    204008
weighted avg       0.98      0.96      0.97    204008

-------RANDOM FOREST-------

ACCURACY:  0.999593153209678
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    1976