# Internet of Vehicles (IoV) Network Packet Analysis (NPA) for Intrusion Detection Systems (IDS) - DeepLearning

This notebook contains the DeepLearning portion of the this project. After pipelining the data, a model was created using dense layers with relu and sigmoid activation for binary classification of attack and benign byte streams.

This section requires the installation of PySpark, FindSpark, tensorflow, sklearn and pandas

### Importing Libraries and Initiating PySpark

In [None]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.functions import array
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
import sklearn
from sklearn.preprocessing import MinMaxScaler #
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import LSTM, Dense, Activation

In [2]:
import findspark
findspark.init()

In [3]:
try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    printmd('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')

In [None]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

### Data Loading

In [5]:
attack_df = spark.read.csv('./decimal_IOT_Dataset/decimal_attack.csv', header = True)
attack_df.createOrReplaceTempView('attack')
attack_df.show()

                                                                                

+---+------+------+------+------+------+------+------+------+------+--------+--------------+
| ID|DATA_0|DATA_1|DATA_2|DATA_3|DATA_4|DATA_5|DATA_6|DATA_7| label|category|specific_class|
+---+------+------+------+------+------+------+------+------+------+--------+--------------+
|291|     0|     0|     0|     0|     0|     0|     0|     0|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|  

In [6]:
benign_df = spark.read.csv('./decimal_IOT_Dataset/decimal_benign.csv', header = True)
benign_df.createOrReplaceTempView('benign')
benign_df.show()

+----+------+------+------+------+------+------+------+------+------+--------+--------------+
|  ID|DATA_0|DATA_1|DATA_2|DATA_3|DATA_4|DATA_5|DATA_6|DATA_7| label|category|specific_class|
+----+------+------+------+------+------+------+------+------+------+--------+--------------+
|  65|    96|     0|     0|     0|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
|1068|   132|    13|   160|     0|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
| 535|   127|   255|   127|   255|   127|   255|   127|   255|BENIGN|  BENIGN|        BENIGN|
| 131|    15|   224|     0|     0|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
| 936|     1|     0|    39|    16|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
| 359|     0|   128|     0|     0|     0|     1|   227|     0|BENIGN|  BENIGN|        BENIGN|
| 369|    16|   108|     0|     0|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
| 516|   192|     0|   125|     0|     0|     0|     0|     

### Data Cleaning and Processing

In [21]:
# renaming an existing label column for downstream feature engineering
attack_df = attack_df.withColumnRenamed("label", "string")
benign_df = benign_df.withColumnRenamed("label", "string")

# dropping irrelevant columns
attack_df = attack_df.drop('ID', 'category', 'specific_class')
benign_df = benign_df.drop('ID', 'category', 'specific_class')

# changing data types of feature columns
cols_to_cast = ['DATA_0','DATA_1','DATA_2','DATA_3','DATA_4','DATA_5','DATA_6','DATA_7']
for col_name in cols_to_cast:
    attack_df = attack_df.withColumn(col_name, col(col_name).cast("int"))
    benign_df = benign_df.withColumn(col_name, col(col_name).cast("int"))
                                              

### Test and Train Dataset Creation

In [22]:
# 70% train data, 30% test from both attack and benign datasets
split = [0.7,0.3]
atk_dfs = attack_df.randomSplit(split)
benign_dfs = benign_df.randomSplit(split)

# combine splits from both datasets into new train and test datasets with equal ratios
train_df = benign_dfs[0].union(atk_dfs[0])
test_df = benign_dfs[1].union(atk_dfs[1])

train_df.show()
test_df.show()

                                                                                

+------+------+------+------+------+------+------+------+------+
|DATA_0|DATA_1|DATA_2|DATA_3|DATA_4|DATA_5|DATA_6|DATA_7|string|
+------+------+------+------+------+------+------+------+------+
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|   

[Stage 25:>                                                         (0 + 1) / 1]

+------+------+------+------+------+------+------+------+------+
|DATA_0|DATA_1|DATA_2|DATA_3|DATA_4|DATA_5|DATA_6|DATA_7|string|
+------+------+------+------+------+------+------+------+------+
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|   

                                                                                

### Feature Engineering

In [23]:
indexer = StringIndexer(inputCol = 'string', outputCol = 'label')

train_data = indexer.fit(train_df).transform(train_df)
test_data = indexer.fit(test_df).transform(test_df)


                                                                                

In [27]:
pandas_df = train_data.toPandas()

# Convert Pandas DataFrame to NumPy array
numpy_array = pandas_df.to_numpy()

X_train = numpy_array[:, :-1]  # Input features (all columns except the last one)
y_train = numpy_array[:, -1]   # Target variable (last column)

                                                                                

In [28]:
pandas_df = test_data.toPandas()

# Convert Pandas DataFrame to NumPy array
numpy_array = pandas_df.to_numpy()

X_test = numpy_array[:, :-1]  # Input features (all columns except the last one)
y_test = numpy_array[:, -1]   # Target variable (last column)

                                                                                

In [29]:
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

### Model Creation

In [None]:
model = Sequential()
model.add(Dense(64, input_shape = (X_train.shape[1],), activation = 'relu'))
model.add(Dense(8, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

### Model Training

In [31]:
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

Epoch 1/10
[1m12306/12306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - accuracy: 0.9999 - loss: 0.0015 - val_accuracy: 0.3438 - val_loss: 123.5675
Epoch 2/10
[1m12306/12306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - accuracy: 1.0000 - loss: 4.8553e-09 - val_accuracy: 0.3438 - val_loss: 147.3027
Epoch 3/10
[1m12306/12306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - accuracy: 1.0000 - loss: 8.0410e-11 - val_accuracy: 0.3438 - val_loss: 155.3538
Epoch 4/10
[1m12306/12306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - accuracy: 1.0000 - loss: 2.6039e-11 - val_accuracy: 0.3438 - val_loss: 158.9601
Epoch 5/10
[1m12306/12306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - accuracy: 1.0000 - loss: 1.5425e-11 - val_accuracy: 0.3438 - val_loss: 161.2300
Epoch 6/10
[1m12306/12306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - accuracy: 1.0000 - loss: 1.0836e-11 - val_accu

<keras.src.callbacks.history.History at 0x19609baa0>

### Model Evaluation

In [32]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m13243/13243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1ms/step - accuracy: 0.9911 - loss: 2.5252
Test Loss: 33.67734909057617, Test Accuracy: 0.8695608973503113


In [None]:
spark.stop()