# Internet of Vehicles (IoV) Network Packet Analysis (NPA) for Intrusion Detection Systems (IDS) - Machine Learning

This notebook contains the machine-learning (logistic regression) portion of this project. After passing the data through a pipeline, a model is fitted and its predictions are used to classify each byte stream as either benign or attack (binary classification).

This section requires PySpark, findspark, and pandas to be installed.

### Importing Libraries and Initiating PySpark

In [1]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.functions import array
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [2]:
# Initialise findspark so the local Spark installation can be located from this kernel.
# NOTE(review): cell In [1] already imports from pyspark before this call runs —
# confirm whether findspark.init() should instead execute ahead of those imports.
import findspark
findspark.init()

In [3]:
# Import the Spark entry points used by the next cell. If PySpark cannot be
# imported, show a hint instead of a raw traceback.
try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    # The original handler called `printmd`, which is never defined or imported in
    # this notebook, so the hint was replaced by a NameError. The built-in print
    # always works and keeps the message text unchanged.
    print('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')
    # Include the underlying reason so the failing module is visible.
    print('ImportError:', e)

In [None]:
# Create (or reuse) a SparkContext configured with the "local[*]" master.
spark_conf = SparkConf().setMaster("local[*]")
sc = SparkContext.getOrCreate(spark_conf)

# Obtain (or create) the SparkSession used for all DataFrame work below.
spark = SparkSession.builder.getOrCreate()

### Data Loading

In [5]:
# Read the attack samples, treating the first CSV row as the header,
# expose them as the temp view 'attack', and preview the rows.
attack_reader = spark.read.option('header', True)
attack_df = attack_reader.csv('./decimal_IOT_Dataset/decimal_attack.csv')
attack_df.createOrReplaceTempView('attack')
attack_df.show()

+---+------+------+------+------+------+------+------+------+------+--------+--------------+
| ID|DATA_0|DATA_1|DATA_2|DATA_3|DATA_4|DATA_5|DATA_6|DATA_7| label|category|specific_class|
+---+------+------+------+------+------+------+------+------+------+--------+--------------+
|291|     0|     0|     0|     0|     0|     0|     0|     0|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|     DoS|           DoS|
|291|    14|    11|     4|     4|     3|     3|     8|    12|ATTACK|  

In [6]:
# Read the benign samples, treating the first CSV row as the header,
# expose them as the temp view 'benign', and preview the rows.
benign_reader = spark.read.option('header', True)
benign_df = benign_reader.csv('./decimal_IOT_Dataset/decimal_benign.csv')
benign_df.createOrReplaceTempView('benign')
benign_df.show()

+----+------+------+------+------+------+------+------+------+------+--------+--------------+
|  ID|DATA_0|DATA_1|DATA_2|DATA_3|DATA_4|DATA_5|DATA_6|DATA_7| label|category|specific_class|
+----+------+------+------+------+------+------+------+------+------+--------+--------------+
|  65|    96|     0|     0|     0|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
|1068|   132|    13|   160|     0|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
| 535|   127|   255|   127|   255|   127|   255|   127|   255|BENIGN|  BENIGN|        BENIGN|
| 131|    15|   224|     0|     0|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
| 936|     1|     0|    39|    16|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
| 359|     0|   128|     0|     0|     0|     1|   227|     0|BENIGN|  BENIGN|        BENIGN|
| 369|    16|   108|     0|     0|     0|     0|     0|     0|BENIGN|  BENIGN|        BENIGN|
| 516|   192|     0|   125|     0|     0|     0|     0|     

### Data Cleansing and Processing

In [19]:
# Feature columns that are cast to integers below.
cols_to_cast = ['DATA_0','DATA_1','DATA_2','DATA_3','DATA_4','DATA_5','DATA_6','DATA_7']


def _prepare_frame(frame, int_columns):
    """Apply the shared cleansing steps to one DataFrame and return the result.

    Steps, in order: rename the existing 'label' column to 'string' (the name
    'label' is produced later by the StringIndexer stage), drop the columns that
    are not used as features, then cast every column in ``int_columns`` to int.
    """
    cleaned = frame.withColumnRenamed("label", "string")
    cleaned = cleaned.drop('ID', 'category', 'specific_class')
    for feature_name in int_columns:
        cleaned = cleaned.withColumn(feature_name, col(feature_name).cast("int"))
    return cleaned


# Both datasets go through exactly the same preparation.
attack_df = _prepare_frame(attack_df, cols_to_cast)
benign_df = _prepare_frame(benign_df, cols_to_cast)
                                              

### Test and Train Dataset Creation

In [20]:
# 70% train data, 30% test from both attack and benign datasets
split = [0.7, 0.3]

# Fixed seed so that the same rows fall into train/test on every run. Without it,
# randomSplit picks a new random seed each time and the accuracies reported
# further down cannot be reproduced.
SPLIT_SEED = 42

atk_dfs = attack_df.randomSplit(split, seed=SPLIT_SEED)
benign_dfs = benign_df.randomSplit(split, seed=SPLIT_SEED)

# Combine the matching splits of both datasets so train and test each contain
# benign and attack rows in the same 70/30 proportion.
train_df = benign_dfs[0].union(atk_dfs[0])
test_df = benign_dfs[1].union(atk_dfs[1])

# Preview both splits.
train_df.show()
test_df.show()

                                                                                

+------+------+------+------+------+------+------+------+------+
|DATA_0|DATA_1|DATA_2|DATA_3|DATA_4|DATA_5|DATA_6|DATA_7|string|
+------+------+------+------+------+------+------+------+------+
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|   

[Stage 18:>                                                         (0 + 1) / 1]

+------+------+------+------+------+------+------+------+------+
|DATA_0|DATA_1|DATA_2|DATA_3|DATA_4|DATA_5|DATA_6|DATA_7|string|
+------+------+------+------+------+------+------+------+------+
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|     0|     0|     0|     0|     0|BENIGN|
|     0|     0|     0|   

                                                                                

### Feature Engineering

In [21]:
# Pipeline stages

# Stage 1: index the textual class column 'string' into a numeric 'label' column.
indexer = StringIndexer().setInputCol('string').setOutputCol('label')

# Stage 2: assemble the eight DATA_* columns into a single 'features' vector column.
feature_columns = [
    'DATA_0', 'DATA_1', 'DATA_2', 'DATA_3',
    'DATA_4', 'DATA_5', 'DATA_6', 'DATA_7',
]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Stage 3: logistic regression classifier with the hyper-parameters used in this study.
lr = (
    LogisticRegression()
    .setMaxIter(10)
    .setRegParam(0.3)
    .setElasticNetParam(0.8)
)


In [22]:
# Chain the stages in execution order: label indexing -> feature assembly -> classifier.
pipeline_stages = [indexer, vectorAssembler, lr]
pipeline = Pipeline().setStages(pipeline_stages)

### Model Creation

In [23]:
# Fit every pipeline stage (indexer, assembler, logistic regression) on the training split.
model = pipeline.fit(train_df)

                                                                                

### Predictions on the Training Data

In [24]:
# Run the fitted pipeline over the training split; the result is evaluated below
# to obtain the training accuracy.
prediction = model.transform(train_df)

### Model Evaluation

In [25]:
# Accuracy evaluator that compares the 'label' column against the 'prediction' column.
# NOTE: the name `eval` shadows Python's built-in eval(); it is kept because the
# following cells call eval.evaluate(...).
eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [26]:
# Accuracy on the training data (`prediction` holds the train_df predictions at this point).
eval.evaluate(prediction)

                                                                                

0.869193025034954

In [27]:
# Run the fitted pipeline over the held-out test split. This re-binds `prediction`,
# so the training-set predictions are no longer available under that name.
prediction = model.transform(test_df)

In [28]:
# Accuracy evaluator for the test-set predictions (same configuration as the one
# used for the training data). NOTE: `eval` shadows Python's built-in eval().
eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [44]:
# Accuracy on the test data (`prediction` now holds the test_df predictions).
eval.evaluate(prediction)

                                                                                

0.8685373026971136

In [45]:
# Stop the Spark session now that the analysis is complete.
spark.stop()