In [1]:
import numpy as np
import random
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Obtain a SparkSession through the builder; getOrCreate() hands back an
# existing session when there is one instead of always starting a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Read the raw records from CSV. No header option is given, so the columns
# show up as _c0 .. _c41 (see the table printed by the next cell);
# inferSchema asks Spark to work out each column's type from the data.
data = spark.read.csv(
    path="./kdd.data.txt",
    inferSchema=True,
)

In [4]:
# Peek at the first three rows to sanity-check the load.
data.show(n=3)

+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1|_c2|_c3|_c4|_c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|
+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|  0|  0| 17|  9|491|  0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   2|   2| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0| 150|  25|0.17|0.03|0.17| 0.0| 0.0| 0.0|0.05| 0.0| normal|
|  0|  1| 42|  9|146|  0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|  13|   1| 0.0| 0.0| 0.0| 0.0|0.08|0.15| 0.0| 255|   1| 0.0| 0.6|0.88| 0.0| 0.0

In [5]:
#make list with features
# The first 41 CSV columns (_c0 .. _c40) are the model inputs; the last
# column, _c41, carries the class label (e.g. "normal" in the preview above).
# A plain list of str is used for the column names rather than a NumPy
# string array, which is all VectorAssembler's inputCols needs.
numFeatures = 41
features = ["_c%d" % i for i in range(numFeatures)]
label = "_c41"

In [6]:
# Encode the string class label in column `label` (_c41) as a numeric
# "label" column. The index given to each class is decided by StringIndexer
# when it is fitted; in the output shown further down, "normal" is encoded
# as 1.0 (so the other class presumably takes 0.0 -- verify on your data).
indexer = StringIndexer(outputCol="label", inputCol=label)
indexerModel = indexer.fit(data)
indexed = indexerModel.transform(data)

In [7]:
# Build the stage that packs every column named in `features` into a single
# vector column, itself called "features", as the classifiers below expect.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=features,
)

In [8]:
# Apply the assembler to the label-indexed frame; the result keeps all the
# original columns and gains the assembled "features" vector column.
dataTransform = assembler.transform(dataset=indexed)

In [9]:
# Preview three rows to confirm the new "label" and "features" columns.
dataTransform.show(n=3)

+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+-----+--------------------+
|_c0|_c1|_c2|_c3|_c4|_c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|label|            features|
+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+-----+--------------------+
|  0|  0| 17|  9|491|  0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   2|   2| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0| 150|  25|0.17|0.03|0.17| 0.0| 0.0| 0.0|0.05| 0.0| normal|  1.0|(41,[2,3,4,22,23,...|
|  0|  1| 42|  9|146|  0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0| 

In [10]:
#Decision Tree
# Repeated hold-out evaluation: train on a random 70% of the rows, score
# accuracy on the remaining 30%, and summarise the spread over all runs.
iterations = 10

# Neither the classifier nor the evaluator depends on the loop variable, so
# they are built once up front rather than on every iteration.
dt = DecisionTreeClassifier(labelCol="label",featuresCol="features")
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracyPerRunDT = []
for i in range(iterations):
    # Seed the split explicitly with the run index: `seed=` is randomSplit's
    # own knob for repeatable splits, so the outcome does not hinge on the
    # state of Python's global `random` generator.
    (trainingData, testData) = dataTransform.randomSplit([0.7, 0.3], seed=i)

    model = dt.fit(trainingData)
    pred = model.transform(testData)

    accuracy = evaluator.evaluate(pred)
    accuracyPerRunDT.append(accuracy)

# Convert once at the end instead of re-allocating with np.append per run.
accuraciesDT = np.array(accuracyPerRunDT)

minDT = np.min(accuraciesDT)
maxDT = np.max(accuraciesDT)
meanDT = np.mean(accuraciesDT)
stdDT = np.std(accuraciesDT)
print("For DT the accuracies were: min=%.3f, max=%.3f, mean=%.3f, std=%.3f" % (minDT, maxDT, meanDT,stdDT))

For DT the accuracies were: min=0.945, max=0.957, mean=0.948, std=0.003


In [11]:
#Logistic Regression
# Repeated hold-out evaluation: train on a random 70% of the rows, score
# accuracy on the remaining 30%, and summarise the spread over all runs.
iterations = 10

# Neither the classifier nor the evaluator depends on the loop variable, so
# they are built once up front rather than on every iteration.
lr = LogisticRegression(labelCol="label",featuresCol="features")
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracyPerRunLR = []
for i in range(iterations):
    # Seed the split explicitly with the run index: `seed=` is randomSplit's
    # own knob for repeatable splits, so the outcome does not hinge on the
    # state of Python's global `random` generator.
    (trainingData, testData) = dataTransform.randomSplit([0.7,0.3], seed=i)

    model = lr.fit(trainingData)
    pred = model.transform(testData)

    accuracy = evaluator.evaluate(pred)
    accuracyPerRunLR.append(accuracy)

# Convert once at the end instead of re-allocating with np.append per run.
accuraciesLR = np.array(accuracyPerRunLR)

minLR = np.min(accuraciesLR)
maxLR = np.max(accuraciesLR)
meanLR = np.mean(accuraciesLR)
stdLR = np.std(accuraciesLR)
print("For LR the accuracies were: min=%.3f, max=%.3f, mean=%.3f, std=%.3f" % (minLR, maxLR, meanLR,stdLR))

For LR the accuracies were: min=0.885, max=0.891, mean=0.888, std=0.001
