In [25]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import Evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [26]:
# Attach to the already-running Spark session if there is one; otherwise
# create a new session using the builder's default configuration.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [27]:
# Read kdd.data.txt as CSV. No header option is given, so the columns get the
# positional names _c0 ... _c41; inferSchema asks Spark to detect each
# column's type from the data instead of loading everything as strings.
data = spark.read.csv(
    path="./kdd.data.txt",
    inferSchema=True,
)

In [28]:
# Preview the loaded frame (first 20 rows, long cell values truncated --
# these are the values show() uses when called with no arguments).
data.show(n=20, truncate=True)

+---+---+---+---+---+-----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1|_c2|_c3|_c4|  _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|
+---+---+---+---+---+-----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|  0|  0| 17|  9|491|    0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   2|   2| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0| 150|  25|0.17|0.03|0.17| 0.0| 0.0| 0.0|0.05| 0.0| normal|
|  0|  1| 42|  9|146|    0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|  13|   1| 0.0| 0.0| 0.0| 0.0|0.08|0.15| 0.0| 255|   1| 0.0| 0.6|0.88

In [29]:
# Columns _c0 ... _c40 are the 41 model inputs; _c41 holds the class name
# (e.g. "normal") and is used as the label.
# The names are built as a plain Python list of str -- the type that
# `inputCols` parameters expect -- rather than a NumPy string array.
features = ["_c{}".format(i) for i in range(41)]
label = "_c41"

In [30]:
# Encode the string class names in the label column as numeric indices,
# written to a new column called "label". The indexer is fitted on the full
# data set and then applied to that same data set.
indexer = StringIndexer(outputCol="label", inputCol=label)
label_index_model = indexer.fit(data)
indexed = label_index_model.transform(data)

In [31]:
# Transformer that packs the columns listed in `features` into a single
# vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=features,
)

In [32]:
# Append the assembled "features" vector column to the label-indexed frame.
dataTransform = assembler.transform(dataset=indexed)

In [33]:
# Preview the frame with the added "label" and "features" columns
# (first 20 rows, long cell values truncated -- the no-argument defaults).
dataTransform.show(n=20, truncate=True)

+---+---+---+---+---+-----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+-----+--------------------+
|_c0|_c1|_c2|_c3|_c4|  _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|label|            features|
+---+---+---+---+---+-----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+-----+--------------------+
|  0|  0| 17|  9|491|    0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   2|   2| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0| 150|  25|0.17|0.03|0.17| 0.0| 0.0| 0.0|0.05| 0.0| normal|  1.0|(41,[2,3,4,22,23,...|
|  0|  1| 42|  9|146|    0|  0|  0|  0|  0|   0|   0|   0|   0| 

In [34]:
# Decision-tree classifier reading its inputs from the "features" vector
# column and its target from the numeric "label" column; all other
# hyper-parameters are left at their defaults.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

In [35]:
# Randomly split the prepared rows into roughly 70% training / 30% test.
# A fixed seed is passed so that re-running the notebook over the same input
# reproduces the same split, and therefore the same evaluation result;
# without it every run draws a different split.
(trainingData, testData) = dataTransform.randomSplit([0.7, 0.3], seed=42)

In [36]:
# Train the decision tree on the training portion only.
model = dt.fit(dataset=trainingData)

In [37]:
# Score the held-out rows with the fitted model.
pred = model.transform(dataset=testData)

In [39]:
# Evaluator that compares the "prediction" column against the "label" column
# and reports the "accuracy" metric.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)

In [40]:
# Compute the accuracy metric over the test-set predictions and print it.
accuracy = evaluator.evaluate(dataset=pred)
print(accuracy)

0.9469344608879493
