### Network Intrusion Detection - MultiLabel DecisionTree

Import and initialize findspark followed by importing pyspark 

In [1]:
import findspark
findspark.init('/usr/local/spark')
import pyspark

Create SparkContext in the variable sc

In [2]:
sc = pyspark.SparkContext(appName='networkIntrusionDecisionTreeMultiLabel')

Import the required libraries

In [3]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

from pyspark.mllib.regression import LabeledPoint
from numpy import array

from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.mllib.linalg import DenseVector

Read the input data file and create the RDD

In [4]:
networkData = sc.textFile("kddcup.data_10_percent")
networkData.count()

494021

Get the total number of fields from line of the input data which is the comma separated.

In [5]:
numfields = len(networkData.take(1)[0].split(","))
print(numfields)

42


Get the distinct values of last field from the input lines.
Create a dictionary assigning a number with the help of zipWithIndex, so the key will be the string from the last field and the value will be the unique number assigned to each string.

In [6]:
labels = networkData.map(lambda line: (line.split(",")[numfields-1])).distinct()
labels.count()

23

In [7]:
for x in labels.collect():
    print(x)

multihop.
rootkit.
warezclient.
ipsweep.
teardrop.
warezmaster.
pod.
perl.
back.
land.
portsweep.
guess_passwd.
smurf.
satan.
imap.
ftp_write.
loadmodule.
nmap.
neptune.
buffer_overflow.
normal.
spy.
phf.


In [8]:
labelDict = dict(labels.zipWithIndex().collect())

In [9]:
for k, v in labelDict.items():
    print(k, v)

warezmaster. 5
rootkit. 1
ftp_write. 15
guess_passwd. 11
normal. 20
ipsweep. 3
teardrop. 4
perl. 7
nmap. 17
smurf. 12
loadmodule. 16
phf. 22
pod. 6
spy. 21
land. 9
multihop. 0
portsweep. 10
satan. 13
imap. 14
neptune. 18
back. 8
warezclient. 2
buffer_overflow. 19


Define a function to create a LabeledPoint from a comma delimited line

In [10]:
def get_cleanline(pstr):
    line = pstr.split(",")
    data = [line[0]]+line[4:numfields-1]
    return LabeledPoint(labelDict[line[numfields-1]], array([float(x) for x in data]))


Run it as a lambda function on the input data. This will create an RDD in which each element is a LabeledPoint.
And display the label and the features which is a vector.

In [40]:
cleandata=networkData.map(lambda line: get_cleanline(line))
type(cleandata)

pyspark.rdd.PipelinedRDD

In [12]:
for line in cleandata.take(10):
    print(line)

(20.0,[0.0,181.0,5450.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,9.0,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0])
(20.0,[0.0,239.0,486.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19.0,19.0,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0])
(20.0,[0.0,235.0,1337.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29.0,29.0,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0])
(20.0,[0.0,219.0,1337.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39.0,39.0,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0])
(20.0,[0.0,217.0,2032.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49.0,49.0,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0])
(20.0,[0.0,217.0,2032.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,59.0,59.0,1.0,0.0,0.02,0.0,0.0,0.0

In [13]:
cleandata.first().label

20.0

In [14]:
cleandata.first().features

DenseVector([0.0, 181.0, 5450.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.0, 8.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 9.0, 9.0, 1.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.0])

Split this RDD into training and testing portions and cache them

In [15]:
(trainingData, testData) = cleandata.randomSplit([0.9, 0.1])

In [16]:
trainingData.cache()
testData.cache()

PythonRDD[16] at RDD at PythonRDD.scala:49

Build a multi class label Decisiontree model with the training data. Used gini as the impurity parameter. 

In [17]:
model = DecisionTree.trainClassifier(trainingData, numClasses=labels.count(), categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=100)

Run the model on test data using the predict method 

In [41]:
predictions = model.predict(testData.map(lambda x: x.features))

Get the predicted values along with the the labels

In [19]:
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

In [20]:
for line in labelsAndPredictions.take(10):
    print(line)

(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)


Get the metrics from the predicted values

In [21]:
metrics = MulticlassMetrics(labelsAndPredictions)

In [22]:
metrics.confusionMatrix()

DenseMatrix(7, 7, [110.0, 0.0, 0.0, 0.0, 0.0, 0.0, 13.0, 0.0, ..., 55.0, 10.0, 0.0, 2.0, 0.0, 0.0, 0.0, 9740.0], 0)

In [23]:
metrics.accuracy

0.9913400552935436

Display the decision tree

In [24]:
print('Learned classification tree model:')
print(model.toDebugString())

Learned classification tree model:
DecisionTreeModel classifier of depth 4 with 23 nodes
  If (feature 20 <= 68.0)
   If (feature 25 <= 0.48)
    If (feature 31 <= 0.14)
     If (feature 1 <= 0.0)
      Predict: 18.0
     Else (feature 1 > 0.0)
      Predict: 20.0
    Else (feature 31 > 0.14)
     If (feature 1 <= 28.0)
      Predict: 13.0
     Else (feature 1 > 28.0)
      Predict: 20.0
   Else (feature 25 > 0.48)
    If (feature 9 <= 0.0)
     If (feature 33 <= 0.48)
      Predict: 20.0
     Else (feature 33 > 0.48)
      Predict: 3.0
    Else (feature 9 > 0.0)
     If (feature 1 <= 1907.0)
      Predict: 20.0
     Else (feature 1 > 1907.0)
      Predict: 8.0
  Else (feature 20 > 68.0)
   If (feature 1 <= 519.0)
    If (feature 1 <= 28.0)
     If (feature 1 <= 17.0)
      Predict: 13.0
     Else (feature 1 > 17.0)
      Predict: 4.0
    Else (feature 1 > 28.0)
     Predict: 20.0
   Else (feature 1 > 519.0)
    Predict: 12.0



Build another model this time with entropy as the impurity parameter. And perform the same tasks as above.

In [25]:
model2 = DecisionTree.trainClassifier(trainingData, numClasses=labels.count(), categoricalFeaturesInfo={}, impurity='entropy', maxDepth=4, maxBins=100)

In [26]:
predictions2 = model2.predict(testData.map(lambda x: x.features))

In [27]:
labelsAndPredictions2 = testData.map(lambda lp: lp.label).zip(predictions2)

In [28]:
for line in labelsAndPredictions2.take(10):
    print(line)

(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)
(20.0, 20.0)


In [29]:
metrics2 = MulticlassMetrics(labelsAndPredictions2)

In [30]:
metrics2.confusionMatrix()

DenseMatrix(6, 6, [42.0, 0.0, 0.0, 0.0, 0.0, 71.0, 0.0, 205.0, ..., 10678.0, 55.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9752.0], 0)

In [31]:
metrics2.accuracy

0.9893681899495853

In [32]:
print('Learned classification tree model2:')
print(model2.toDebugString())

Learned classification tree model2:
DecisionTreeModel classifier of depth 4 with 23 nodes
  If (feature 20 <= 68.0)
   If (feature 25 <= 0.48)
    If (feature 31 <= 0.1)
     If (feature 1 <= 0.0)
      Predict: 18.0
     Else (feature 1 > 0.0)
      Predict: 20.0
    Else (feature 31 > 0.1)
     If (feature 1 <= 7.0)
      Predict: 13.0
     Else (feature 1 > 7.0)
      Predict: 20.0
   Else (feature 25 > 0.48)
    If (feature 6 <= 0.0)
     If (feature 1 <= 28.0)
      Predict: 20.0
     Else (feature 1 > 28.0)
      Predict: 20.0
    Else (feature 6 > 0.0)
     If (feature 1 <= 1907.0)
      Predict: 20.0
     Else (feature 1 > 1907.0)
      Predict: 8.0
  Else (feature 20 > 68.0)
   If (feature 1 <= 519.0)
    If (feature 1 <= 28.0)
     If (feature 1 <= 17.0)
      Predict: 13.0
     Else (feature 1 > 17.0)
      Predict: 4.0
    Else (feature 1 > 28.0)
     Predict: 20.0
   Else (feature 1 > 519.0)
    Predict: 12.0



Using the model to predict for new set of data points.

* Write functions to load a file, parse and classify new data using the predict method of the given model.
* Create a new dict by reversing the key, values of label dictionary so that the names of the predicted values can be displayed instead of only the corresponding the number.

In [33]:
def get_currentline(pstr):
    cline = pstr.split(",")
    return [cline[0]]+cline[4:numfields-2]


In [34]:
def classify(filename, pmodel):
    currentData = sc.textFile(filename)
    fields = currentData.map(lambda gline: get_currentline(gline))
    return pmodel.predict(fields.map(lambda x: x))

In [35]:
labelDictRev = dict((v,k) for k, v in labelDict.items())

In [36]:
# Alternate syntax to reverse key, values of a dictionary
# labelDictRev2 = {v: k for k, v in labelDict.items()}

In [37]:
for k, v in labelDictRev.items():
    print(k, v)


0 multihop.
1 rootkit.
2 warezclient.
3 ipsweep.
4 teardrop.
5 warezmaster.
6 pod.
7 perl.
8 back.
9 land.
10 portsweep.
11 guess_passwd.
12 smurf.
13 satan.
14 imap.
15 ftp_write.
16 loadmodule.
17 nmap.
18 neptune.
19 buffer_overflow.
20 normal.
21 spy.
22 phf.


Classify new data given in current.txt file using the model with the help of above functions.

In [38]:
cLabelsAndPredictions = classify("current.txt", model)

In [39]:
for pred in cLabelsAndPredictions.collect():
    print(pred, labelDictRev[int(pred)])

20.0 normal.
20.0 normal.
20.0 normal.
18.0 neptune.
20.0 normal.
3.0 ipsweep.
20.0 normal.
18.0 neptune.
20.0 normal.
20.0 normal.


We can pass model2 also as the parameter to classify function and do the predictions if we find the accuracy of model2 to be better.