# **KDDCup Data Analytics with PySpark RDD: A structured case study**

##### data source: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html


In [1]:
########## ONLY in Colab ##########
!pip3 install pyspark
########## ONLY in Colab ##########

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=d788c962870a2bc0cc1832c3c3a4e6b95b7574723ba5bb6ae64403e0ed4cec87
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
########## ONLY in Ubuntu Machine ##########
# Load Spark engine
!pip3 install -q findspark
import findspark
findspark.init()
########## ONLY in Ubuntu Machine ##########

In [3]:
from pyspark import SparkContext, SparkConf

# Initializing Spark
conf = SparkConf().setAppName("KDDCup_PySpark").setMaster("local[*]")
sc = SparkContext(conf=conf)
print(sc)
print("Ready to go!")

<SparkContext master=local[*] appName=KDDCup_PySpark>
Ready to go!


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Read and Load Data to Spark
# Data source: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

rdd = sc.textFile("/content/drive/MyDrive/Colab Notebooks/kddcup.data.gz")


In [6]:
# Repartition and Cache Data:

rdd.repartition(10)


print(rdd.getNumPartitions) #shuffle data


rdd.persist()



<bound method RDD.getNumPartitions of /content/drive/MyDrive/Colab Notebooks/kddcup.data.gz MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0>


/content/drive/MyDrive/Colab Notebooks/kddcup.data.gz MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

## Question 1: Get ten records randomly


In [9]:
rdd.takeSample(False, 10, 1234)

['0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '1,tcp,smtp,SF,1272,364,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3,0.00,0.00,0.00,0.00,1.00,0.00,1.00,231,124,0.54,0.03,0.00,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,510,510,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,509,509,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,508,508,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,

## Question 2: Count elements

In [8]:
rdd.count()

4898431

## Question 3: Calculate the ratio of `normal` connections


In [18]:
Normal_rdd = rdd.filter(lambda line: 'normal.' in line)

ratio = Normal_rdd.count() / rdd.count()

print("the ratio of normal connections {} %" .format(round(ratio,4)*100))

the ratio of normal connections 19.86 %


## Question 4: Get the list of labels


In [21]:
Split_rdd = rdd.map(lambda line: line.split(","))

label_rdd = Split_rdd.map(lambda item: item[-1]).distinct()

label_rdd.collect()


['normal.',
 'buffer_overflow.',
 'loadmodule.',
 'perl.',
 'neptune.',
 'smurf.',
 'guess_passwd.',
 'pod.',
 'teardrop.',
 'portsweep.',
 'ipsweep.',
 'land.',
 'ftp_write.',
 'back.',
 'imap.',
 'satan.',
 'phf.',
 'nmap.',
 'multihop.',
 'warezmaster.',
 'warezclient.',
 'spy.',
 'rootkit.']

## Question 5: Count the number of connections for each label

In [None]:
#make the rdd as value/keys
Label_rdd_kv = Split_rdd.map(lambda x: (x[-1], 1))
Label_rdd_reduce = Label_rdd_kv.reduceByKey(lambda x,y: x+y)

In [30]:
#better visualization using Pandas
import pandas as pd

Keys = Label_rdd_reduce.keys().collect()
Values = Label_rdd_reduce.values().collect()


DF = pd.DataFrame(
    {
        "Label Name": Keys,
        "Number of times": Values,
    }
)

DF.sort_values(by="Number of times", ascending=False)



Unnamed: 0,Label Name,Number of times
5,smurf.,2807886
4,neptune.,1072017
0,normal.,972781
15,satan.,15892
10,ipsweep.,12481
9,portsweep.,10413
17,nmap.,2316
13,back.,2203
20,warezclient.,1020
8,teardrop.,979


## Question 6: Get the connection type with successful `root_shell` connections to servers, where the number of data bytes from source (`src_bytes`) is 500 times more than from server (`dst_bytes`)

In [32]:
Split_rdd.filter(lambda x: x[13] == '1') \
          .map(lambda x: (x[1], x[4], x[5]))\
          .filter(lambda x: int(x[2]) > int(x[1]) * 500) \
          .collect()

[('tcp', '296', '507534'),
 ('tcp', '296', '507534'),
 ('tcp', '266', '507534'),
 ('tcp', '296', '507534'),
 ('tcp', '353', '759161'),
 ('tcp', '351', '759161'),
 ('tcp', '246', '866032'),
 ('tcp', '317', '394616'),
 ('tcp', '262', '744605'),
 ('tcp', '173', '744605'),
 ('tcp', '262', '744605'),
 ('tcp', '255', '574784'),
 ('tcp', '433', '1524348'),
 ('tcp', '1794', '3851730'),
 ('tcp', '0', '2072'),
 ('tcp', '224', '2776333'),
 ('tcp', '0', '2072'),
 ('tcp', '465', '320362'),
 ('tcp', '0', '2072'),
 ('tcp', '0', '2072')]

## Question 7:  Get the list of `Protocols`that are `normal` and `vulnerable to attacks`, where there is NOT `guest login` to the destination addresses


In [33]:
normal_protocols_rdd = Split_rdd.filter(lambda line: "normal" in line[-1] and line[21] !='1') \
         .map(lambda line: (line[1], 1)).reduceByKey(lambda x,y: x+y)

attack_protocols_rdd = Split_rdd.filter(lambda line: "normal" not in line[-1] and line[21] !='1') \
         .map(lambda line: (line[1], 1)).reduceByKey(lambda x,y: x+y)

normal_KeyValue = pd.DataFrame({'Label': normal_protocols_rdd.keys().collect(), 'State': 'normal', 'Count': normal_protocols_rdd.values().collect()})
attack_KeyValue = pd.DataFrame({'Label': attack_protocols_rdd.keys().collect(), 'State': 'attack', 'Count': attack_protocols_rdd.values().collect()})

results = normal_KeyValue.append(attack_KeyValue)
results.sort_values(by = "Label", ascending=False)

  results = normal_KeyValue.append(attack_KeyValue)


Unnamed: 0,Label,State,Count
1,udp,normal,191348
2,udp,attack,2940
0,tcp,normal,764894
0,tcp,attack,1101613
2,icmp,normal,12763
1,icmp,attack,2820782



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



## Question 8: Get a summary statistics for the sum of `tcp` connections to the same destination IP address (hint: `protocol_type` and `dst_host_count` features)

In [34]:
# Source: https://spark.apache.org/docs/latest/mllib-statistics.html


from pyspark.mllib.stat import Statistics
from math import sqrt

summary = Statistics.colStats(Split_rdd.filter(lambda line: line[1] == "tcp").map(lambda line: [int(line[31])])) # the input should be a "vector"

tcp_mean = round(float(summary.mean()),3)
tcp_std = round(float(sqrt(summary.variance())),3)
tcp_min = round(float(summary.min()),3)
tcp_max = round(float(summary.max()),3)

print([tcp_mean, tcp_std, tcp_min, tcp_max])

[201.752, 90.726, 0.0, 255.0]


  tcp_mean = round(float(summary.mean()),3)
  tcp_std = round(float(sqrt(summary.variance())),3)
  tcp_min = round(float(summary.min()),3)
  tcp_max = round(float(summary.max()),3)
