In [5]:
import pyspark

from pyspark.sql import SparkSession

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=0c881e3efaedab9d10ba0304697b7adf502199b07aa9d0a14687eafad407146b
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [6]:
# Create (or reuse) the notebook's Spark session, requesting 6g of driver
# memory through the "spark.driver.memory" setting.
spark = (
    SparkSession.builder
    .appName('chapter_5')
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

# Identifying Anomalous Network Traffic

In [7]:
# Print only the first line of the raw data file to see what one record looks like.
!head -n 1 /content/kddcup.data.corrected

0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal.


# A First Take on Clustering

In [8]:
# The file carries no header row, so read it with Spark-inferred column types
# first and attach the column names afterwards.
data_without_header = (
    spark.read
    .option("header", False)
    .option("inferSchema", True)
    .csv("/content/kddcup.data.corrected")
)

# Column names in file order; the final column holds the record's label.
column_names = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "label",
]

# Same rows, now addressable by name.
data = data_without_header.toDF(*column_names)

In [9]:
from pyspark.sql.functions import col

# Tally the records per label and print the most frequent labels first
# (at most 25 rows).
(
    data.select("label")
    .groupBy("label")
    .count()
    .orderBy(col("count").desc())
    .show(25)
)

+----------------+-------+
|           label|  count|
+----------------+-------+
|          smurf.|2661606|
|        neptune.| 860307|
|         normal.| 784172|
|          satan.|  15888|
|        ipsweep.|  11167|
|      portsweep.|   8609|
|           nmap.|   2316|
|           back.|   2103|
|    warezclient.|   1020|
|       teardrop.|    879|
|            pod.|    242|
|   guess_passwd.|     53|
|    warezmaster.|     20|
|           land.|     18|
|buffer_overflow.|     17|
|           imap.|     12|
|     loadmodule.|      9|
|      ftp_write.|      8|
|        rootkit.|      8|
|       multihop.|      7|
|           perl.|      3|
|            phf.|      3|
|            spy.|      2|
+----------------+-------+



In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml import Pipeline
from pprint import pprint

# Fixed seed for K-means initialisation so that a fresh "Restart & Run All"
# reproduces the same cluster centres and assignments.
KMEANS_SEED = 42

# Drop the three string-valued columns; the frame is cached because it is
# used both for fitting here and for scoring in a later cell.
numeric_only = data.drop("protocol_type", "service", "flag").cache()

# Every remaining column except the trailing "label" feeds the feature vector.
assembler = (
    VectorAssembler()
    .setInputCols(numeric_only.columns[:-1])
    .setOutputCol("featureVector")
)

# The number of clusters is left at the library default; only the seed and
# the input/output column names are configured.
kmeans = (
    KMeans()
    .setSeed(KMEANS_SEED)
    .setPredictionCol("cluster")
    .setFeaturesCol("featureVector")
)

pipeline = Pipeline().setStages([assembler, kmeans])
pipeline_model = pipeline.fit(numeric_only)

# The fitted K-means model is the last of the two pipeline stages.
kmeans_model = pipeline_model.stages[-1]

pprint(kmeans_model.clusterCenters())

[array([2.15322734e+01, 1.99169416e+03, 8.40737098e+02, 5.74915120e-06,
       6.57702897e-04, 8.96867586e-06, 1.24804874e-02, 3.44949072e-05,
       1.41891811e-01, 8.31534232e-03, 7.12894748e-05, 3.49548393e-05,
       1.30482736e-02, 1.11694509e-03, 7.40490674e-05, 9.23773614e-04,
       0.00000000e+00, 2.29966048e-07, 8.21438723e-04, 3.50975836e+02,
       3.14619054e+02, 1.98776160e-01, 1.98845736e-01, 1.55306421e-02,
       1.55835158e-02, 8.10746279e-01, 1.85234133e-02, 2.77148619e-02,
       2.33300882e+02, 1.96904250e+02, 7.83367846e-01, 2.34704222e-02,
       6.36116642e-01, 6.43600919e-03, 1.98897793e-01, 1.98700165e-01,
       1.57072951e-02, 1.55297268e-02]),
 array([1.0999000e+04, 0.0000000e+00, 1.3099374e+09, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00

In [11]:
# Assign each record to a cluster with the fitted pipeline.
with_cluster = pipeline_model.transform(numeric_only)

# Count labels inside each cluster: clusters in ascending order, and within a
# cluster the most frequent labels first (at most 25 rows printed).
(
    with_cluster.select("cluster", "label")
    .groupBy("cluster", "label")
    .count()
    .orderBy(col("cluster"), col("count").desc())
    .show(25)
)

+-------+----------------+-------+
|cluster|           label|  count|
+-------+----------------+-------+
|      0|          smurf.|2661606|
|      0|        neptune.| 860307|
|      0|         normal.| 784172|
|      0|          satan.|  15888|
|      0|        ipsweep.|  11167|
|      0|      portsweep.|   8608|
|      0|           nmap.|   2316|
|      0|           back.|   2103|
|      0|    warezclient.|   1020|
|      0|       teardrop.|    879|
|      0|            pod.|    242|
|      0|   guess_passwd.|     53|
|      0|    warezmaster.|     20|
|      0|           land.|     18|
|      0|buffer_overflow.|     17|
|      0|           imap.|     12|
|      0|     loadmodule.|      9|
|      0|      ftp_write.|      8|
|      0|        rootkit.|      8|
|      0|       multihop.|      7|
|      0|           perl.|      3|
|      0|            phf.|      3|
|      0|            spy.|      2|
|      1|      portsweep.|      1|
+-------+----------------+-------+

