In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=94e3bf173c761a7e4abe822eedf49d4f7f0d65701b0bdb998d678e8e52d6a6eb
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [42]:
!pip install -q findspark
import findspark
findspark.init()

In [29]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.master("local[*]").appName("clustername").getOrCreate()
spark

In [47]:
df=spark.read.text("/kddcup.data.gz.zip")
df.printSchema()

root
 |-- value: string (nullable = true)



# New Section

In [48]:
df.show(1)

+------------+
|       value|
+------------+
|PK-   \b i|
+------------+
only showing top 1 row



# New Section

In [49]:
from pyspark.sql.functions import split
split_col = split(df["value"],",")
df=df.withColumn("Protocal",split_col.getItem(1)) \
      .withColumn("Service",split_col.getItem(2)) \
      .withColumn("flag",split_col.getItem(3)) \
      .withColumn("src_bytes",split_col.getItem(4)) \
      .withColumn("dst_bytes",split_col.getItem(5)) \
      .withColumn("urgent",split_col.getItem(8)) \
      .withColumn("num_failed_logins",split_col.getItem(10)) \
      .withColumn("root_shell",split_col.getItem(13)) \
      .withColumn("guest_login",split_col.getItem(21)) \
      .withColumn("label",split_col.getItem(41)) \
      .drop('value')
df.show(1)

+--------+-------+----+---------+---------+------+-----------------+----------+-----------+-----+
|Protocal|Service|flag|src_bytes|dst_bytes|urgent|num_failed_logins|root_shell|guest_login|label|
+--------+-------+----+---------+---------+------+-----------------+----------+-----------+-----+
|    NULL|   NULL|NULL|     NULL|     NULL|  NULL|             NULL|      NULL|       NULL| NULL|
+--------+-------+----+---------+---------+------+-----------------+----------+-----------+-----+
only showing top 1 row



In [50]:
df=df.repartition(10)
print(df.rdd.getNumPartitions())
df.createOrReplaceTempView("df_KDDCup")

10


# Questions 1
Count the number of connection for each labels

In [54]:
df.groupBy('label').count().orderBy('count',ascending=False).show(1)

+-----+------+
|label| count|
+-----+------+
| NULL|744836|
+-----+------+
only showing top 1 row



# Questions 2
Get the list of Protocols that are normal and vulnerable to attacks, where there is not guest login to the destination address

In [57]:
sql_query = """
select Protocal,
case
when label = 'normal' then 'no attack'
else 'attack'
END as State,
count(*) as freq
from df_KDDCup
where guest_login != '1'
group By Protocal,State
order by Protocal Desc
"""
spark.sql(sql_query).show(1)

+--------------------+------+----+
|            Protocal| State|freq|
+--------------------+------+----+
|�����l�a<w�����`�...|attack|   1|
+--------------------+------+----+
only showing top 1 row



# Question 3
Apply some Descriptive Satistics on numerical Data

In [62]:
from pyspark.sql.functions import *
summary = df.select(mean(df.src_bytes).alias("Avg"),
                    stddev(df.src_bytes).alias("std"),
                    min(df.src_bytes).alias("min"),
                    max(df.src_bytes).alias("max"),
                    skewness(df.src_bytes).alias("skewness"))
summary.show()

+-----------------+-----------------+---+-------+-------------------+
|              Avg|              std|min|    max|           skewness|
+-----------------+-----------------+---+-------+-------------------+
|5.538461538461538|2.106157030208678|   |󩔏�����|-0.4179375208829422|
+-----------------+-----------------+---+-------+-------------------+



In [71]:
groups = df.groupBy("Protocol")
groups.agg({'src_bytes':'mean'}).show()

# Questions 4

In [72]:
sql_query = """

select protocol from df_KDDCup

"""