In [2]:
import pyspark
from pyspark.sql import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# Obtain the SparkSession used by every later cell; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Pyspark_dataframe")
    .getOrCreate()
)

In [3]:
# Load the KDD Cup text file as an RDD of lines and cache it, since later cells
# reuse it.
# `sc` is not assigned in the visible cells (only `spark` is), so bind it
# explicitly to the SparkContext behind the session rather than relying on the
# kernel/shell having pre-defined it. This keeps Restart & Run All working.
sc = spark.sparkContext
raw_data = sc.textFile('file:///D:\\spark-2.3.2-bin-hadoop2.7\\kddcup.txt').cache()

# Peek at the first five raw lines.
raw_data.take(5)

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']

In [4]:
# Schema inference: turn each comma-separated line into a Row so that Spark can
# infer column names and types when the DataFrame is created.

def _to_interaction_row(fields):
    """Build a Row from the first six fields of one split record.

    Fields 0, 4 and 5 are converted to int; fields 1-3 are kept as strings.
    """
    return Row(
        duration=int(fields[0]),
        protocol_type=fields[1],
        service=fields[2],
        flag=fields[3],
        src_bytes=int(fields[4]),
        dst_bytes=int(fields[5]),
    )


csv_data = raw_data.map(lambda line: line.split(","))
row_data = csv_data.map(_to_interaction_row)

In [5]:
# Let Spark infer the schema from the Row objects.
interactions_df = spark.createDataFrame(data=row_data)

# Expose the DataFrame to Spark SQL under the view name "interactions"
# (replacing any view already registered under that name).
interactions_df.createOrReplaceTempView(name="interactions")



In [None]:
# Query every column of the "interactions" temp view through Spark SQL and
# print the result as a text table.
spark.sql("select * from interactions").show()

+---------+--------+----+-------------+-------+---------+
|dst_bytes|duration|flag|protocol_type|service|src_bytes|
+---------+--------+----+-------------+-------+---------+
|     5450|       0|  SF|          tcp|   http|      181|
|      486|       0|  SF|          tcp|   http|      239|
|     1337|       0|  SF|          tcp|   http|      235|
|     1337|       0|  SF|          tcp|   http|      219|
|     2032|       0|  SF|          tcp|   http|      217|
|     2032|       0|  SF|          tcp|   http|      217|
|     1940|       0|  SF|          tcp|   http|      212|
|     4087|       0|  SF|          tcp|   http|      159|
|      151|       0|  SF|          tcp|   http|      210|
|      786|       0|  SF|          tcp|   http|      212|
|      624|       0|  SF|          tcp|   http|      210|
|     1985|       0|  SF|          tcp|   http|      177|
|      773|       0|  SF|          tcp|   http|      222|
|     1169|       0|  SF|          tcp|   http|      256|
|      259|   

In [None]:
# Print the inferred schema as a tree: one line per column with its type and
# nullability.
interactions_df.printSchema()

root
 |-- dst_bytes: long (nullable = true)
 |-- duration: long (nullable = true)
 |-- flag: string (nullable = true)
 |-- protocol_type: string (nullable = true)
 |-- service: string (nullable = true)
 |-- src_bytes: long (nullable = true)



In [None]:
# Select tcp interactions whose duration is greater than 1000 and whose
# dst_bytes is 0 (no bytes transferred from the destination).
# NOTE(review): if the intent is "longer than 1 second", check the unit of
# `duration` -- the threshold used here is 1000, and the unit is not shown in
# this notebook.
tcp_interactions = spark.sql("""
    SELECT duration, dst_bytes FROM interactions WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes = 0
""")
# show() prints only the top 20 rows of the result.
tcp_interactions.show()

+--------+---------+
|duration|dst_bytes|
+--------+---------+
|    5057|        0|
|    5059|        0|
|    5051|        0|
|    5056|        0|
|    5051|        0|
|    5039|        0|
|    5062|        0|
|    5041|        0|
|    5056|        0|
|    5064|        0|
|    5043|        0|
|    5061|        0|
|    5049|        0|
|    5061|        0|
|    5048|        0|
|    5047|        0|
|    5044|        0|
|    5063|        0|
|    5068|        0|
|    5062|        0|
+--------+---------+
only showing top 20 rows



In [None]:
# Fetch the first row of the DataFrame; with no argument, head() returns a
# single Row object.
interactions_df.head()

Row(dst_bytes=5450, duration=0, flag='SF', protocol_type='tcp', service='http', src_bytes=181)

In [None]:
# Fetch the first row of the DataFrame as a Row object via first().
interactions_df.first()

Row(dst_bytes=5450, duration=0, flag='SF', protocol_type='tcp', service='http', src_bytes=181)

In [None]:
# Render each selected row as a "duration:<d>:dest_bytes:<b>" string and print
# one line per row.

def _describe_interaction(row):
    """Format a Row's duration and dst_bytes fields as a colon-separated string."""
    return 'duration:{}:dest_bytes:{}'.format(row.duration, row.dst_bytes)


tcp_interactions_out = tcp_interactions.rdd.map(_describe_interaction)

# collect() brings every formatted string back to the driver before printing.
for t in tcp_interactions_out.collect():
    print(t)
    


duration:5057:dest_bytes:0
duration:5059:dest_bytes:0
duration:5051:dest_bytes:0
duration:5056:dest_bytes:0
duration:5051:dest_bytes:0
duration:5039:dest_bytes:0
duration:5062:dest_bytes:0
duration:5041:dest_bytes:0
duration:5056:dest_bytes:0
duration:5064:dest_bytes:0
duration:5043:dest_bytes:0
duration:5061:dest_bytes:0
duration:5049:dest_bytes:0
duration:5061:dest_bytes:0
duration:5048:dest_bytes:0
duration:5047:dest_bytes:0
duration:5044:dest_bytes:0
duration:5063:dest_bytes:0
duration:5068:dest_bytes:0
duration:5062:dest_bytes:0
duration:5046:dest_bytes:0
duration:5052:dest_bytes:0
duration:5044:dest_bytes:0
duration:5054:dest_bytes:0
duration:5039:dest_bytes:0
duration:5058:dest_bytes:0
duration:5051:dest_bytes:0
duration:5032:dest_bytes:0
duration:5063:dest_bytes:0
duration:5040:dest_bytes:0
duration:5051:dest_bytes:0
duration:5066:dest_bytes:0
duration:5044:dest_bytes:0
duration:5051:dest_bytes:0
duration:5036:dest_bytes:0
duration:5055:dest_bytes:0
duration:2426:dest_bytes:0
d

In [None]:
# Peek at the first five formatted strings instead of collecting all of them.
tcp_interactions_out.take(5)

['duration:5057:dest_bytes:0',
 'duration:5059:dest_bytes:0',
 'duration:5051:dest_bytes:0',
 'duration:5056:dest_bytes:0',
 'duration:5051:dest_bytes:0']

In [None]:
# Split each "duration:<d>:dest_bytes:<b>" string on ':' into a four-item list
# of [label, value, label, value], then peek at the first five results.
df_data = tcp_interactions_out.map(
    lambda formatted: formatted.split(':')
)
df_data.take(5)

[['duration', '5057', 'dest_bytes', '0'],
 ['duration', '5059', 'dest_bytes', '0'],
 ['duration', '5051', 'dest_bytes', '0'],
 ['duration', '5056', 'dest_bytes', '0'],
 ['duration', '5051', 'dest_bytes', '0']]

In [None]:
# Rebuild Row objects from the split strings: index 1 holds the duration value
# and index 3 the dest_bytes value. Both are still strings at this point
# because they came from str.split().
df_1 = df_data.map(lambda y: Row(duration=y[1], dest_bytes=y[3]))

# show() only prints the table and returns None, so chaining it onto the
# assignment would leave `df` bound to None. Keep the DataFrame in `df` first,
# then display it as a separate step.
df = spark.createDataFrame(df_1)
df.show()