## [*Занятие 4*](https://hackmd.io/@J_qqq0PjTGK1be0341GpYA/BJEYLlK-X#/ "Spark Streaming - HackMD")

https://hackmd.io/@J_qqq0PjTGK1be0341GpYA/BJEYLlK-X#/

### Spark Streaming: [Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html "Structured Streaming Programming Guide")

In [1]:
import findspark
findspark.init()

import pyspark

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
stream_in = "/stream/strutured/"

!mkdir -p $stream_in

In [6]:
from pyspark.sql import types

schema = types.StructType().add('rank', 'integer').add('site', 'string')

In [7]:
input_df = spark.readStream.csv(path=stream_in, schema=schema)

In [17]:
verbatim_output = input_df.writeStream.trigger(processingTime='10 seconds').format('memory').queryName('input2')

In [19]:
verbatim_output.start()

<pyspark.sql.streaming.StreamingQuery at 0x7f88cb300c50>

In [47]:
spark.sql('SELECT * from input2 ORDER BY rank').show()

+----+--------------------+
|rank|                site|
+----+--------------------+
| 493|        cricbuzz.com|
| 617|        3c.tmall.com|
| 718|      gmanetwork.com|
| 731|   yespornplease.com|
| 866|    secureserver.net|
| 906|        lifewire.com|
|1066|            miui.com|
|1114|       interpark.com|
|1175|           nta.go.jp|
|1245|           jjwxc.net|
|1263|    tomshardware.com|
|1379|kuronekoyamato.co.jp|
|1761|    onclicksuper.com|
|1818|   globalsources.com|
|1886|            vlive.tv|
|2094|adultfriendfinder...|
|2186|     itslearning.com|
|2354|     khabaronline.ir|
|2717|    shahrekhabar.com|
|2774|             medu.ir|
+----+--------------------+
only showing top 20 rows



In [43]:
!shuf /data/top-1m.csv | head -n1k > `tempfile -d $stream_in`

shuf: write error: Broken pipe
shuf: write error


In [32]:
from pyspark.sql import functions

agg_stream = (
    input_df
    .select(
        functions.explode(
            functions.split("site", r"\.")
        ).alias('token')
    )
    .groupBy('token')
    .count()
    .orderBy(functions.desc('count'))
)

In [33]:
(
    agg_stream
    .writeStream
    .outputMode('complete')
    .trigger(processingTime='10 seconds')
    .format('memory')
    .queryName('tokens')
    .start()
)


<pyspark.sql.streaming.StreamingQuery at 0x7f88ca237780>

In [46]:
spark.sql("SELECT * FROM tokens").show()

+--------+-----+
|   token|count|
+--------+-----+
|     com| 3109|
|     org|  374|
|      ru|  277|
|     net|  272|
|      co|  233|
|      de|  215|
|      br|  108|
|      uk|  106|
|blogspot|  102|
|      in|   97|
|      pl|   83|
|      ir|   82|
|      it|   76|
|      jp|   74|
|      au|   73|
|      fr|   68|
|     gov|   67|
|    info|   61|
|     edu|   59|
|      cz|   53|
+--------+-----+
only showing top 20 rows



In [49]:
ips_df = (
    spark
    .read
    .csv('/data/ips')
    .withColumnRenamed('_c0', 'site')
    .withColumnRenamed('_c1', 'ip')
)

ips_df.show()

+-------------+---------------+
|         site|             ip|
+-------------+---------------+
|   google.com| 172.217.17.110|
|  youtube.com| 216.58.211.110|
| facebook.com|  157.240.20.35|
|    baidu.com|123.125.114.144|
|    baidu.com| 220.181.38.148|
|wikipedia.org| 91.198.174.192|
|       qq.com|  111.161.64.48|
|       qq.com|  111.161.64.40|
|   taobao.com| 140.205.94.189|
|   taobao.com| 140.205.220.96|
|    yahoo.com|    72.30.35.10|
|    yahoo.com|   98.137.246.7|
|    yahoo.com|   98.137.246.8|
|    yahoo.com|     72.30.35.9|
|    yahoo.com| 98.138.219.231|
|    yahoo.com| 98.138.219.232|
|    tmall.com| 140.205.94.193|
|    tmall.com| 140.205.130.99|
|   amazon.com|  176.32.98.166|
|   amazon.com|205.251.242.103|
+-------------+---------------+
only showing top 20 rows



In [51]:
annotated_stream =(
    input_df
    .join(ips_df, on='site')
    .writeStream
    .trigger(processingTime='10 seconds')
    .format('memory')
    .queryName('annotated')
    .start()
)

In [53]:
spark.sql('select * from annotated').show()

+--------------------+----+---------------+
|                site|rank|             ip|
+--------------------+----+---------------+
|        cricbuzz.com| 493| 35.200.167.142|
|      gmanetwork.com| 718|   52.74.223.93|
|   yespornplease.com| 731|  104.31.112.30|
|   yespornplease.com| 731|  104.31.113.30|
|    secureserver.net| 866| 208.109.192.71|
|        lifewire.com| 906|151.101.130.114|
|        lifewire.com| 906|  151.101.2.114|
|        lifewire.com| 906| 151.101.66.114|
|        lifewire.com| 906|151.101.194.114|
|            miui.com|1066|   120.92.96.58|
|       interpark.com|1114|  211.233.74.23|
|           jjwxc.net|1245|   171.8.71.172|
|    tomshardware.com|1263| 34.238.197.235|
|    tomshardware.com|1263|   52.45.70.146|
|    tomshardware.com|1263|  52.54.187.227|
|    onclicksuper.com|1761|    35.190.8.27|
|   globalsources.com|1818|  203.92.211.31|
|            vlive.tv|1886|125.209.218.157|
|            vlive.tv|1886|125.209.234.134|
|adultfriendfinder...|2094|  69.

In [54]:
deduped_stream =(
    input_df
    .join(ips_df, on='site')
    .select('site', 'ip')
    .groupBy('site')
    .agg(functions.collect_set('ip').alias('ips'))
    .writeStream
    .outputMode('complete')
    .trigger(processingTime='10 seconds')
    .format('memory')
    .queryName('deduped')
    .start()
)

In [56]:
spark.sql('select * from deduped').show()

+-------------------+--------------------+
|               site|                 ips|
+-------------------+--------------------+
|laurenceanthony.net|    [69.195.124.184]|
|         kuvalda.tv|    [83.170.111.192]|
|        kakunin.net|    [160.16.113.178]|
|      searchyen.com|[18.214.154.9, 52...|
|     onlinedizi.net|[104.26.12.179, 1...|
|        eetimes.com|      [13.65.85.121]|
|           siptv.eu|   [213.239.228.148]|
|        untappd.com|[104.25.22.104, 1...|
|   soubarato.com.br|    [34.203.199.183]|
|    four-thirds.org|     [27.34.139.113]|
|intelliresponse.com|    [173.193.182.83]|
| automobilrevue.net|   [178.218.208.194]|
|          house.gov|[35.168.94.129, 3...|
| waylandgames.co.uk|    [185.161.16.245]|
|   torrentpower.com|    [103.20.104.176]|
|           01hr.com|      [47.95.70.117]|
|      le-comedia.fr|    [195.210.43.224]|
|     independent.ie|[13.32.43.60, 13....|
|       adservme.com|     [68.65.122.165]|
|  lerugbynistere.fr|[104.27.184.29, 1...|
+----------