## [*Занятие 4*](https://hackmd.io/@J_qqq0PjTGK1be0341GpYA/BJEYLlK-X#/ "Spark Streaming - HackMD")

https://hackmd.io/@J_qqq0PjTGK1be0341GpYA/BJEYLlK-X#/

### Spark Streaming: [Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html "Structured Streaming Programming Guide")

In [1]:
import findspark
findspark.init()

import pyspark


In [2]:
# Attach to the already-running SparkSession if there is one; otherwise build
# a new one (no extra configuration is set here).
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

In [3]:
import os

# Directory that the streaming CSV source reads from.
stream_in = "/stream/structured/"
# Create it from Python instead of `!mkdir -p`: same "create parents, fine if
# it already exists" semantics, but a failure (e.g. permissions) raises here
# instead of only printing a shell error and letting later cells fail.
os.makedirs(stream_in, exist_ok=True)

In [4]:
from pyspark.sql import types

# Explicit two-column schema for the input CSV: an integer rank followed by
# the site name. Both fields are nullable, as StructType.add() makes them by
# default.
schema = types.StructType([
    types.StructField("rank", types.IntegerType(), True),
    types.StructField("site", types.StringType(), True),
])

In [5]:
# Streaming DataFrame over the CSV files found in `stream_in`, parsed with the
# explicit schema defined above.
csv_stream_reader = spark.readStream.schema(schema)
input_df = csv_stream_reader.csv(stream_in)

In [6]:
# Writer (configured, not yet started) that copies the input stream unchanged
# into the in-memory table "input2", triggering every 10 seconds.
verbatim_output = input_df.writeStream
verbatim_output = verbatim_output.trigger(processingTime="10 seconds")
verbatim_output = verbatim_output.format("memory")
verbatim_output = verbatim_output.queryName("input2")

In [7]:
# Start the query and keep the returned StreamingQuery handle so it can be
# inspected (.status, .lastProgress) or stopped (.stop()) later, the same way
# the `annotated_stream` and `deduped_stream` handles are kept.
verbatim_stream = verbatim_output.start()
# Bare last expression so the cell still displays the query object.
verbatim_stream

<pyspark.sql.streaming.StreamingQuery at 0x7f5dec6861d0>

In [22]:
# Number of rows the "input2" in-memory sink has accumulated so far.
input2_row_count = spark.sql("SELECT count(*) FROM input2")
input2_row_count.show()

+--------+
|count(1)|
+--------+
|    5120|
+--------+



In [9]:
!shuf /data/top-1m.csv | head -n1k > `tempfile -d $stream_in`

shuf: write error: Broken pipe
shuf: write error


In [10]:
from pyspark.sql import functions

# Running count of domain-name tokens: each site is split on literal dots,
# exploded into one row per token, counted, and sorted by descending count.
agg_stream = (
    input_df
    .select(
        functions.explode(
            # split() takes a regex, so the dot must be escaped. A raw string
            # keeps the backslash without relying on Python passing the
            # invalid escape sequence "\." through (which emits a warning on
            # newer Python versions).
            functions.split("site", r"\.")
        ).alias("token")
    )
    .groupBy("token")
    .count()
    .orderBy(functions.desc("count"))
)

In [11]:
# Start the token-count query in complete output mode, writing to the
# in-memory table "tokens" every 10 seconds. The StreamingQuery handle is
# kept (like `annotated_stream` / `deduped_stream`) so the query can be
# inspected or stopped later.
tokens_stream = (
    agg_stream
    .writeStream
    .outputMode("complete")
    .trigger(processingTime="10 seconds")
    .format("memory")
    .queryName("tokens")
    .start()
)
# Bare last expression so the cell still displays the query object.
tokens_stream

<pyspark.sql.streaming.StreamingQuery at 0x7f5deb612a90>

In [21]:
# Current contents of the "tokens" in-memory table (first 20 rows).
tokens_snapshot = spark.sql("select * from tokens")
tokens_snapshot.show()

+--------+-----+
|   token|count|
+--------+-----+
|     com| 2157|
|     org|  243|
|     net|  180|
|      ru|  167|
|      co|  164|
|      de|  117|
|      br|   92|
|      uk|   77|
|      pl|   70|
|blogspot|   61|
|      au|   55|
|      in|   51|
|      jp|   47|
|      ir|   45|
|      it|   43|
|     gov|   40|
|     edu|   37|
|      tw|   33|
|      cz|   33|
|      fr|   33|
+--------+-----+
only showing top 20 rows



In [23]:
!shuf /data/top-1m.csv | head -n1k > `tempfile -d $stream_in`

shuf: write error: Broken pipe
shuf: write error


In [24]:
# Re-query "tokens" to see the totals after the extra input file was added.
tokens_after_feed = spark.sql("select * from tokens")
tokens_after_feed.show()

+--------+-----+
|   token|count|
+--------+-----+
|     com| 2682|
|     org|  306|
|     net|  221|
|      ru|  207|
|      co|  202|
|      de|  157|
|      br|  112|
|      uk|   98|
|      pl|   86|
|blogspot|   86|
|      au|   70|
|      in|   68|
|      jp|   59|
|      ir|   54|
|      it|   54|
|     gov|   49|
|      fr|   49|
|     edu|   48|
|      cz|   39|
|      tw|   38|
+--------+-----+
only showing top 20 rows



In [15]:
# Static (batch) lookup table read from CSV. The default positional column
# names _c0 / _c1 are renamed to `site` and `ip`.
raw_ips = spark.read.csv("/data/ips")
ips_df = raw_ips.withColumnRenamed("_c0", "site").withColumnRenamed("_c1", "ip")

ips_df.show()

+-------------+---------------+
|         site|             ip|
+-------------+---------------+
|   google.com| 172.217.17.110|
|  youtube.com| 216.58.211.110|
| facebook.com|  157.240.20.35|
|    baidu.com|123.125.114.144|
|    baidu.com| 220.181.38.148|
|wikipedia.org| 91.198.174.192|
|       qq.com|  111.161.64.48|
|       qq.com|  111.161.64.40|
|   taobao.com| 140.205.94.189|
|   taobao.com| 140.205.220.96|
|    yahoo.com|    72.30.35.10|
|    yahoo.com|   98.137.246.7|
|    yahoo.com|   98.137.246.8|
|    yahoo.com|     72.30.35.9|
|    yahoo.com| 98.138.219.231|
|    yahoo.com| 98.138.219.232|
|    tmall.com| 140.205.94.193|
|    tmall.com| 140.205.130.99|
|   amazon.com|  176.32.98.166|
|   amazon.com|205.251.242.103|
+-------------+---------------+
only showing top 20 rows



In [16]:
# Join the streaming input against the static IP table on `site`, then stream
# the joined rows into the in-memory table "annotated" every 10 seconds.
annotated_df = input_df.join(ips_df, on="site")

annotated_writer = annotated_df.writeStream
annotated_writer = annotated_writer.trigger(processingTime="10 seconds")
annotated_writer = annotated_writer.format("memory")
annotated_writer = annotated_writer.queryName("annotated")

annotated_stream = annotated_writer.start()


In [25]:
# Peek at the joined (site, rank, ip) rows collected so far.
annotated_snapshot = spark.sql("select * from annotated")
annotated_snapshot.show()

+---------------+----+---------------+
|           site|rank|             ip|
+---------------+----+---------------+
|         gmw.cn| 161|   111.202.12.1|
| 45eijvhgj2.com| 172|198.134.112.241|
| 45eijvhgj2.com| 172|198.134.112.244|
| 45eijvhgj2.com| 172|198.134.112.243|
| 45eijvhgj2.com| 172|198.134.112.242|
|      orange.fr| 349|193.252.148.140|
|      orange.fr| 349| 193.252.133.34|
|list-manage.com| 527| 205.201.132.96|
|milliyet.com.tr| 771| 34.249.120.252|
|  getintopc.com|1636|130.185.250.154|
|  letras.mus.br|1920| 177.54.157.200|
|   malavida.com|1930| 91.192.108.161|
|      01net.com|1947|  63.32.252.147|
|      01net.com|1947|     52.31.29.8|
|      01net.com|1947|   52.49.70.213|
|  teachable.com|2290|  104.20.81.110|
|  teachable.com|2290|  104.20.80.110|
| tripadvisor.in|2689|192.229.162.112|
| tripadvisor.in|2689|192.229.182.112|
| tripadvisor.in|2689| 192.229.189.15|
+---------------+----+---------------+
only showing top 20 rows



In [18]:
# One row per site with the set of distinct IPs seen for it. The aggregation
# is written in complete mode to the in-memory table "deduped".
site_ip_pairs = input_df.join(ips_df, on="site").select("site", "ip")
ips_per_site = (
    site_ip_pairs
    .groupBy("site")
    .agg(functions.collect_set("ip").alias("ips"))
)

deduped_writer = ips_per_site.writeStream.outputMode("complete")
deduped_writer = deduped_writer.trigger(processingTime="10 seconds")
deduped_writer = deduped_writer.format("memory").queryName("deduped")

deduped_stream = deduped_writer.start()

In [26]:
# Current per-site IP sets from the "deduped" in-memory table.
deduped_snapshot = spark.sql("select * from deduped")
deduped_snapshot.show()

+--------------------+--------------------+
|                site|                 ips|
+--------------------+--------------------+
|           1form.com|[3.104.131.222, 1...|
|creative-proteomi...|     [54.176.148.20]|
|   piecesauto-pro.fr|[104.17.55.36, 10...|
|  playspellbreak.com|[198.185.159.144,...|
|   thepiratebay.tips|    [184.168.221.53]|
|unionbankonline.c...|     [59.160.35.102]|
|        webtrekk.com|    [185.102.94.245]|
|      caibaojian.com|     [120.24.76.145]|
|puthiyathalaimura...|[13.32.42.180, 13...|
|        jwpepper.com|      [208.28.133.1]|
|           rulai.org|     [130.211.240.4]|
|         eclipse.org|     [198.41.30.198]|
|      unlocklink.com|[104.27.191.231, ...|
|go4worldbusiness.com|[52.4.108.53, 52....|
|     bustyfilmes.com|[104.18.33.207, 1...|
|           sm160.com|     [114.119.7.100]|
|damascusuniversit...|      [178.253.95.9]|
|      keywordkeg.com|      [78.46.165.85]|
|       searpages.com|   [149.210.235.193]|
|        knigolub.net|    [212.1