In [None]:
# Create (or reuse) the SparkSession used by every cell in this notebook.
# It runs against the `local[*]` master under the app name "spark_on_docker".
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark_on_docker")
    .getOrCreate()
)

In [2]:
# Use 5 shuffle partitions for the aggregations run in this notebook.
spark.conf.set("spark.sql.shuffle.partitions", 5)

# Single source of truth for the dataset location, so the batch read (used
# only to obtain a schema) and the streaming read always point at the same files.
ACTIVITY_DATA_PATH = "work/TheDefinitiveGuide/Spark-The-Definitive-Guide/data/activity-data"

# Batch read of the JSON files; its schema is reused by the stream below.
static = spark.read.json(ACTIVITY_DATA_PATH)

# Streaming read of the same directory with the batch schema applied,
# configured with maxFilesPerTrigger = 10.
streaming = (
    spark.readStream
    .schema(static.schema)
    .option("maxFilesPerTrigger", 10)
    .json(ACTIVITY_DATA_PATH)
)

streaming.printSchema()

                                                                                

root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



In [3]:
# Keep every original column and add `event_time`: Creation_Time cast to
# double, divided by 1000000000, then cast to a timestamp
# (presumably nanoseconds -> seconds; TODO confirm against the dataset docs).
withEventTime = streaming.selectExpr(
    "*",
    "cast(cast(Creation_Time as double)/1000000000 as timestamp) as event_time",
)

In [None]:
from pyspark.sql.functions import window, col

# Count events per 10-minute event-time window and publish the result as the
# in-memory table `pyevents_per_window` (complete output mode).
(
    withEventTime
    .groupBy(window(col("event_time"), "10 minutes"))
    .count()
    .writeStream
    .queryName("pyevents_per_window")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [5]:
# Inspect the columns exposed by the in-memory `pyevents_per_window` table.
spark.sql(
    "SELECT * FROM pyevents_per_window"
).printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- count: long (nullable = false)



22/01/18 02:04:03 WARN FileStreamSource: Listed 80 file(s) in 5822 ms

In [None]:
# Peek at the first 5 rows currently held in `pyevents_per_window`.
spark.sql(
    "SELECT * FROM pyevents_per_window"
).show(5)

In [None]:
# Note: this approach applies to any window-style aggregation (or stateful
# computation) we would like:

In [13]:
from pyspark.sql.functions import window, col

# Count events per (10-minute event-time window, User) pair and publish the
# result as the in-memory table `pyevents_per_window2` (complete output mode).
(
    withEventTime
    .groupBy(window(col("event_time"), "10 minutes"), "User")
    .count()
    .writeStream
    .queryName("pyevents_per_window2")
    .format("memory")
    .outputMode("complete")
    .start()
)

22/01/18 05:28:00 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-9d52c2a9-edf6-4ca6-a46b-bd6a1bffb091. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/01/18 05:28:00 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7f713022a400>

22/01/18 05:28:01 WARN FileStreamSource: Listed 80 file(s) in 5565 ms
22/01/18 05:28:07 WARN FileStreamSource: Listed 80 file(s) in 4882 ms + 2) / 10]
22/01/18 05:28:13 WARN FileStreamSource: Listed 80 file(s) in 5512 ms + 8) / 10]
22/01/18 05:28:17 WARN FileStreamSource: Listed 80 file(s) in 4111 ms + 8) / 10]


In [15]:
# Inspect the columns exposed by the in-memory `pyevents_per_window2` table.
spark.sql(
    "SELECT * FROM pyevents_per_window2"
).printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- User: string (nullable = true)
 |-- count: long (nullable = false)



22/01/18 05:28:25 WARN FileStreamSource: Listed 80 file(s) in 3931 ms + 2) / 10]
22/01/18 05:28:30 WARN FileStreamSource: Listed 80 file(s) in 4488 ms + 2) / 10]
                                                                                

Importantly, we can also perform an aggregation on multiple columns, including the event-time column. Just as we saw in the previous chapter, we can even perform these aggregations using methods like cube.

In [None]:
from pyspark.sql.functions import window, col

# Count events per sliding event-time window: 10-minute windows that start
# every 5 minutes.
#
# The plain 10-minute count query started earlier in this notebook is still
# registered under the name `pyevents_per_window`, and Spark refuses to start
# a second active query with the same name. This query therefore gets its own
# unique name so the cell succeeds on a top-to-bottom run.
(
    withEventTime
    .groupBy(window(col("event_time"), "10 minutes", "5 minutes"))
    .count()
    .writeStream
    .queryName("pyevents_per_sliding_window")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [None]:
# Peek at the first 5 rows currently held in `pyevents_per_window2`.
spark.sql(
    "SELECT * FROM pyevents_per_window2"
).show(5)

+--------------------+----+------+
|              window|User| count|
+--------------------+----+------+
|{2015-02-24 12:20...|   f|133623|
|{2015-02-24 13:00...|   f| 33366|
|{2015-02-24 14:50...|   e|126282|
|{2015-02-23 14:30...|   h| 94669|
|{2015-02-24 14:10...|   e| 67577|
+--------------------+----+------+

In this example, we have 10-minute windows, starting every five minutes. 

Therefore each event will fall into two different windows. You can tweak this further according to your needs:

In [19]:
from pyspark.sql.functions import window, col

# Count events per sliding event-time window (10-minute windows starting every
# 5 minutes) and publish the result as the in-memory table
# `pyevents_per_window3` (complete output mode).
(
    withEventTime
    .groupBy(window(col("event_time"), "10 minutes", "5 minutes"))
    .count()
    .writeStream
    .queryName("pyevents_per_window3")
    .format("memory")
    .outputMode("complete")
    .start()
)

22/01/18 05:50:35 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-f9cfc10a-9af1-4981-9544-356cfb9f4588. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/01/18 05:50:35 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7f71301d9940>

22/01/18 05:50:36 WARN FileStreamSource: Listed 80 file(s) in 2536 ms
22/01/18 05:50:37 WARN FileStreamSource: Listed 80 file(s) in 2532 ms
22/01/18 05:50:38 WARN FileStreamSource: Listed 80 file(s) in 2552 ms
22/01/18 05:50:39 WARN FileStreamSource: Listed 80 file(s) in 2961 ms
22/01/18 05:50:42 WARN FileStreamSource: Listed 80 file(s) in 4724 ms + 8) / 10]
22/01/18 05:50:45 WARN FileStreamSource: Listed 80 file(s) in 5587 ms + 2) / 10]
22/01/18 05:50:46 WARN FileStreamSource: Listed 80 file(s) in 4372 ms           
22/01/18 05:50:50 WARN FileStreamSource: Listed 80 file(s) in 5015 ms + 8) / 10]
22/01/18 05:50:51 WARN FileStreamSource: Listed 80 file(s) in 4454 ms + 2) / 10]
22/01/18 05:50:53 WARN FileStreamSource: Listed 80 file(s) in 2497 ms           
22/01/18 05:50:54 WARN FileStreamSource: Listed 80 file(s) in 3222 ms + 8) / 10]
22/01/18 05:50:56 WARN FileStreamSource: Listed 80 file(s) in 3697 ms           
22/01/18 05:50:58 WARN FileStreamSource: Listed 80 file(s) in 3902 ms
22

In [21]:
# Inspect the columns exposed by the in-memory `pyevents_per_window3` table.
spark.sql(
    "SELECT * FROM pyevents_per_window3"
).printSchema()

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- count: long (nullable = false)



22/01/18 05:52:13 WARN FileStreamSource: Listed 80 file(s) in 2041 ms
22/01/18 05:52:14 WARN FileStreamSource: Listed 80 file(s) in 2029 ms
22/01/18 05:52:15 WARN FileStreamSource: Listed 80 file(s) in 2344 ms
22/01/18 05:52:16 WARN FileStreamSource: Listed 80 file(s) in 2410 ms
22/01/18 05:52:17 WARN FileStreamSource: Listed 80 file(s) in 2617 ms
22/01/18 05:52:18 WARN FileStreamSource: Listed 80 file(s) in 2911 ms
22/01/18 05:52:19 WARN FileStreamSource: Listed 80 file(s) in 3155 ms
22/01/18 05:52:20 WARN FileStreamSource: Listed 80 file(s) in 3422 ms
22/01/18 05:52:22 WARN FileStreamSource: Listed 80 file(s) in 3563 ms
22/01/18 05:52:23 WARN FileStreamSource: Listed 80 file(s) in 3743 ms
22/01/18 05:52:24 WARN FileStreamSource: Listed 80 file(s) in 3169 ms
22/01/18 05:52:25 WARN FileStreamSource: Listed 80 file(s) in 2843 ms
22/01/18 05:52:26 WARN FileStreamSource: Listed 80 file(s) in 2478 ms
22/01/18 05:52:26 WARN FileStreamSource: Listed 80 file(s) in 2513 ms


In [None]:
# Peek at the first 5 rows currently held in `pyevents_per_window3`.
spark.sql(
    "SELECT * FROM pyevents_per_window3"
).show(5)

+--------------------+------+
|              window| count|
+--------------------+------+
|{2015-02-23 14:15...|107668|
|{2015-02-24 11:50...|150773|
|{2015-02-24 13:00...|133323|
|{2015-02-22 00:35...|    35|
|{2015-02-23 12:30...|100853|
+--------------------+------+

Handling Late Data with Watermarks

Concretely, a watermark is an amount of time following a given event or set of events after which we do not expect to see any more data from that time. We know that data can arrive late due to delays on the network, devices that lose a connection, or any number of other issues.

For example, suppose we specify a watermark of 10 minutes. By doing this, we instruct Spark that any event that occurs more than 10 “event-time” minutes past a previous event should be ignored. (The cell below uses a 30-minute watermark.)

In [28]:
from pyspark.sql.functions import window, col

# Same sliding-window count as before (10-minute windows starting every
# 5 minutes), but with a 30-minute watermark declared on `event_time`.
# The result is published as the in-memory table `pyevents_per_window4`.
#
# NOTE(review): the Structured Streaming guide states that watermark-based
# state cleanup requires append or update output mode; with "complete" the
# watermark presumably does not drop old state here. Confirm this is intended.
(
    withEventTime
    .withWatermark("event_time", "30 minutes")
    .groupBy(window(col("event_time"), "10 minutes", "5 minutes"))
    .count()
    .writeStream
    .queryName("pyevents_per_window4")
    .format("memory")
    .outputMode("complete")
    .start()
)

22/01/18 06:28:54 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-18e732e5-b69a-49ec-9292-3901b02f7d0a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/01/18 06:28:54 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7f71301d9cd0>

22/01/18 06:28:57 WARN FileStreamSource: Listed 80 file(s) in 2089 ms + 8) / 10]
22/01/18 06:29:01 WARN FileStreamSource: Listed 80 file(s) in 4080 ms + 2) / 10]
22/01/18 06:29:01 WARN FileStreamSource: Listed 80 file(s) in 4102 ms
22/01/18 06:29:01 WARN FileStreamSource: Listed 80 file(s) in 3710 ms + 1) / 10]
22/01/18 06:29:04 WARN FileStreamSource: Listed 80 file(s) in 3193 ms + 8) / 10]
22/01/18 06:29:04 WARN FileStreamSource: Listed 80 file(s) in 3070 ms


In [29]:
# Inspect the columns exposed by the in-memory `pyevents_per_window4` table.
spark.sql(
    "SELECT * FROM pyevents_per_window4"
).printSchema()

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- count: long (nullable = false)



22/01/18 06:29:05 WARN FileStreamSource: Listed 80 file(s) in 4059 ms + 2) / 10]
22/01/18 06:29:07 WARN FileStreamSource: Listed 80 file(s) in 2899 ms           
22/01/18 06:29:07 WARN FileStreamSource: Listed 80 file(s) in 2963 ms


In [30]:
# Peek at the first 5 rows currently held in `pyevents_per_window4`.
spark.sql(
    "SELECT * FROM pyevents_per_window4"
).show(5)

+--------------------+-----+
|              window|count|
+--------------------+-----+
|{2015-02-23 14:15...|26936|
|{2015-02-24 11:50...|37714|
|{2015-02-24 13:00...|33324|
|{2015-02-22 00:35...|    6|
|{2015-02-23 12:30...|25218|
+--------------------+-----+
only showing top 5 rows

Dropping Duplicates in a Stream

One of the more difficult operations in record-at-a-time systems is removing duplicates from the stream.  ...  A perfect example of this is Internet of Things (IoT) applications, in which upstream producers generate messages in unstable network environments and the same message might end up being sent multiple times. Your downstream applications and aggregations should be able to assume that there is only one of each message.


Essentially, Structured Streaming makes it easy to take message systems that provide at-least-once semantics and convert them into exactly-once by dropping duplicate messages as they come in, based on arbitrary keys. To de-duplicate data, Spark will maintain a number of user-specified keys and ensure that duplicates are ignored.

The core assumption is that duplicate events will have the same timestamp as well as identifier. In this model, rows with two different timestamps are two different records:

In [31]:
# NOTE(review): `expr` is imported here but not used in this cell.
from pyspark.sql.functions import expr

# Declare a 5-second watermark on `event_time`, drop rows that repeat the same
# (User, event_time) pair, then count the remaining rows per User. The result
# is published as the in-memory table `pydeduplicated` (complete output mode).
(
    withEventTime
    .withWatermark("event_time", "5 seconds")
    .dropDuplicates(["User", "event_time"])
    .groupBy("User")
    .count()
    .writeStream
    .queryName("pydeduplicated")
    .format("memory")
    .outputMode("complete")
    .start()
)

22/01/18 07:04:08 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-7e0f9eca-e9c0-4244-89d7-41c3f765d11d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/01/18 07:04:08 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7f713015ef70>

22/01/18 07:04:08 WARN FileStreamSource: Listed 80 file(s) in 2032 ms
22/01/18 07:04:09 WARN FileStreamSource: Listed 80 file(s) in 2030 ms
22/01/18 07:04:09 WARN FileStreamSource: Listed 80 file(s) in 2137 ms
22/01/18 07:04:10 WARN FileStreamSource: Listed 80 file(s) in 2370 ms
22/01/18 07:04:13 WARN FileStreamSource: Listed 80 file(s) in 4055 ms + 8) / 10]
22/01/18 07:04:14 WARN FileStreamSource: Listed 80 file(s) in 4019 ms + 2) / 10]
22/01/18 07:04:14 WARN FileStreamSource: Listed 80 file(s) in 4024 ms
22/01/18 07:04:15 WARN FileStreamSource: Listed 80 file(s) in 4607 ms
22/01/18 07:04:16 WARN FileStreamSource: Listed 80 file(s) in 2535 ms
22/01/18 07:04:17 WARN FileStreamSource: Listed 80 file(s) in 2824 ms0 + 5) / 5]
22/01/18 07:04:20 WARN FileStreamSource: Listed 80 file(s) in 4676 ms + 8) / 10]
22/01/18 07:04:21 WARN FileStreamSource: Listed 80 file(s) in 5568 ms
22/01/18 07:04:22 WARN FileStreamSource: Listed 80 file(s) in 6042 ms
22/01/18 07:04:22 WARN FileStreamSource: Liste

In [34]:
# Inspect the columns exposed by the in-memory `pydeduplicated` table.
spark.sql(
    "select * from pydeduplicated"
).printSchema()

root
 |-- User: string (nullable = true)
 |-- count: long (nullable = false)



22/01/18 07:05:56 WARN FileStreamSource: Listed 80 file(s) in 3806 ms
22/01/18 07:05:57 WARN FileStreamSource: Listed 80 file(s) in 3917 ms
22/01/18 07:05:57 WARN FileStreamSource: Listed 80 file(s) in 3049 ms
22/01/18 07:05:57 WARN FileStreamSource: Listed 80 file(s) in 3000 ms
22/01/18 07:05:58 WARN FileStreamSource: Listed 80 file(s) in 2983 ms
22/01/18 07:05:58 WARN FileStreamSource: Listed 80 file(s) in 2790 ms
22/01/18 07:05:59 WARN FileStreamSource: Listed 80 file(s) in 2574 ms
22/01/18 07:06:00 WARN FileStreamSource: Listed 80 file(s) in 2077 ms
22/01/18 07:06:02 WARN FileStreamSource: Listed 80 file(s) in 2231 ms
22/01/18 07:06:02 WARN FileStreamSource: Listed 80 file(s) in 2137 ms


In [None]:
# Peek at up to 10 rows currently held in `pydeduplicated`.
spark.sql(
    "select * from pydeduplicated"
).show(10)

+----+-----+
|User|count|
+----+-----+
|   a|80850|
|   b|91230|
|   c|77150|
|   g|91679|
|   h|77330|
|   e|96897|
|   f|92060|
|   d|81240|
|   i|92550|
+----+-----+