# Spark Structured Streaming - Demo
## Fire alarm

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import io
from pyspark.sql.functions import *
import time
import json
import struct
import requests 

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.5,org.apache.kafka:kafka-clients:2.6.0 pyspark-shell'
                                    
spark = (SparkSession.builder 
    .master("local[*]")
    .appName("test")
    .getOrCreate()
        )

spark

set up the environment variables

In [2]:
smoke_topic = 'SmokeSensorEvent'
temperature_topic = 'TemperatureSensorEvent'
servers = "kafka:9092"

## Understanding spark-kafka integration
Let's treat first kafka as a bulk source

In [3]:
smoke_df = (spark
  .read
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("subscribe", smoke_topic)
  .option("startingOffsets", "earliest")
  .option("endingOffsets", "latest")
  .load())

In [4]:
smoke_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
smoke_df.show(5)

+-------+--------------------+----------------+---------+------+--------------------+-------------+
|    key|               value|           topic|partition|offset|           timestamp|timestampType|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     0|2020-11-24 16:19:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     1|2020-11-24 16:19:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     2|2020-11-24 16:19:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     3|2020-11-24 16:19:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     4|2020-11-24 16:20:...|            0|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
only showing top 5 rows



In [6]:
stringified_smoke_df = smoke_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
stringified_smoke_df.show(5,False)

+---+--------------------------------------------------+
|key|value                                             |
+---+--------------------------------------------------+
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606234762}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606234773}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606234783}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606234793}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606234803}|
+---+--------------------------------------------------+
only showing top 5 rows



In [8]:
from pyspark.sql.types import *

smoke_schema = StructType([
    StructField("sensor", StringType(), True),
    StructField("smoke", BooleanType(), True),
    StructField("ts", TimestampType(), True)])

In [9]:
decoded_smoke_df = stringified_smoke_df.select(col("key").cast("string"),from_json(col("value"), smoke_schema).alias("value"))

In [10]:
decoded_smoke_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- sensor: string (nullable = true)
 |    |-- smoke: boolean (nullable = true)
 |    |-- ts: timestamp (nullable = true)



In [20]:
decoded_smoke_df.select("value.*").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 16:19:22|
|    S1|false|2020-11-24 16:19:33|
|    S1|false|2020-11-24 16:19:43|
|    S1|false|2020-11-24 16:19:53|
|    S1|false|2020-11-24 16:20:03|
+------+-----+-------------------+
only showing top 5 rows



## DEMO
Please refer to [Gitter](https://gitter.im/USDE2020/EPL) for the EPL version of the following queries.

In [12]:
streaming_smoke_df = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("startingOffsets", "earliest")
  .option("subscribe", smoke_topic)
  .load())

In [13]:
decoded_streaming_smoke_df=(streaming_smoke_df
                      .select(from_json(col("value").cast("string"), smoke_schema).alias("value"))
                      .select("value.*"))

In [14]:
decoded_streaming_smoke_df.printSchema()

root
 |-- sensor: string (nullable = true)
 |-- smoke: boolean (nullable = true)
 |-- ts: timestamp (nullable = true)



In [15]:
smoke_query = (decoded_streaming_smoke_df
    .writeStream
    .format("memory")
    .queryName("SmokeSensorEvent")
    .start())

In [23]:
spark.sql("SELECT * FROM SmokeSensorEvent ORDER BY TS DESC").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 16:39:14|
|    S1|false|2020-11-24 16:39:04|
|    S1|false|2020-11-24 16:38:54|
|    S1|false|2020-11-24 16:38:44|
|    S1|false|2020-11-24 16:38:34|
+------+-----+-------------------+
only showing top 5 rows



In [21]:
temperarture_schema = StructType([
    StructField("sensor", StringType(), True),
    StructField("temperature", DoubleType(), True),
    StructField("ts", TimestampType(), True)])

streaming_temperature_df = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("startingOffsets", "earliest")
  .option("subscribe", temperature_topic)
  .load())

decoded_streaming_temperature_df = (streaming_temperature_df
                      .select(from_json(col("value").cast("string"), temperarture_schema).alias("value"))
                      .select("value.*"))

temperature_query = (decoded_streaming_temperature_df
                     .writeStream
                     .format("memory")
                     .queryName("TemperatureSensorEvent")
                     .start())

In [22]:
streaming_temperature_df.select(from_json(col("value").cast("string"), temperarture_schema).alias("value")).select("value.*").printSchema()

root
 |-- sensor: string (nullable = true)
 |-- temperature: double (nullable = true)
 |-- ts: timestamp (nullable = true)



## Q0 - Filter

In [24]:
spark.sql("SELECT * FROM TemperatureSensorEvent WHERE temperature > 20").show(5)

+------+------------------+-------------------+
|sensor|       temperature|                 ts|
+------+------------------+-------------------+
|    S1|20.007560270758102|2020-11-24 16:19:59|
|    S1| 20.09222114500581|2020-11-24 16:20:10|
|    S1| 20.50689495592087|2020-11-24 16:20:20|
|    S1|20.763614705089402|2020-11-24 16:20:30|
|    S1|21.459574875521923|2020-11-24 16:21:10|
+------+------------------+-------------------+
only showing top 5 rows



## Q1 - Filter

In [25]:
spark.sql("SELECT * FROM SmokeSensorEvent").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 16:19:22|
|    S1|false|2020-11-24 16:19:33|
|    S1|false|2020-11-24 16:19:43|
|    S1|false|2020-11-24 16:19:53|
|    S1|false|2020-11-24 16:20:03|
+------+-----+-------------------+
only showing top 5 rows



In [28]:
spark.sql("SELECT * FROM SmokeSensorEvent WHERE smoke").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1| true|2020-11-24 16:40:39|
|    S1| true|2020-11-24 16:40:49|
|    S1| true|2020-11-24 16:40:59|
+------+-----+-------------------+



## Q2 - Avg

In [30]:
spark.sql("""
SELECT SENSOR, AVG(temperature) 
FROM TemperatureSensorEvent
GROUP BY SENSOR
""").show()

+------+-----------------+
|SENSOR| avg(temperature)|
+------+-----------------+
|    S1|19.87929456823966|
+------+-----------------+



## Q3 - Logical Sliding Window

**Not supported**

## Q4 - Logical Tumbling Window

In [31]:
LTW_temperature_query = (decoded_streaming_temperature_df
                         .withWatermark("TS", "1 minutes")
                         .groupBy(window("TS", "1 minutes"),"SENSOR")
                         .avg("TEMPERATURE")
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [34]:
spark.sql("SELECT * FROM results ORDER BY window DESC").show(5,False)

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2020-11-24 16:42:00, 2020-11-24 16:43:00]|S1    |19.66152089883474 |
|[2020-11-24 16:41:00, 2020-11-24 16:42:00]|S1    |19.442302437875963|
|[2020-11-24 16:40:00, 2020-11-24 16:41:00]|S1    |19.680136023966956|
|[2020-11-24 16:39:00, 2020-11-24 16:40:00]|S1    |20.018839642142996|
|[2020-11-24 16:38:00, 2020-11-24 16:39:00]|S1    |19.976101350750596|
+------------------------------------------+------+------------------+
only showing top 5 rows



In [33]:
LTW_temperature_query.stop()

## Q5 - Physical Sliding Window

**Not supported**

## Q6 - Physical Tumbling Window

**Not supported**

## Q7 - Logical Hopping Window

In [35]:
LHW_temperature_query = (decoded_streaming_temperature_df
                         .withWatermark("TS", "1 minutes")
                         .groupBy(window("TS", "1 minutes", "5 seconds"),"SENSOR")
                         .avg("TEMPERATURE")
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [37]:
spark.sql("SELECT * FROM results ORDER BY window DESC").show(5,False)

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2020-11-24 16:44:40, 2020-11-24 16:45:40]|S1    |20.2465254488466  |
|[2020-11-24 16:44:35, 2020-11-24 16:45:35]|S1    |20.2465254488466  |
|[2020-11-24 16:44:30, 2020-11-24 16:45:30]|S1    |20.661297009034165|
|[2020-11-24 16:44:25, 2020-11-24 16:45:25]|S1    |20.661297009034165|
|[2020-11-24 16:44:20, 2020-11-24 16:45:20]|S1    |20.861393843699165|
+------------------------------------------+------+------------------+
only showing top 5 rows



note the duplicates

In [38]:
LHW_temperature_query.stop()

## Q8 - Stream-to-Stream Join

NOTE: this stream-to-stream join is equivalent to the EPL pattern `every a = SmokeSensorEvent(smoke=true) -> every TemperatureSensorEvent(temperature > 50, sensor=a.sensor) where timer:within(1 min)`. Do not expect the same performances! It is evaluated differently.

Apply watermarks on event-time columns and other filters

In [40]:
last_minute_smoke_events = (decoded_streaming_smoke_df
                .withWatermark("ts", "1 minute")
                .filter(col("smoke") == True)
               )

last_minute_high_temperature_events = (decoded_streaming_temperature_df
                .withWatermark("ts", "1 minute")
                .filter(col("temperature") > 50)
               )

Join with event-time constraints

In [41]:
join_df = (last_minute_smoke_events.join(
  last_minute_high_temperature_events,
    (last_minute_smoke_events.sensor == last_minute_high_temperature_events.sensor) &
    (last_minute_smoke_events.ts < last_minute_high_temperature_events.ts))
           .select(last_minute_smoke_events.sensor,
                   last_minute_smoke_events.smoke,
                   last_minute_high_temperature_events.temperature,
                   last_minute_smoke_events.ts
                  ))

In [42]:
s_to_s_join_query = (join_df
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [45]:
spark.sql("SELECT * FROM results ORDER BY ts DESC").show(5,False)

+------+-----+------------------+-------------------+
|sensor|smoke|temperature       |ts                 |
+------+-----+------------------+-------------------+
|S1    |true |55.742657075291305|2020-11-24 16:54:10|
|S1    |true |53.51279575442952 |2020-11-24 16:54:00|
|S1    |true |55.742657075291305|2020-11-24 16:54:00|
|S1    |true |53.51279575442952 |2020-11-24 16:53:50|
|S1    |true |55.832399109094005|2020-11-24 16:53:50|
+------+-----+------------------+-------------------+
only showing top 5 rows



**IMPORTANT** To detect fire, run the appropriate cells in the data generators.

In [46]:
s_to_s_join_query.stop()

## Q9 - Count FireEvent

In [47]:
Count_Fire_Event_query = (join_df
                     .withWatermark("TS", "1 minutes")
                     .groupBy(window("TS", "1 minutes", "30 seconds"),"SENSOR")
                     .count()
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [49]:
spark.sql("SELECT * FROM results ORDER BY window DESC").show(100,False)

+------------------------------------------+------+-----+
|window                                    |SENSOR|count|
+------------------------------------------+------+-----+
|[2020-11-24 16:54:00, 2020-11-24 16:55:00]|S1    |69   |
|[2020-11-24 16:53:30, 2020-11-24 16:54:30]|S1    |86   |
|[2020-11-24 16:53:00, 2020-11-24 16:54:00]|S1    |95   |
|[2020-11-24 16:52:30, 2020-11-24 16:53:30]|S1    |96   |
|[2020-11-24 16:52:00, 2020-11-24 16:53:00]|S1    |96   |
|[2020-11-24 16:51:30, 2020-11-24 16:52:30]|S1    |96   |
|[2020-11-24 16:51:00, 2020-11-24 16:52:00]|S1    |96   |
|[2020-11-24 16:50:30, 2020-11-24 16:51:30]|S1    |96   |
|[2020-11-24 16:50:00, 2020-11-24 16:51:00]|S1    |96   |
|[2020-11-24 16:49:30, 2020-11-24 16:50:30]|S1    |96   |
|[2020-11-24 16:49:00, 2020-11-24 16:50:00]|S1    |96   |
|[2020-11-24 16:48:30, 2020-11-24 16:49:30]|S1    |96   |
|[2020-11-24 16:48:00, 2020-11-24 16:49:00]|S1    |96   |
|[2020-11-24 16:47:30, 2020-11-24 16:48:30]|S1    |96   |
|[2020-11-24 1

In [50]:
Count_Fire_Event_query.stop()

## clean up

In [51]:
smoke_query.stop()

In [52]:
temperature_query.stop()