# Spark Structured Streaming - Demo
## Fire alarm

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import io
from pyspark.sql.functions import *
import time
import json
import struct
import requests 

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.5,org.apache.kafka:kafka-clients:2.6.0 pyspark-shell'
                                    
spark = (SparkSession.builder 
    .master("local[*]")
    .appName("test")
    .getOrCreate()
        )

spark

set up the environment variables

In [2]:
smoke_topic = 'SmokeSensorEvent'
temperature_topic = 'TemperatureSensorEvent'
servers = "kafka:9092"

## Understanding spark-kafka integration
Let's treat first kafka as a bulk source

In [3]:
smoke_df = (spark
  .read
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("subscribe", smoke_topic)
  .option("startingOffsets", "earliest")
  .option("endingOffsets", "latest")
  .load())

In [4]:
smoke_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
smoke_df.show(5)

+-------+--------------------+----------------+---------+------+--------------------+-------------+
|    key|               value|           topic|partition|offset|           timestamp|timestampType|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     0|2020-11-24 15:34:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     1|2020-11-24 15:34:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     2|2020-11-24 15:35:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     3|2020-11-24 15:35:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     4|2020-11-24 15:35:...|            0|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
only showing top 5 rows



In [6]:
stringified_smoke_df = smoke_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
stringified_smoke_df.show(5,False)

+---+--------------------------------------------------+
|key|value                                             |
+---+--------------------------------------------------+
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606232086}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606232097}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606232107}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606232117}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606232127}|
+---+--------------------------------------------------+
only showing top 5 rows



In [7]:
from pyspark.sql.types import *

smoke_schema = StructType([
    StructField("sensor", StringType(), True),
    StructField("smoke", BooleanType(), True),
    StructField("ts", TimestampType(), True)])

In [8]:
decoded_smoke_df = stringified_smoke_df.select(col("key").cast("string"),from_json(col("value"), smoke_schema).alias("value"))

In [9]:
decoded_smoke_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- sensor: string (nullable = true)
 |    |-- smoke: boolean (nullable = true)
 |    |-- ts: timestamp (nullable = true)



In [10]:
decoded_smoke_df.select("value.*").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 15:34:46|
|    S1|false|2020-11-24 15:34:57|
|    S1|false|2020-11-24 15:35:07|
|    S1|false|2020-11-24 15:35:17|
|    S1|false|2020-11-24 15:35:27|
+------+-----+-------------------+
only showing top 5 rows



## DEMO
Please refer to [Gitter](https://gitter.im/USDE2020/EPL) for the EPL version of the following queries.

In [11]:
streaming_smoke_df = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("startingOffsets", "earliest")
  .option("subscribe", smoke_topic)
  .load())

In [13]:
decoded_streaming_smoke_df=(streaming_smoke_df
                      .select(from_json(col("value").cast("string"), smoke_schema).alias("value"))
                      .select("value.*"))

In [14]:
smoke_query = (decoded_streaming_smoke_df
    .writeStream
    .format("memory")
    .queryName("SmokeSensorEvent")
    .start())

In [15]:
spark.sql("SELECT * FROM SmokeSensorEvent LIMIT 5").show()

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 15:34:46|
|    S1|false|2020-11-24 15:34:57|
|    S1|false|2020-11-24 15:35:07|
|    S1|false|2020-11-24 15:35:17|
|    S1|false|2020-11-24 15:35:27|
+------+-----+-------------------+



In [16]:
temperarture_schema = StructType([
    StructField("sensor", StringType(), True),
    StructField("temperature", DoubleType(), True),
    StructField("ts", TimestampType(), True)])

streaming_temperature_df = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("startingOffsets", "earliest")
  .option("subscribe", temperature_topic)
  .load())

decoded_streaming_temperature_df = (streaming_temperature_df
                      .select(from_json(col("value").cast("string"), temperarture_schema).alias("value"))
                      .select("value.*"))

temperature_query = (decoded_streaming_temperature_df
                     .writeStream
                     .format("memory")
                     .queryName("TemperatureSensorEvent")
                     .start())

In [17]:
streaming_temperature_df.select(from_json(col("value").cast("string"), temperarture_schema).alias("value")).select("value.*").printSchema()

root
 |-- sensor: string (nullable = true)
 |-- temperature: double (nullable = true)
 |-- ts: timestamp (nullable = true)



## Q0 - Filter

In [18]:
spark.sql("SELECT * FROM TemperatureSensorEvent WHERE temperature > 20").show(5)

+------+------------------+-------------------+
|sensor|       temperature|                 ts|
+------+------------------+-------------------+
|    S1|21.731105455964585|2020-11-24 15:34:31|
|    S1| 20.45618833625183|2020-11-24 15:34:42|
|    S1| 20.81828476345006|2020-11-24 15:35:02|
|    S1|20.054285720118937|2020-11-24 15:35:12|
|    S1| 20.28286479862641|2020-11-24 15:35:42|
+------+------------------+-------------------+
only showing top 5 rows



## Q1 - Filter

In [19]:
spark.sql("SELECT * FROM SmokeSensorEvent").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 15:34:46|
|    S1|false|2020-11-24 15:34:57|
|    S1|false|2020-11-24 15:35:07|
|    S1|false|2020-11-24 15:35:17|
|    S1|false|2020-11-24 15:35:27|
+------+-----+-------------------+
only showing top 5 rows



In [21]:
spark.sql("SELECT * FROM SmokeSensorEvent WHERE smoke").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1| true|2020-11-24 15:40:26|
|    S1| true|2020-11-24 15:40:36|
+------+-----+-------------------+



## Q2 - Avg

In [22]:
spark.sql("""
SELECT SENSOR, AVG(temperature) 
FROM TemperatureSensorEvent
GROUP BY SENSOR
""").show()

+------+-----------------+
|SENSOR| avg(temperature)|
+------+-----------------+
|    S1|19.71852147462904|
+------+-----------------+



## Q3 - Logical Sliding Window

**Not supported**

## Q4 - Logical Tumbling Window

In [28]:
LTW_temperature_query = (decoded_streaming_temperature_df
                         .withWatermark("TS", "1 minutes")
                         .groupBy(window("TS", "1 minutes"),"SENSOR")
                         .avg("TEMPERATURE")
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [29]:
spark.sql("SELECT * FROM results ORDER BY window DESC").show(5,False)

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2020-11-24 15:34:00, 2020-11-24 15:35:00]|S1    |20.700160450359075|
|[2020-11-24 15:35:00, 2020-11-24 15:36:00]|S1    |19.71717485127529 |
|[2020-11-24 15:36:00, 2020-11-24 15:37:00]|S1    |19.01932177618036 |
|[2020-11-24 15:37:00, 2020-11-24 15:38:00]|S1    |19.511754399001067|
|[2020-11-24 15:38:00, 2020-11-24 15:39:00]|S1    |19.51523745625821 |
+------------------------------------------+------+------------------+
only showing top 5 rows



In [30]:
LTW_temperature_query.stop()

## Q5 - Physical Sliding Window

**Not supported**

## Q6 - Physical Tumbling Window

**Not supported**

## Q7 - Logical Hopping Window

In [31]:
LHW_temperature_query = (decoded_streaming_temperature_df
                         .withWatermark("TS", "1 minutes")
                         .groupBy(window("TS", "1 minutes", "5 seconds"),"SENSOR")
                         .avg("TEMPERATURE")
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [33]:
spark.sql("SELECT * FROM results ORDER BY window DESC").show(5,False)

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2020-11-24 15:33:35, 2020-11-24 15:34:35]|S1    |21.731105455964585|
|[2020-11-24 15:33:40, 2020-11-24 15:34:40]|S1    |21.731105455964585|
|[2020-11-24 15:33:45, 2020-11-24 15:34:45]|S1    |21.093646896108208|
|[2020-11-24 15:33:50, 2020-11-24 15:34:50]|S1    |21.093646896108208|
|[2020-11-24 15:33:55, 2020-11-24 15:34:55]|S1    |20.700160450359075|
+------------------------------------------+------+------------------+
only showing top 5 rows



note the duplicates

In [34]:
LHW_temperature_query.stop()

## Q8 - Stream-to-Stream Join

NOTE: this stream-to-stream join is equivalent to the EPL pattern `every a = SmokeSensorEvent(smoke=true) -> every TemperatureSensorEvent(temperature > 50, sensor=a.sensor) where timer:within(1 min)`. Do not expect the same performances! It is evaluated differently.

Apply watermarks on event-time columns and other filters

In [35]:
last_minute_smoke_events = (decoded_streaming_smoke_df
                .withWatermark("ts", "1 minute")
                .filter(col("smoke") == True)
               )

last_minute_high_temperature_events = (decoded_streaming_temperature_df
                .withWatermark("ts", "1 minute")
                .filter(col("temperature") > 50)
               )

Join with event-time constraints

In [36]:
join_df = (last_minute_smoke_events.join(
  last_minute_high_temperature_events,
    (last_minute_smoke_events.sensor == last_minute_high_temperature_events.sensor) &
    (last_minute_smoke_events.ts < last_minute_high_temperature_events.ts))
           .select(last_minute_smoke_events.sensor,
                   last_minute_smoke_events.smoke,
                   last_minute_high_temperature_events.temperature,
                   last_minute_smoke_events.ts
                  ))

In [37]:
s_to_s_join_query = (join_df
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [40]:
spark.sql("SELECT * FROM results").show(5,False)

+------+-----+------------------+-------------------+
|sensor|smoke|temperature       |ts                 |
+------+-----+------------------+-------------------+
|S1    |true |54.831176555315075|2020-11-24 15:40:26|
|S1    |true |54.831176555315075|2020-11-24 15:40:36|
|S1    |true |54.831176555315075|2020-11-24 15:40:46|
|S1    |true |54.831176555315075|2020-11-24 15:40:56|
|S1    |true |54.831176555315075|2020-11-24 15:41:06|
+------+-----+------------------+-------------------+
only showing top 5 rows



**IMPORTANT** To detect fire, run the appropriate cells in the data generators.

In [41]:
s_to_s_join_query.stop()

## Q9 - Count FireEvent

In [46]:
Count_Fire_Event_query = (join_df
                     .withWatermark("TS", "1 minutes")
                     .groupBy(window("TS", "1 minutes", "30 seconds"),"SENSOR")
                     .count()
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [58]:
spark.sql("SELECT * FROM results ORDER BY window DESC").show(100,False)

+------------------------------------------+------+-----+
|window                                    |SENSOR|count|
+------------------------------------------+------+-----+
|[2020-11-24 15:50:30, 2020-11-24 15:51:30]|S1    |63   |
|[2020-11-24 15:50:00, 2020-11-24 15:51:00]|S1    |63   |
|[2020-11-24 15:49:30, 2020-11-24 15:50:30]|S1    |63   |
|[2020-11-24 15:49:00, 2020-11-24 15:50:00]|S1    |63   |
|[2020-11-24 15:48:30, 2020-11-24 15:49:30]|S1    |63   |
|[2020-11-24 15:48:00, 2020-11-24 15:49:00]|S1    |63   |
|[2020-11-24 15:47:30, 2020-11-24 15:48:30]|S1    |63   |
|[2020-11-24 15:47:00, 2020-11-24 15:48:00]|S1    |63   |
|[2020-11-24 15:46:30, 2020-11-24 15:47:30]|S1    |63   |
|[2020-11-24 15:46:00, 2020-11-24 15:47:00]|S1    |63   |
|[2020-11-24 15:45:30, 2020-11-24 15:46:30]|S1    |63   |
|[2020-11-24 15:45:00, 2020-11-24 15:46:00]|S1    |63   |
|[2020-11-24 15:44:30, 2020-11-24 15:45:30]|S1    |63   |
|[2020-11-24 15:44:00, 2020-11-24 15:45:00]|S1    |78   |
|[2020-11-24 1

In [59]:
Count_Fire_Event_query.stop()

## clean up

In [60]:
smoke_query.stop()

In [61]:
temperature_query.stop()