# Spark Structured Streaming - Demo
## Fire alarm

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import io
from pyspark.sql.functions import *
import time
import json
import struct
import requests 

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.5,org.apache.kafka:kafka-clients:2.6.0 pyspark-shell'
                                    
spark = (SparkSession.builder 
    .master("local[*]")
    .appName("test")
    .getOrCreate()
        )

spark

set up the environment variables

In [2]:
smoke_topic = 'SmokeSensorEvent'
temperature_topic = 'TemperatureSensorEvent'
servers = "kafka:9092"

## Understanding spark-kafka integration
Let's treat first kafka as a bulk source

In [3]:
smoke_df = (spark
  .read
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("subscribe", smoke_topic)
  .option("startingOffsets", "earliest")
  .option("endingOffsets", "latest")
  .load())

In [4]:
smoke_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
smoke_df.show(5)

+-------+--------------------+----------------+---------+------+--------------------+-------------+
|    key|               value|           topic|partition|offset|           timestamp|timestampType|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     0|2021-10-17 13:10:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     1|2021-10-17 13:10:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     2|2021-10-17 13:10:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     3|2021-10-17 13:10:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     4|2021-10-17 13:10:...|            0|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
only showing top 5 rows



In [6]:
stringified_smoke_df = smoke_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
stringified_smoke_df.show(5,False)

+---+--------------------------------------------------+
|key|value                                             |
+---+--------------------------------------------------+
|S1 |{"sensor": "S1", "smoke": false, "ts": 1634476200}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1634476211}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1634476221}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1634476231}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1634476241}|
+---+--------------------------------------------------+
only showing top 5 rows



In [7]:
from pyspark.sql.types import *

smoke_schema = StructType([
    StructField("sensor", StringType(), True),
    StructField("smoke", BooleanType(), True),
    StructField("ts", TimestampType(), True)])

In [8]:
smoke_df = stringified_smoke_df.select(col("key").cast("string"),from_json(col("value"), smoke_schema).alias("value"))

In [9]:
smoke_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- sensor: string (nullable = true)
 |    |-- smoke: boolean (nullable = true)
 |    |-- ts: timestamp (nullable = true)



In [10]:
smoke_df.select("value.*").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2021-10-17 13:10:00|
|    S1|false|2021-10-17 13:10:11|
|    S1|false|2021-10-17 13:10:21|
|    S1|false|2021-10-17 13:10:31|
|    S1|false|2021-10-17 13:10:41|
+------+-----+-------------------+
only showing top 5 rows



## Let's explore Spark Structured Streaming by example
Please refer to [continuous-analytics-examples/epl_firealarm/readme.md](https://github.com/quantiaconsulting/continuous-analytics-examples/blob/master/epl_firealarm/readme.md) for the EPL version of the following queries.

### Let's create the streaming Data Frames using the data in the kafka topics

In [11]:
raw_streaming_smoke_df = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("startingOffsets", "earliest")
  .option("subscribe", smoke_topic)
  .load())

In [12]:
smoke_sdf=(raw_streaming_smoke_df
                      .select(from_json(col("value").cast("string"), smoke_schema).alias("value"))
                      .select("value.*"))

In [13]:
smoke_sdf.printSchema()

root
 |-- sensor: string (nullable = true)
 |-- smoke: boolean (nullable = true)
 |-- ts: timestamp (nullable = true)



In [14]:
temperarture_schema = StructType([
    StructField("sensor", StringType(), True),
    StructField("temperature", DoubleType(), True),
    StructField("ts", TimestampType(), True)])

raw_streaming_temperature_df = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("startingOffsets", "earliest")
  .option("subscribe", temperature_topic)
  .load())

temperature_sdf = (raw_streaming_temperature_df
                      .select(from_json(col("value").cast("string"), temperarture_schema).alias("value"))
                      .select("value.*"))

In [15]:
temperature_sdf.printSchema()

root
 |-- sensor: string (nullable = true)
 |-- temperature: double (nullable = true)
 |-- ts: timestamp (nullable = true)



### to gain some confidence, let's first inspect the content of the stream of smoke event

In [16]:
basic_query = (smoke_sdf
    .writeStream
    .format("memory") # this is for debug purpose only! DO NOT USE IN PRODUCTION
    .queryName("sinkTable")
    .start())

run the following cell to see the most recent content of the sinkTable

In [24]:
spark.sql("SELECT * FROM sinkTable ORDER BY TS DESC").show(5)

+------+-----+---+
|sensor|smoke| ts|
+------+-----+---+
+------+-----+---+



do not forget to stop queries that you are not using

In [25]:
basic_query.stop()

## Q0 - Filter

the temperature events whose temperature is greater than 20 °C (was 50 °C in EPL)

### the SQL style

In [26]:
# create a logic table on top of the streaming data frame
temperature_sdf.createTempView("TemperatureSensorEvent")

# write your query in SQL, register it and start it
q0 = (spark.sql("select * from TemperatureSensorEvent where temperature > 20")
                     .writeStream
                     .format("memory")
                     .queryName("sinkTable")
                     .start())

In [27]:
# look up the most recent results
spark.sql("SELECT * FROM sinkTable ORDER BY TS DESC").show(5)

+------+------------------+-------------------+
|sensor|       temperature|                 ts|
+------+------------------+-------------------+
|    S1|21.922499443193175|2021-10-17 13:17:57|
|    S1| 20.63290770113203|2021-10-17 13:17:37|
|    S1|21.815063486324263|2021-10-17 13:17:27|
|    S1| 21.21051796750092|2021-10-17 13:17:07|
|    S1| 20.10022509878042|2021-10-17 13:16:07|
+------+------------------+-------------------+
only showing top 5 rows



In [28]:
# clean up
q0.stop()
spark.catalog.dropTempView("TemperatureSensorEvent")

### The DataFrame style

In [29]:
q0bis = (temperature_sdf
                     .where("temperature > 20") # you can add anything that fits in a SQL where statemente 
                     .writeStream
                     .format("memory")
                     .queryName("sinkTable")
                     .start())

In [30]:
spark.sql("SELECT * FROM sinkTable ORDER BY TS DESC").show(5)

+------+------------------+-------------------+
|sensor|       temperature|                 ts|
+------+------------------+-------------------+
|    S1| 20.07658199571906|2021-10-17 13:18:17|
|    S1|21.922499443193175|2021-10-17 13:17:57|
|    S1| 20.63290770113203|2021-10-17 13:17:37|
|    S1|21.815063486324263|2021-10-17 13:17:27|
|    S1| 21.21051796750092|2021-10-17 13:17:07|
+------+------------------+-------------------+
only showing top 5 rows



In [31]:
q0bis.stop()

> NOTE: there was no need to
> * create a logic table on top of the streaming data frame
> * drop such a logic table

## Q1 - Avg

the average of all the temperature observation for each sensor up to the last event received

### the SQL sytyle

In [32]:
# create a logic table on top of the streaming data frame
temperature_sdf.createTempView("TemperatureSensorEvent") # this time we will not clean it up, because we use it in the next queries

**NOTE**: the following query gives *intentionally* an error

In [33]:
query_string = """
SELECT SENSOR, AVG(temperature) 
FROM TemperatureSensorEvent
GROUP BY SENSOR
"""

# write your query in SQL, register it and start it
q1 = (spark.sql(query_string)
                     .writeStream
                     .format("memory")
                     .queryName("sinkTable")
                     .start())

AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;;
Aggregate [SENSOR#195], [SENSOR#195, avg(temperature#196) AS avg(temperature)#453]
+- SubqueryAlias temperaturesensorevent
   +- Project [value#193.sensor AS sensor#195, value#193.temperature AS temperature#196, value#193.ts AS ts#197]
      +- Project [from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#180 as string), Some(Etc/UTC)) AS value#193]
         +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@a981a9, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@44aa2e1e, org.apache.spark.sql.util.CaseInsensitiveStringMap@24f684c4, [key#179, value#180, topic#181, partition#182, offset#183L, timestamp#184, timestampType#185], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@180f196a,kafka,List(),None,List(),None,Map(startingOffsets -> earliest, subscribe -> TemperatureSensorEvent, kafka.bootstrap.servers -> kafka:9092),None), kafka, [key#172, value#173, topic#174, partition#175, offset#176L, timestamp#177, timestampType#178]


The **append output mode** (i.e., the default one) is not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark, we need to use the **complete output mode**.

In [34]:
query_string = """
SELECT SENSOR, AVG(temperature) 
FROM TemperatureSensorEvent
GROUP BY SENSOR
"""

# write your query in SQL, register it and start it
q1 = (spark.sql(query_string)
                     .writeStream
                     .format("memory")
                     .outputMode("complete") # <-- CHANGE HERE
                     .queryName("sinkTable")
                     .start())

In [36]:
# look up the most recent results
spark.sql("SELECT * FROM sinkTable").show(5) # woithout ORDER BY TS DESC because the result in the table is already only the most recent

+------+-----------------+
|SENSOR| avg(temperature)|
+------+-----------------+
|    S1|20.12358327664434|
+------+-----------------+



In [37]:
# clean up
q1.stop()

### The DataFrame style

In [45]:
# write your query in SQL, register it and start it
q1bis = (temperature_sdf#.withWatermark("ts", "10 seconds") 
                     .groupBy("sensor")
                     .avg()
                     .writeStream
                     .format("memory")
                     .outputMode("complete") # 
                     .queryName("sinkTable")
                     .start())

In [47]:
# look up the most recent results
spark.sql("SELECT * FROM sinkTable").show(5) # woithout ORDER BY TS DESC because the result in the table is already only the most recent

+------+------------------+
|sensor|  avg(temperature)|
+------+------------------+
|    S1|20.146700353213802|
+------+------------------+



In [48]:
# clean up
q1bis.stop()

## Q2 - Logical Sliding Window

The average temperature observed by each sensor in the last 4 seconds

MEMO: the average should change as soon as the receive a new event

**Not supported**

## Q3 - Logical Tumbling Window

The average temperature of the last 30 seconds every 30 seconds (was 4 seconds in EPL)

NOTE: this query is not possibile in the SQL style

In [49]:
q3 = (temperature_sdf
                         .withWatermark("TS", "30 seconds")
                         .groupBy(window("TS", "30 seconds"),"SENSOR")
                         .avg("TEMPERATURE")
                     .writeStream
                     .format("memory")
                     .queryName("sinkTable")
                     .start())

In [50]:
spark.sql("SELECT * FROM sinkTable ORDER BY window DESC").show(5,False) # NOTE: here we order by window instead of ordering by timestamp# window instead of timestamp, again

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2021-10-17 13:18:30, 2021-10-17 13:19:00]|S1    |20.05881265071291 |
|[2021-10-17 13:18:00, 2021-10-17 13:18:30]|S1    |20.257708187951632|
|[2021-10-17 13:17:30, 2021-10-17 13:18:00]|S1    |20.679886087730136|
|[2021-10-17 13:17:00, 2021-10-17 13:17:30]|S1    |20.451175871597943|
|[2021-10-17 13:16:30, 2021-10-17 13:17:00]|S1    |19.09561258488085 |
+------------------------------------------+------+------------------+
only showing top 5 rows



In [51]:
q3.stop()

## Q4 - Physical Sliding Window

The moving average of the last 4 temperature events

**Not supported**

## Q5 - Physical Tumbling Window

The moving average of the last 4 temperature events every 4 events 

**Not supported**

## Q6 - Logical Hopping Window

The average temperature of the last 1 minute (was 4 seconds in EPL) every 5 seconds (was 2 seconds in EPL)

In [52]:
q6 = (temperature_sdf
      .withWatermark("TS", "1 minutes")
      .groupBy(window("TS", "1 minutes", "5 seconds"),"SENSOR")
      .avg("TEMPERATURE")
      .writeStream
      .format("memory")
      .queryName("sinkTable")
      .start())

In [53]:
spark.sql("SELECT * FROM sinkTable ORDER BY window DESC").show(5,False) # NOTE: here we order by window instead of ordering by timestamp

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2021-10-17 13:18:25, 2021-10-17 13:19:25]|S1    |20.433593778304285|
|[2021-10-17 13:18:20, 2021-10-17 13:19:20]|S1    |20.433593778304285|
|[2021-10-17 13:18:15, 2021-10-17 13:19:15]|S1    |20.325446587772678|
|[2021-10-17 13:18:10, 2021-10-17 13:19:10]|S1    |20.325446587772678|
|[2021-10-17 13:18:05, 2021-10-17 13:19:05]|S1    |20.15826041933227 |
+------------------------------------------+------+------------------+
only showing top 5 rows



note the duplicates. They are present because the query is evalauted every 5 seconds, but a new event arrives every 10 seconds.

In [54]:
q6.stop()

## Q7 - Stream-to-Stream Join

In EPL, at this point we moved on to the pattern matching part required to satisfy the information need, i.e., "find every smoke event followed by a temperature event whose temperature is above 50 °C within 2 minutes."

Spark Structured Streaming does not support the EPL's operator `->` (that reads as *followed by*. We need to use a stream-to-stream join.

Let's apply the watermarks on event-time columns and the other two filters.

In [57]:
last_minute_smoke_events = (smoke_sdf
                .withWatermark("ts", "2 minute")
                .where("smoke = True")
                .withColumnRenamed("sensor","sensorSmoke")
                .withColumnRenamed("ts","tsSmoke")
               )

last_minute_high_temperature_events = (temperature_sdf
                .withWatermark("ts", "2 minute")
                .where("temperature > 50")
                .withColumnRenamed("sensor","sensorTemp")
                .withColumnRenamed("ts","tsTemp")
               )

Join with event-time constraints

In [58]:
join_sdf = (last_minute_smoke_events.join(
  last_minute_high_temperature_events, expr("""
    (sensorTemp == sensorSmoke) AND
    (tsTemp > tsSmoke ) AND
    (tsTemp < tsSmoke + interval 2 minute )
    """
    )))

In [59]:
q7 = (join_sdf
                     .writeStream
                     .format("memory")
                     .queryName("sinkTable")
                     .start())

**IMPORTANT** To detect fire, run the appropriate cells in the data generators.

In [63]:
spark.sql("SELECT * FROM sinkTable ORDER BY tsTemp DESC").show(5,False) # note, I change ts in tsTemp

+-----------+-----+-------------------+----------+------------------+-------------------+
|sensorSmoke|smoke|tsSmoke            |sensorTemp|temperature       |tsTemp             |
+-----------+-----+-------------------+----------+------------------+-------------------+
|S1         |true |2021-10-17 13:22:23|S1        |55.820123183532424|2021-10-17 13:22:28|
+-----------+-----+-------------------+----------+------------------+-------------------+



let's have a look to the progesses

In [64]:
from IPython.display import clear_output
import json
while True:
    print(json.dumps(q7.lastProgress, indent=4))
    print(q7.status)
    time.sleep(1)
    clear_output(wait=True)
    

{
    "id": "df3922a5-62d8-4e52-9b07-5a58eceb5662",
    "runId": "ac7fea0b-15c4-4105-9d3c-0020303cab54",
    "name": "sinkTable",
    "timestamp": "2021-10-17T13:23:11.722Z",
    "batchId": 22,
    "numInputRows": 1,
    "inputRowsPerSecond": 0.27570995312930796,
    "processedRowsPerSecond": 0.27693159789531985,
    "durationMs": {
        "addBatch": 3500,
        "getBatch": 0,
        "latestOffset": 1,
        "queryPlanning": 86,
        "triggerExecution": 3611,
        "walCommit": 12
    },
    "eventTime": {
        "avg": "2021-10-17T13:23:08.000Z",
        "max": "2021-10-17T13:23:08.000Z",
        "min": "2021-10-17T13:23:08.000Z",
        "watermark": "2021-10-17T13:20:58.000Z"
    },
    "stateOperators": [
        {
            "numRowsTotal": 10,
            "numRowsUpdated": 1,
            "memoryUsedBytes": 329760,
            "customMetrics": {
                "loadedMapCacheHitCount": 8800,
                "loadedMapCacheMissCount": 0,
                "stateOnCurre

KeyboardInterrupt: 

to interrupt the execution of the cell, prese the square icon in the bar or choose *interrupt kernel* from the *kernel* dropdown menu

#### Discussion

> This query is equivalent to the EPL pattern `every a = SmokeSensorEvent(smoke=true) -> every TemperatureSensorEvent(temperature > 50, sensor=a.sensor) where timer:within(1 min)`. 
>
> Do not expect the same performances! It is evaluated as a relational join. Spark Structured Streaming lacks the specilized data structure of Esper.
>
> **It does not tame the torrent effect**, but this is expected! 
>
> Spark Structured Streaming is a Data Stream Management System meant to tame *flow that you cannot stop*

Counterintuitively, we can stop q7 because we only need the streaming Data Frame `join_sdf`. We do not need q7 to write its result in the in memory table.

In [65]:
q7.stop()

## Q8 - Count FireEvent

we are very close to the solution of the running example, we "just" need to count the number of events generated by the previous query over an hopping window of 1 minutes that slides every 30 seconds (was a sliding window of 10 secondsin EPL). 

So let's count the results of Q7. 

In [80]:
q8 = (join_sdf
                     .withWatermark("tsTemp", "1 minutes")
                     .groupBy(window("tsTemp", "1 minutes", "30 seconds"),"sensorTemp")
                     .count()
                     .writeStream
                     .format("memory")
                     .queryName("sinkTable") 
                     .start())

In [83]:
spark.sql("SELECT * FROM sinkTable ORDER BY window DESC").show(100,False)

+------------------------------------------+----------+-----+
|window                                    |sensorTemp|count|
+------------------------------------------+----------+-----+
|[2021-10-17 13:24:00, 2021-10-17 13:25:00]|S1        |71   |
|[2021-10-17 13:23:30, 2021-10-17 13:24:30]|S1        |62   |
|[2021-10-17 13:23:00, 2021-10-17 13:24:00]|S1        |45   |
|[2021-10-17 13:22:30, 2021-10-17 13:23:30]|S1        |27   |
|[2021-10-17 13:22:00, 2021-10-17 13:23:00]|S1        |10   |
|[2021-10-17 13:21:30, 2021-10-17 13:22:30]|S1        |1    |
+------------------------------------------+----------+-----+



In [84]:
q8.stop()