In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import io
from pyspark.sql.functions import *
import time
import json
import struct
import requests 

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.5,org.apache.kafka:kafka-clients:2.6.0 pyspark-shell'
                                    
spark = (SparkSession.builder 
    .master("local[*]")
    .appName("test")
    .getOrCreate()
        )

spark

In [2]:
smoke_topic = 'SmokeSensorEvent'
temperature_topic = 'TemperatureSensorEvent'
servers = "kafka:9092"

In [3]:
smoke_df = (spark
  .read
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("subscribe", smoke_topic)
  .option("startingOffsets", "earliest")
  .option("endingOffsets", "latest")
  .load())

In [4]:
smoke_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
smoke_df.show(5)

+-------+--------------------+----------------+---------+------+--------------------+-------------+
|    key|               value|           topic|partition|offset|           timestamp|timestampType|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     0|2020-11-24 14:14:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     1|2020-11-24 14:14:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     2|2020-11-24 14:14:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     3|2020-11-24 14:14:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     4|2020-11-24 14:14:...|            0|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
only showing top 5 rows



In [7]:
stringified_smoke_df = smoke_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
stringified_smoke_df.show(5,False)

+---+--------------------------------------------------+
|key|value                                             |
+---+--------------------------------------------------+
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606227254}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606227265}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606227275}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606227285}|
|S1 |{"sensor": "S1", "smoke": false, "ts": 1606227295}|
+---+--------------------------------------------------+
only showing top 5 rows



In [8]:
from pyspark.sql.types import *

smoke_schema = StructType([
    StructField("sensor", StringType(), True),
    StructField("smoke", BooleanType(), True),
    StructField("ts", TimestampType(), True)])

In [9]:
decoded_smoke_df = stringified_smoke_df.select(col("key").cast("string"),from_json(col("value"), smoke_schema).alias("value"))

In [10]:
decoded_smoke_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- sensor: string (nullable = true)
 |    |-- smoke: boolean (nullable = true)
 |    |-- ts: timestamp (nullable = true)



In [12]:
decoded_smoke_df.select("value.*").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 14:14:14|
|    S1|false|2020-11-24 14:14:25|
|    S1|false|2020-11-24 14:14:35|
|    S1|false|2020-11-24 14:14:45|
|    S1|false|2020-11-24 14:14:55|
+------+-----+-------------------+
only showing top 5 rows



In [50]:
streaming_smoke_df = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("startingOffsets", "earliest")
  .option("subscribe", smoke_topic)
  .load())

In [51]:
decoded_streaming_smoke_df=(streaming_smoke_df
                      .select(from_json(col("value").cast("string"), schema).alias("value"))
                      .select("value.*"))

In [52]:
smoke_query = (decoded_streaming_smoke_df
    .writeStream
    .format("memory")
    .queryName("SmokeSensorEvent")
    .start())

In [18]:
spark.sql("SELECT * FROM SmokeSensorEvent LIMIT 5").show()

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 14:14:14|
|    S1|false|2020-11-24 14:14:25|
|    S1|false|2020-11-24 14:14:35|
|    S1|false|2020-11-24 14:14:45|
|    S1|false|2020-11-24 14:14:55|
+------+-----+-------------------+



In [34]:
temperarture_schema = StructType([
    StructField("sensor", StringType(), True),
    StructField("temperature", DoubleType(), True),
    StructField("ts", TimestampType(), True)])

streaming_temperature_df = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", servers)
  .option("startingOffsets", "earliest")
  .option("subscribe", temperature_topic)
  .load())

decoded_streaming_temperature_df = (streaming_temperature_df
                      .select(from_json(col("value").cast("string"), temperarture_schema).alias("value"))
                      .select("value.*"))

temperature_query = (decoded_streaming_temperature_df
                     .writeStream
                     .format("memory")
                     .queryName("TemperatureSensorEvent")
                     .start())

In [28]:
streaming_temperature_df.select(from_json(col("value").cast("string"), temperarture_schema).alias("value")).select("value.*").printSchema()

root
 |-- sensor: string (nullable = true)
 |-- temperature: boolean (nullable = true)
 |-- ts: timestamp (nullable = true)



## Q0 - Filter

In [54]:
spark.sql("SELECT * FROM TemperatureSensorEvent WHERE temperature > 20").show(5)

+------+------------------+-------------------+
|sensor|       temperature|                 ts|
+------+------------------+-------------------+
|    S1|20.763870006867744|2020-11-24 14:22:07|
|    S1| 21.16533247355798|2020-11-24 14:22:18|
|    S1|20.943058476233357|2020-11-24 14:23:08|
|    S1| 21.67293464162423|2020-11-24 14:23:28|
|    S1| 20.61440334054661|2020-11-24 14:23:38|
+------+------------------+-------------------+
only showing top 5 rows



## Q1 - Filter

In [48]:
spark.sql("SELECT * FROM SmokeSensorEvent").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2020-11-24 14:14:14|
|    S1|false|2020-11-24 14:14:25|
|    S1|false|2020-11-24 14:14:35|
|    S1|false|2020-11-24 14:14:45|
|    S1|false|2020-11-24 14:14:55|
+------+-----+-------------------+
only showing top 5 rows



In [53]:
spark.sql("SELECT * FROM SmokeSensorEvent WHERE smoke").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1| true|2020-11-24 14:37:05|
|    S1| true|2020-11-24 14:37:15|
|    S1| true|2020-11-24 14:37:25|
|    S1| true|2020-11-24 14:37:35|
|    S1| true|2020-11-24 14:37:45|
+------+-----+-------------------+
only showing top 5 rows



## Q2 - Avg

In [60]:
spark.sql("""
SELECT SENSOR, AVG(temperature) 
FROM TemperatureSensorEvent
GROUP BY SENSOR
""").show()

+------+------------------+
|SENSOR|  avg(temperature)|
+------+------------------+
|    S1|25.755518607259987|
+------+------------------+



## Q3 - Logical Sliding Window

**Not supported**

## Q4 - Logical Tumbling Window

In [76]:
LTW_temperature_query = (decoded_streaming_temperature_df
                         .withWatermark("TS", "10 minutes")
                         .groupBy(window("TS", "1 minutes"),"SENSOR")
                         .avg("TEMPERATURE")
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [78]:
spark.sql("SELECT * FROM results ORDER BY window").show(5,False)

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2020-11-24 14:40:00, 2020-11-24 14:41:00]|S1    |54.78537965704618 |
|[2020-11-24 14:29:00, 2020-11-24 14:30:00]|S1    |19.656529885025204|
|[2020-11-24 14:26:00, 2020-11-24 14:27:00]|S1    |20.253691894608895|
|[2020-11-24 14:34:00, 2020-11-24 14:35:00]|S1    |20.07524626663701 |
|[2020-11-24 14:36:00, 2020-11-24 14:37:00]|S1    |19.384828838569067|
+------------------------------------------+------+------------------+
only showing top 5 rows



In [79]:
LTW_temperature_query.stop()

## Q5 - Physical Sliding Window

**Not supported**

## Q6 - Physical Tumbling Window

**Not supported**

## Q7 - Logical Hopping Window

In [84]:
LHW_temperature_query = (decoded_streaming_temperature_df
                         .withWatermark("TS", "10 minutes")
                         .groupBy(window("TS", "1 minutes", "5 seconds"),"SENSOR")
                         .avg("TEMPERATURE")
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [87]:
spark.sql("SELECT * FROM results ORDER BY window").show(5,False)

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2020-11-24 14:21:10, 2020-11-24 14:22:10]|S1    |20.763870006867744|
|[2020-11-24 14:21:15, 2020-11-24 14:22:15]|S1    |20.763870006867744|
|[2020-11-24 14:21:20, 2020-11-24 14:22:20]|S1    |20.964601240212865|
|[2020-11-24 14:21:25, 2020-11-24 14:22:25]|S1    |20.964601240212865|
|[2020-11-24 14:21:30, 2020-11-24 14:22:30]|S1    |20.50070180868975 |
+------------------------------------------+------+------------------+
only showing top 5 rows



note the duplicates

In [88]:
LHW_temperature_query.stop()

## Q8 - Stream-to-Stream Join

NOTE: this stream-to-stream join is equivalent to the EPL pattern `every a = SmokeSensorEvent(smoke=true) -> every TemperatureSensorEvent(temperature > 50, sensor=a.sensor) where timer:within(1 min)`. Do not expect the same performances! It is evaluated differently.

Apply watermarks on event-time columns and other filters

In [97]:
last_minute_smoke_events = (decoded_streaming_smoke_df
                .withWatermark("ts", "1 minute")
                .filter(col("smoke") == True)
               )

last_minute_high_temperature_events = (decoded_streaming_temperature_df
                .withWatermark("ts", "1 minute")
                .filter(col("temperature") > 50)
               )

Join with event-time constraints

In [105]:
join_df = (last_minute_smoke_events.join(
  last_minute_high_temperature_events,
    (last_minute_smoke_events.sensor == last_minute_high_temperature_events.sensor) &
    (last_minute_smoke_events.ts < last_minute_high_temperature_events.ts))
           .select(last_minute_smoke_events.sensor,
                   last_minute_smoke_events.smoke,
                   last_minute_high_temperature_events.temperature,
                   last_minute_smoke_events.ts
                  ))

In [101]:
s_to_s_join_query = (join_df
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [103]:
spark.sql("SELECT * FROM results").show(5,False)

+------+-----+-------------------+------+----------------+-------------------+
|sensor|smoke|ts                 |sensor|temperature     |ts                 |
+------+-----+-------------------+------+----------------+-------------------+
|S1    |true |2020-11-24 14:37:05|S1    |54.2981082858924|2020-11-24 14:38:18|
|S1    |true |2020-11-24 14:37:15|S1    |54.2981082858924|2020-11-24 14:38:18|
|S1    |true |2020-11-24 14:37:25|S1    |54.2981082858924|2020-11-24 14:38:18|
|S1    |true |2020-11-24 14:37:35|S1    |54.2981082858924|2020-11-24 14:38:18|
|S1    |true |2020-11-24 14:37:45|S1    |54.2981082858924|2020-11-24 14:38:18|
+------+-----+-------------------+------+----------------+-------------------+
only showing top 5 rows



**IMPORTANT** To detect fire, run the appropriate cells in the data generators.

In [110]:
s_to_s_join_query.stop()

## Q9 - Count FireEvent

In [111]:
Count_Fire_Event_query = (join_df
                     .withWatermark("TS", "10 minutes")
                     .groupBy(window("TS", "10 minutes", "1 minute"),"SENSOR")
                     .count()
                     .writeStream
                     .format("memory")
                     .queryName("results")
                     .start())

In [115]:
spark.sql("SELECT * FROM results ORDER BY window").show(5,False)

+------------------------------------------+------+-----+
|window                                    |SENSOR|count|
+------------------------------------------+------+-----+
|[2020-11-24 14:28:00, 2020-11-24 14:38:00]|S1    |1350 |
|[2020-11-24 14:29:00, 2020-11-24 14:39:00]|S1    |2690 |
|[2020-11-24 14:30:00, 2020-11-24 14:40:00]|S1    |3995 |
|[2020-11-24 14:31:00, 2020-11-24 14:41:00]|S1    |5264 |
|[2020-11-24 14:32:00, 2020-11-24 14:42:00]|S1    |6497 |
+------------------------------------------+------+-----+
only showing top 5 rows



In [116]:
Count_Fire_Event_query.stop()

## clean up

In [117]:
smoke_query.stop()

In [118]:
temperature_query.stop()