# Spark Structured Streaming - Demo
## Robotic Arm

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import io
from pyspark.sql.functions import *
import time
import json
import struct
import requests 

# Kafka connector packages that spark-submit downloads before the JVM starts.
# All Spark artifacts must target the same Scala binary version (_2.12) and the
# same Spark release (3.0.1): mixing in a _2.11 / 2.4.5 artifact puts two
# incompatible Scala builds on the classpath.
# NOTE(review): presumably the installed PySpark is 3.0.x built for Scala 2.12 - confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,'
    'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.1,'
    'org.apache.kafka:kafka-clients:2.6.0 '
    'pyspark-shell'
)

# Local session on all available cores; getOrCreate() reuses a session if one exists.
spark = (SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
        )

# Last expression of the cell: display the session summary.
spark

Set up the Kafka connection parameters (topic name and bootstrap servers).

In [2]:
# Kafka connection settings used by the streaming reader.
servers = "kafka:9092"  # bootstrap broker address (host:port)
topic = "RoboticArm"    # topic to subscribe to

### Let's create the streaming DataFrame using the data in the Kafka topic

In [3]:
# Streaming DataFrame over the Kafka topic; nothing is read until a query is started.
raw_RoboticArm_df = (
    spark.readStream
    .format("kafka")
    .option("subscribe", topic)
    .option("kafka.bootstrap.servers", servers)
    .option("startingOffsets", "earliest")  # start from the oldest available records
    .load()
)

In [4]:
# Print the columns exposed by the raw Kafka source DataFrame.
raw_RoboticArm_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
from pyspark.sql.types import *

# Schema used to parse the JSON payload of each Kafka record; every field is nullable.
RoboticArm_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("id", StringType()),
        ("status", StringType()),
        ("stressLevel", IntegerType()),
        ("ts", TimestampType()),
    ]
])


In [6]:
# Decode the binary Kafka value as a string, parse it as JSON with RoboticArm_schema,
# then flatten the resulting struct into top-level columns.
RoboticArm_sdf = (
    raw_RoboticArm_df
    .select(
        from_json(raw_RoboticArm_df["value"].cast("string"), RoboticArm_schema).alias("value")
    )
    .select("value.*")
)

In [8]:
# Print the columns obtained after parsing the JSON payload with RoboticArm_schema.
RoboticArm_sdf.printSchema()

root
 |-- id: string (nullable = true)
 |-- status: string (nullable = true)
 |-- stressLevel: integer (nullable = true)
 |-- ts: timestamp (nullable = true)



### To gain some confidence, let's first inspect the content of the stream of robotic arm events

In [9]:
# Write the parsed stream into an in-memory table named "sinkTable" so it can be queried with SQL.
basic_query = (
    RoboticArm_sdf.writeStream
    .queryName("sinkTable")
    .format("memory")  # this is for debug purpose only! DO NOT USE IN PRODUCTION
    .start()
)

In [10]:
# Show the 5 rows with the most recent ts values collected so far in the in-memory table.
spark.sql("SELECT * FROM sinkTable ORDER BY TS DESC").show(5)

+---+-----------+-----------+-------------------+
| id|     status|stressLevel|                 ts|
+---+-----------+-----------+-------------------+
|  2| movingGood|          9|2021-10-18 14:05:11|
|  2|goodGrasped|          5|2021-10-18 14:05:10|
|  2|      ready|          0|2021-10-18 14:05:08|
|  1| movingGood|          7|2021-10-18 14:05:08|
|  1|goodGrasped|          1|2021-10-18 14:05:06|
+---+-----------+-----------+-------------------+
only showing top 5 rows



Do not forget to stop the queries that you are no longer using.

In [11]:
# Stop the streaming query that feeds the in-memory table.
basic_query.stop()

## E2

Write a continuous query that emits the max stress for each arm.

### The SQL style

In [13]:
# Create a logical table on top of the streaming DataFrame.
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises an
# error when a view with the same name already exists.
RoboticArm_sdf.createOrReplaceTempView("RoboticArm") # this time we will not clean it up, because we use it in the next queries

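**NOTE**: without `outputMode("complete")`, the following query *intentionally* gives an error, because a streaming aggregation without a watermark is not supported in the default `append` output mode.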

In [14]:
# Max stress level per arm, expressed in SQL over the RoboticArm view.
query_string = """
SELECT id, MAX(stressLevel) 
FROM RoboticArm
GROUP BY id
"""

# Run the SQL text as a streaming query and publish its result as the
# in-memory table "sinkTable".
e2 = (
    spark.sql(query_string)
    .writeStream
    .queryName("sinkTable")
    .outputMode("complete")  # emit the whole aggregated result on every update
    .format("memory")
    .start()
)

In [15]:
# Look up the most recent results of the aggregation.
spark.sql("SELECT * FROM sinkTable").show(5) # no ORDER BY TS DESC: the query has no ts column, and in complete mode the table already holds only the latest result

+---+----------------+
| id|max(stressLevel)|
+---+----------------+
|  1|               7|
|  2|               9|
+---+----------------+



In [16]:
# Clean up: stop the SQL-style streaming query.
e2.stop()

### The DataFrame style

In [18]:
# Same aggregation as the SQL version, written with the DataFrame API.
e2bis = (
    RoboticArm_sdf
    .groupBy("id")
    # Name the column explicitly: a bare max() aggregates every numeric column,
    # so the output would change if the schema gained another numeric field.
    .max("stressLevel")
    .writeStream
    .format("memory")
    .outputMode("complete")  # emit the whole aggregated result on every update
    .queryName("sinkTable")
    .start()
)

In [19]:
# Look up the most recent results of the aggregation.
spark.sql("SELECT * FROM sinkTable").show(5) # without ORDER BY TS DESC because the result in the table is already only the most recent

+---+----------------+
| id|max(stressLevel)|
+---+----------------+
|  1|               7|
|  2|               9|
+---+----------------+



In [20]:
# Clean up: stop the DataFrame-style streaming query.
e2bis.stop()