## Initialize pyspark

In [1]:
# Standard library (each module imported exactly once).
import io
import json
import os
import sys

from pyspark.sql import *
from pyspark.sql.utils import StreamingQueryException

# Run the Spark driver and the Python workers on the same interpreter.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# Make modules under /class importable (presumably where initspark lives --
# confirm). Guarded so re-running the cell does not keep growing sys.path.
if '/class' not in sys.path:
    sys.path.append('/class')

# Kafka variables
brokers = 'localhost:9092'
kafka_topic = 'stocks-json'
receiver_sleep_time = 4

# Connect to Spark only once per kernel: if `sc` already exists (the cell is
# being re-run), keep the existing sc / spark / config.
if 'sc' not in locals():
    from initspark import initspark
    sc, spark, config = initspark()



initializing pyspark
:: loading settings :: url = jar:file:/usr/local/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-streaming-kafka-0-8_2.11 added as a dependency
org.mongodb.spark#mongo-spark-connector_2.11 added as a dependency
com.datastax.spark#spark-cassandra-connector_2.11 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-fe9c56c6-5c49-468e-9db3-ecb2941e3f67;1.0
	confs: [default]
	found org.apache.spark#spark-streaming-kafka-0-8_2.11;2.0.2 in central
	found org.apache.kafka#kafka_2.11;0.8.2.1 in central
	found org.scala-lang.modules#scala-xml_2.11;1.0.2 in central
	found com.yammer.metrics#metrics-core;2.2.0 in central
	found org.slf4j#slf4j-api;1.7.16 in spark-list
	found org.scala-lang.modules#scala-parser-combinators_2.11;1.0.2 in central
	found com.101tec#zkclient;0.3 in central
	found log4j#log4j;1.2.17 in spark-li

pyspark initialized


## Create a helper function to stream to a memory table.

In [3]:
def write_memory(df, queryname = 'debug', mode = "append"):
    """Start streaming `df` into the "memory" sink and return the started query.

    df        -- streaming DataFrame to write out
    queryname -- name given to the query; the notebook later selects from a
                 table of this name via spark.sql
    mode      -- output mode: complete, update or append
    """
    writer = df.writeStream
    writer = writer.format("memory")
    writer = writer.queryName(queryname)
    writer = writer.outputMode(mode)
    return writer.start()


## Define a streaming source and create a temp view to receive the results for debugging.

In [4]:
# Kafka source settings: subscribe to the topic from its earliest offset and
# do not fail the query when offsets may have been lost.
kafka_options = {
    "kafka.bootstrap.servers": brokers,
    "subscribe": kafka_topic,
    "startingOffsets": "earliest",
    "failOnDataLoss": False,
}
df = spark.readStream.format("kafka").options(**kafka_options).load()

# Decode the binary Kafka payload to text and upper-case it.
# (The same kind of projection could also be written with
# createOrReplaceTempView + spark.sql.)
df1 = df.selectExpr("UPPER(CAST(value AS STRING)) as value")

# Stream the projection into the in-memory table `debug1` for inspection.
debug1 = write_memory(df1, 'debug1')

## Query the memory stream as if it were a temporary view using `spark.sql`

In [6]:
spark.sql("select * from debug1").take(10)

[Row(value='{"EVENT_TIME": "2022-02-18 01:18:17", "SYMBOL": "AAPL", "PRICE": 268.36, "QUANTITY": 621}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:17", "SYMBOL": "GOOG", "PRICE": 287.68, "QUANTITY": 462}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:17", "SYMBOL": "MSFT", "PRICE": 253.72, "QUANTITY": 123}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:21", "SYMBOL": "MSFT", "PRICE": 164.02, "QUANTITY": 296}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:21", "SYMBOL": "AAPL", "PRICE": 233.43, "QUANTITY": 732}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:21", "SYMBOL": "GOOG", "PRICE": 175.03, "QUANTITY": 475}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:25", "SYMBOL": "GOOG", "PRICE": 137.65, "QUANTITY": 650}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:25", "SYMBOL": "AAPL", "PRICE": 256.29, "QUANTITY": 641}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:25", "SYMBOL": "MSFT", "PRICE": 177.63, "QUANTITY": 390}'),
 Row(value='{"EVENT_TIME": "2022-02-18 01:18:29", "SYMB

## You can stop and restart a memory stream whenever you like

In [None]:
debug1.stop()

In [None]:
debug1 = write_memory(df1, 'debug1')

In [None]:
spark.sql("select * from debug1").take(10)

## Spark SQL magic is also quite helpful.

In [7]:
%load_ext sparksql_magic
# pip install sparksql-magic

In [12]:
%%sparksql
select * from debug1 order by value limit 10

0
value
"{""EVENT_TIME"": ""2022-02-18 01:18:17"", ""SYMBOL"": ""AAPL"", ""PRICE"": 268.36, ""QUANTITY"": 621}"
"{""EVENT_TIME"": ""2022-02-18 01:18:17"", ""SYMBOL"": ""GOOG"", ""PRICE"": 287.68, ""QUANTITY"": 462}"
"{""EVENT_TIME"": ""2022-02-18 01:18:17"", ""SYMBOL"": ""MSFT"", ""PRICE"": 253.72, ""QUANTITY"": 123}"
"{""EVENT_TIME"": ""2022-02-18 01:18:21"", ""SYMBOL"": ""AAPL"", ""PRICE"": 233.43, ""QUANTITY"": 732}"
"{""EVENT_TIME"": ""2022-02-18 01:18:21"", ""SYMBOL"": ""GOOG"", ""PRICE"": 175.03, ""QUANTITY"": 475}"
"{""EVENT_TIME"": ""2022-02-18 01:18:21"", ""SYMBOL"": ""MSFT"", ""PRICE"": 164.02, ""QUANTITY"": 296}"
"{""EVENT_TIME"": ""2022-02-18 01:18:25"", ""SYMBOL"": ""AAPL"", ""PRICE"": 256.29, ""QUANTITY"": 641}"
"{""EVENT_TIME"": ""2022-02-18 01:18:25"", ""SYMBOL"": ""GOOG"", ""PRICE"": 137.65, ""QUANTITY"": 650}"
"{""EVENT_TIME"": ""2022-02-18 01:18:25"", ""SYMBOL"": ""MSFT"", ""PRICE"": 177.63, ""QUANTITY"": 390}"


## Stop a memory stream when you don't need it, as it can consume a lot of memory.

In [13]:
debug1.stop()

## Let's try reading AVRO

In [14]:
# Read the Avro schema (JSON text); the context manager closes the file
# instead of leaving the handle open until garbage collection.
with open("stock.avsc", "r") as schema_file:
    stock_schema = schema_file.read()
print('stock_schema', stock_schema)

# Ask the Avro reader for the schema it derives from that definition; the
# printed result below is the equivalent Spark StructType.
stock_struct = spark.read.format("avro").option("avroSchema", stock_schema).load().schema
print('stock_struct', stock_struct)


stock_schema {
    "namespace": "stock.avro",
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}
stock_struct StructType(List(StructField(event_time,StringType,true),StructField(symbol,StringType,true),StructField(price,FloatType,true),StructField(quantity,IntegerType,true)))


In [15]:
# Point the reader at the Avro topic this time.
brokers = 'localhost:9092'
kafka_topic = 'stocks-avro'
receiver_sleep_time = 4

# "latest": only messages produced after the query starts are read.
avro_source_options = {
    "kafka.bootstrap.servers": brokers,
    "subscribe": kafka_topic,
    "startingOffsets": "latest",
    "failOnDataLoss": False,
}
df = spark.readStream.format("kafka").options(**avro_source_options).load()
print('df', df)

df DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]


In [16]:
debug2 = write_memory(df, 'debug2')

In [19]:
%%sparksql
select timestamp, key, value from debug2 order by timestamp desc limit 10

0,1,2
timestamp,key,value
2022-02-19 03:54:37.857000,"bytearray(b'\xdaV\x8f\x16B\xf6H\x94\xbctZ""\x8fD[\xdb')","bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00-\xa7nw\xd73o\x968\x99C\xc2s#\x8bJ\x02>&2022-02-19 03:54:37\x08GOOGq\xdd\x8eC\xb6\x0e-\xa7nw\xd73o\x968\x99C\xc2s#\x8bJ')"
2022-02-19 03:54:36.897000,bytearray(b'\xc4j\n\xaa\x86\xe6B\x0b\xb6\xde\xd8\xb4\x08Wl '),"bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00\xa8\x98\x86/H\xf3B\xc5\x1a\x03Z\xad\xe7/S\x9a\x02>&2022-02-19 03:54:36\x08GOOG\x85k-C\xdc\t\xa8\x98\x86/H\xf3B\xc5\x1a\x03Z\xad\xe7/S\x9a')"
2022-02-19 03:54:36.890000,bytearray(b'\xfbnDK\xed\x8bH\xcc\x92\xe3+\x15\x95\xfd\xb3\x80'),"bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00\xb5\xad\xba\x1b ,\xfe!\xff)\xe8\xf1\x82ndV\x02>&2022-02-19 03:54:36\x08AAPL\xd7cwC\xa0\x05\xb5\xad\xba\x1b ,\xfe!\xff)\xe8\xf1\x82ndV')"
2022-02-19 03:54:36.846000,"bytearray(b'\xe2L""\xb5`0J\xa8\x87Bj%\xa2F\xe7\xe7')","bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00\x03A>\xe4c\x92E\x1f\xfa\xdfR!\xe1\xc7\xf9\xa2\x02>&2022-02-19 03:54:36\x08MSFT\xaeGKC\xec\x04\x03A>\xe4c\x92E\x1f\xfa\xdfR!\xe1\xc7\xf9\xa2')"
2022-02-19 03:54:34.062000,bytearray(b'3\x8e\xf7\xa1l.II\xb1*\x13(R%QF'),"bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00\xd0<\x0e\x96eY\xe0;=\xee""\xfd\xdd\xf1\xcb=\x02<&2022-02-19 03:54:34\x08MSFT\xd7#\xfcBZ\xd0<\x0e\x96eY\xe0;=\xee""\xfd\xdd\xf1\xcb=')"
2022-02-19 03:54:33.889000,bytearray(b'\x00\xb2\xcd\xe4\xd6\xddC\x16\xaeLC \x980\x8cQ'),"bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00\x87\xae\xafn0(b4\x12=\xab\x14\x17\xf9M\xcb\x02>&2022-02-19 03:54:33\x08AAPL)\x9cWC\xde\x05\x87\xae\xafn0(b4\x12=\xab\x14\x17\xf9M\xcb')"
2022-02-19 03:54:33.853000,bytearray(b'\x85\xfc\xac\xad5\x8cJ\xc5\xbc7\x0e\xd0pU\xc8 '),"bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00\xaf2hBM\x1c\xdb*+F\xfd5\xaf\xdb\x06+\x02>&2022-02-19 03:54:33\x08GOOG\x9aYAC\xd2\x04\xaf2hBM\x1c\xdb*+F\xfd5\xaf\xdb\x06+')"
2022-02-19 03:54:32.894000,bytearray(b'\xf2\x13B\x95\xc4uI\x12\x89\x08)V\x1eB\xf5\xf9'),"bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00\x04\xd4\x98\x1d\xab\xd35C$\xe3\x1d\xd1\xac\x80\x08^\x02>&2022-02-19 03:54:32\x08GOOG\xec\x91&C\x86\n\x04\xd4\x98\x1d\xab\xd35C$\xe3\x1d\xd1\xac\x80\x08^')"
2022-02-19 03:54:32.885000,bytearray(b'\x87w\xc9Tu\x96N\x96\xa2/aZ\xd3\xca\x1eN'),"bytearray(b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{""type"": ""record"", ""name"": ""Stock"", ""namespace"": ""stock.avro"", ""fields"": [{""type"": ""string"", ""name"": ""event_time""}, {""type"": ""string"", ""name"": ""symbol""}, {""type"": ""float"", ""name"": ""price""}, {""type"": ""int"", ""name"": ""quantity""}]}\x00\x1b!\xbc\x1bL\xe0-s*\xe2\xb85lI\xe7\x11\x02>&2022-02-19 03:54:32\x08AAPL\xe1:%C\xb6\x02\x1b!\xbc\x1bL\xe0-s*\xe2\xb85lI\xe7\x11')"


                                                                                

ConsumerRecord(topic='stocks-avro', partition=0, offset=40679, timestamp=1645243362535, timestamp_type=0, key=b'\xd6\x0cgMs<By\xb8\xcaR\x02\xe0\xfa\x93\x14', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J\x02>&2022-02-19 04:02:42\x08MSFT\xa4p\xdfB\xf0\t$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)
ConsumerRecord(topic='stocks-avro', partition=0, offset=40679, timestamp=1645243362535, timestamp_type=0, key=b'\xd6\x0cgMs<By\xb8\xcaR\x02\xe0\xfa\x93\x14', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J\x02>&2022-02-19 04:02:42\x08MSFT\xa4p\xdfB\xf0\t$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)

ConsumerRecord(topic='stocks-avro', partition=0, offset=40814, timestamp=1645243470645, timestamp_type=0, key=b'\xc8\xeb\xc2\xe9O\xaaJ\x86\x83\x85\xb9\xd7\xf46\xea\x8f', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00\xe8\xf5x\r\xbf\x8aC\x98&\xaf\x13iz\x9dp\x13\x02>&2022-02-19 04:04:30\x08MSFT\xc3\xf5\x11C\xac\x04\xe8\xf5x\r\xbf\x8aC\x98&\xaf\x13iz\x9dp\x13', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)


In [23]:
debug2.stop()

22/02/19 03:59:03 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@65c97253 is aborting.
22/02/19 03:59:03 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@65c97253 aborted.
22/02/19 03:59:03 ERROR Utils: Aborting task
org.apache.spark.TaskKilledException
	at org.apache.spark.TaskContextImpl.killTaskIfInterrupted(TaskContextImpl.scala:216)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:36)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at org.apache.spark.sql.execution.dataso

In [32]:
# Stop the query left over from a previous run of this cell so that the name
# 'debug3' can be reused below.
if 'debug3' in locals():
    debug3.stop()
    
from pyspark.sql.avro.functions import from_avro, to_avro
# Writer schema used to decode the Kafka value column (same fields as
# stock.avsc, without the namespace).
stock_schema = """{
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}"""

# Keep timestamp and key, and decode the binary value into a struct.
# NOTE(review): the debug3 output shows every decoded field as None. The
# debug2 output shows each payload starting with b'Obj\x01' followed by an
# embedded schema, which looks like the Avro object-container framing, whereas
# from_avro appears to expect bare Avro-encoded records. With mode PERMISSIVE a
# record that cannot be decoded presumably comes back as nulls instead of
# raising, which would explain the result -- TODO confirm and either change the
# producer to send bare records or decode the container explicitly.
df3 = df.select("timestamp", "key", from_avro(df.value, stock_schema, options = {"mode":"PERMISSIVE"}).alias("value"))
print('df3', df3)
debug3 = write_memory(df3, 'debug3')

df3 DataFrame[timestamp: timestamp, key: binary, value: struct<event_time:string,symbol:string,price:float,quantity:int>]


In [34]:
%%sparksql
select * from debug3 order by timestamp desc

0,1,2
timestamp,key,value
2022-02-19 04:09:26.907000,bytearray(b'\x8c<E\xa1\x1f7O\xfb\x86\xc2\x97\xadd\x8d\xe7B'),"Row(event_time=None, symbol=None, price=None, quantity=None)"
2022-02-19 04:09:26.739000,bytearray(b'\xe1c\xbc\xbb\x8e\x8fI\x93\xa6\xfcC\xcbM\xae\xb78'),"Row(event_time=None, symbol=None, price=None, quantity=None)"
2022-02-19 04:09:26.658000,bytearray(b'\xfd\xe6\x9d\x90\xc6\x9aM{\xac\x11\x93VE\x00H\x81'),"Row(event_time=None, symbol=None, price=None, quantity=None)"
2022-02-19 04:09:22.905000,bytearray(b'D\xb6)1=\x8aC\x88\xa2p]\x8as\xe4/k'),"Row(event_time=None, symbol=None, price=None, quantity=None)"
2022-02-19 04:09:22.734000,bytearray(b'\xa1Z\xa5w\x105I\xeb\xb7\x8c\x94\rE\xd5\x10}'),"Row(event_time=None, symbol=None, price=None, quantity=None)"
2022-02-19 04:09:22.653000,bytearray(b'}bV\xbe\xbc\xbbI|\xa4\xca\x9f\xc0\ny~\x8f'),"Row(event_time=None, symbol=None, price=None, quantity=None)"


                                                                                

In [35]:
debug3.stop()

## Let's read some data from the JSON stream and fix it up to make it more usable.

In [150]:
from pyspark.sql.functions import *
import uuid

# Back to the JSON topic.
brokers = 'localhost:9092'
kafka_topic = 'stocks-json'
receiver_sleep_time = 4

# Read the Avro schema (JSON text); the context manager closes the file
# instead of leaving the handle open until garbage collection.
with open("stock.avsc", "r") as schema_file:
    stock_schema = schema_file.read()
print('stock_schema', stock_schema)

# Ask the Avro reader for the schema it derives from that definition; the
# resulting StructType is reused below to parse the JSON payload.
stock_struct = spark.read.format("avro").option("avroSchema", stock_schema).load().schema
print('stock_struct', stock_struct)

# Subscribe from the earliest offset and do not fail the query when offsets
# may have been lost.
df = (spark.readStream 
    .format("kafka") 
    .option("kafka.bootstrap.servers", brokers) 
    .option("subscribe", kafka_topic) 
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
    # Optional, currently disabled: .option("kafka.group.id", "stock-json-spark-group")
    .load()
    )
print('df', df)


def convert_uuid(value):
    """Render a 16-byte message key as canonical UUID text.

    value -- bytes-like key (Spark passes the binary column to the UDF as a
             bytearray), or None when the message carries no key
    Returns the hyphenated UUID string, or None for a missing key.
    Raises ValueError (from uuid.UUID) if the key is not exactly 16 bytes long.
    """
    if value is None:
        # bytes(None) would raise TypeError inside the UDF; pass nulls through.
        return None
    return str(uuid.UUID(bytes = bytes(value)))

# Wrap convert_uuid as a Spark UDF. The return type is given as a DDL string,
# so this cell no longer relies on StringType being pulled in implicitly by a
# star import.
convert_uuid_udf = udf(convert_uuid, "string")

# Keep the key (rendered as UUID text) and the timestamp, and convert the
# value from bytes to string.
df1 = df.select(convert_uuid_udf(col("key")).alias("key"), "timestamp", expr("CAST(value AS STRING) as value"))
print('df1', df1)

# Cast the JSON string to a struct: keep all the columns selected so far,
# convert the JSON string into a struct column (value2) and remove the string
# version.
df2 = df1.select(*df1.columns, from_json(df1.value, stock_struct).alias("value2")).drop('value')
print('df2', df2)

# Flatten the struct into ordinary top-level columns.
df4 = df2.select(*(df2.columns), col("value2.*")).drop('value2')
print('df4', df4)

# Stop the query left over from a previous run of this cell so that the name
# 'debug4' can be reused.
if 'debug4' in locals():
    debug4.stop()

debug4 = write_memory(df4, 'debug4')


stock_schema {
    "namespace": "stock.avro",
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}
stock_struct StructType(List(StructField(event_time,StringType,true),StructField(symbol,StringType,true),StructField(price,FloatType,true),StructField(quantity,IntegerType,true)))
df DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]
df1 DataFrame[key: string, timestamp: timestamp, value: string]
df2 DataFrame[key: string, timestamp: timestamp, value2: struct<event_time:string,symbol:string,price:float,quantity:int>]
df4 DataFrame[key: string, timestamp: timestamp, event_time: string, symbol: string, price: float, quantity: int]


In [158]:
%%sparksql 
select * from debug4 order by event_time desc 

only showing top 20 row(s)


0,1,2,3,4,5
key,timestamp,event_time,symbol,price,quantity
a75b2000-988e-4334-a7c1-272e5f2e7553,2022-02-19 05:17:21.537000,2022-02-19 05:17:21,AAPL,134.86000061035156,96
7ea8125a-e910-4255-bba7-d4c4edacca33,2022-02-19 05:17:21.529000,2022-02-19 05:17:21,GOOG,213.97000122070312,97
b985e72a-6969-4577-8182-30d8f60996cf,2022-02-19 05:17:21.513000,2022-02-19 05:17:21,MSFT,182.24000549316406,308
af609968-cfc4-41ab-894e-f6b016839bde,2022-02-19 05:17:17.509000,2022-02-19 05:17:17,MSFT,273.6600036621094,413
d4ae6ca4-594d-4bb9-8283-841fa2b31a83,2022-02-19 05:17:17.533000,2022-02-19 05:17:17,AAPL,277.17999267578125,624
82d075e4-50b6-4d09-b5e0-b97465ce87e8,2022-02-19 05:17:17.525000,2022-02-19 05:17:17,GOOG,273.9599914550781,863
3425ce13-f4c5-4cff-955e-1e2f4df428b5,2022-02-19 05:17:13.521000,2022-02-19 05:17:13,GOOG,268.4100036621094,160
75467653-dcad-4801-a50f-9e72fb81aac9,2022-02-19 05:17:13.529000,2022-02-19 05:17:13,AAPL,248.05999755859375,474
db9944f4-5937-433f-a53c-a8ae0e40ac29,2022-02-19 05:17:13.505000,2022-02-19 05:17:13,MSFT,216.8000030517578,500


In [159]:
debug4.stop()
debug4 = write_memory(df4, 'debug4')


## Now that we have a normal DataFrame, let's manipulate it how we want and write the results out to another stream.

In [178]:
# Grouping key: non-overlapping 10-second windows over the Kafka timestamp.
ten_second_window = window("timestamp", "10 seconds").alias("window")
# `sum` is pyspark.sql.functions.sum (star-imported earlier), not the builtin.
total_quantity = sum("quantity").alias("sum")

# Total quantity per symbol per window, with a 10-second watermark on the
# timestamp column.
fixed_window = (
    df4.select("timestamp", "symbol", "quantity")
    .withWatermark("timestamp", "10 seconds")
    .groupBy(ten_second_window, "symbol")
    .agg(total_quantity)
)
print(fixed_window)

debug5 = write_memory(fixed_window, 'debug5')


DataFrame[window: struct<start:timestamp,end:timestamp>, symbol: string, sum: bigint]




## We can see that we get one aggregate per symbol for every ten-second window. This data can be written to a SQL or NoSQL database, or forwarded as a new message to create a streaming aggregation.

In [187]:
%%sparksql
select * from debug5 order by window desc, symbol limit 9

0,1,2
window,symbol,sum
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 30), end=datetime.datetime(2022, 2, 19, 5, 36, 40))",AAPL,818
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 30), end=datetime.datetime(2022, 2, 19, 5, 36, 40))",GOOG,952
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 30), end=datetime.datetime(2022, 2, 19, 5, 36, 40))",MSFT,1571
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 20), end=datetime.datetime(2022, 2, 19, 5, 36, 30))",AAPL,781
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 20), end=datetime.datetime(2022, 2, 19, 5, 36, 30))",GOOG,1138
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 20), end=datetime.datetime(2022, 2, 19, 5, 36, 30))",MSFT,724
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 10), end=datetime.datetime(2022, 2, 19, 5, 36, 20))",AAPL,2227
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 10), end=datetime.datetime(2022, 2, 19, 5, 36, 20))",GOOG,1639
"Row(start=datetime.datetime(2022, 2, 19, 5, 36, 10), end=datetime.datetime(2022, 2, 19, 5, 36, 20))",MSFT,647


In [188]:
debug5.stop()

                                                                                

## Sliding windows are similar, except that you give them two parameters: the first is the total length of the window and the second is the refresh interval. In this case the windows will overlap.

In [173]:
# Grouping key: 30-second windows that start every 10 seconds, so consecutive
# windows overlap.
overlapping_window = window("timestamp", "30 seconds", "10 seconds").alias("window")
# `sum` is pyspark.sql.functions.sum (star-imported earlier), not the builtin.
total_quantity = sum("quantity").alias("sum")

# Total quantity per symbol per window, with a 10-second watermark on the
# timestamp column.
sliding_window = (
    df4.select("timestamp", "symbol", "quantity")
    .withWatermark("timestamp", "10 seconds")
    .groupBy(overlapping_window, "symbol")
    .agg(total_quantity)
)
print(sliding_window)

debug6 = write_memory(sliding_window, 'debug6')


DataFrame[key: string, timestamp: timestamp, event_time: string, symbol: string, price: float, quantity: int]




In [176]:
%%sparksql
select * from debug6 order by window desc, symbol limit 21


                                                                                

only showing top 20 row(s)


0,1,2
window,symbol,sum
"Row(start=datetime.datetime(2022, 2, 19, 5, 27, 20), end=datetime.datetime(2022, 2, 19, 5, 27, 50))",AAPL,3332
"Row(start=datetime.datetime(2022, 2, 19, 5, 27, 20), end=datetime.datetime(2022, 2, 19, 5, 27, 50))",GOOG,3656
"Row(start=datetime.datetime(2022, 2, 19, 5, 27, 20), end=datetime.datetime(2022, 2, 19, 5, 27, 50))",MSFT,4305
"Row(start=datetime.datetime(2022, 2, 19, 5, 27, 10), end=datetime.datetime(2022, 2, 19, 5, 27, 40))",AAPL,5031
"Row(start=datetime.datetime(2022, 2, 19, 5, 27, 10), end=datetime.datetime(2022, 2, 19, 5, 27, 40))",GOOG,5215
"Row(start=datetime.datetime(2022, 2, 19, 5, 27, 10), end=datetime.datetime(2022, 2, 19, 5, 27, 40))",MSFT,4219
"Row(start=datetime.datetime(2022, 2, 19, 5, 27), end=datetime.datetime(2022, 2, 19, 5, 27, 30))",AAPL,4103
"Row(start=datetime.datetime(2022, 2, 19, 5, 27), end=datetime.datetime(2022, 2, 19, 5, 27, 30))",GOOG,4407
"Row(start=datetime.datetime(2022, 2, 19, 5, 27), end=datetime.datetime(2022, 2, 19, 5, 27, 30))",MSFT,3525


[Stage 4165:=>           (28 + 1) / 200][Stage 4167:>               (0 + 0) / 1]

In [None]:
debug6.stop()

22/02/19 05:29:15 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@5fd0f6ba is aborting.
22/02/19 05:29:15 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@5fd0f6ba aborted.
                                                                                

## A session window is similar, but is used to group data that represents a continuous stream of activity. The duration is the gap of inactivity after which a session is considered to have ended.

In [None]:
# The result is deliberately NOT named `session_window`: that name is the
# pyspark.sql.functions.session_window function called below. Rebinding it to
# a DataFrame makes the next run of this cell fail with
# "'DataFrame' object is not callable".
# Sessions close after a 5-minute gap; `sum` is pyspark.sql.functions.sum.
session_totals = (df4.select("timestamp", "symbol","quantity")
        .withWatermark("timestamp", "10 seconds") 
        .groupBy(session_window("timestamp", "5 minutes").alias("window"), "symbol") 
        .agg(sum("quantity").alias("sum"))
        )
print(session_totals)

debug7 = write_memory(session_totals, 'debug7')


## Let's join the streaming aggregation with a static reference table.

In [189]:
# Static reference data: ticker symbol -> company name.
symbol_names = [('AAPL', 'Apple'), ('MSFT', 'Microsoft'), ('GOOG','Google')]
x = sc.parallelize(symbol_names)
stocks = spark.createDataFrame(x, 'symbol:string, name:string')

# Register both sides as temp views so that they can be joined in SQL:
# `stocks` is the static table, `trades` is the streaming fixed-window aggregate.
stocks.createOrReplaceTempView('stocks')
fixed_window.createOrReplaceTempView('trades')

# Add the company name to every aggregated row.
join_sql = """
SELECT t.*, s.name
FROM trades as t
JOIN stocks as s on t.symbol = s.symbol
"""
joined_aggregate = spark.sql(join_sql)

debug8 = write_memory(joined_aggregate, 'debug8')





In [195]:
%%sparksql
select * from debug8 order by window desc, symbol limit 9


                                                                                

0,1,2,3
window,symbol,sum,name
"Row(start=datetime.datetime(2022, 2, 19, 5, 44, 10), end=datetime.datetime(2022, 2, 19, 5, 44, 20))",AAPL,2051,Apple
"Row(start=datetime.datetime(2022, 2, 19, 5, 44, 10), end=datetime.datetime(2022, 2, 19, 5, 44, 20))",GOOG,1442,Google
"Row(start=datetime.datetime(2022, 2, 19, 5, 44, 10), end=datetime.datetime(2022, 2, 19, 5, 44, 20))",MSFT,2047,Microsoft
"Row(start=datetime.datetime(2022, 2, 19, 5, 44), end=datetime.datetime(2022, 2, 19, 5, 44, 10))",AAPL,1490,Apple
"Row(start=datetime.datetime(2022, 2, 19, 5, 44), end=datetime.datetime(2022, 2, 19, 5, 44, 10))",GOOG,554,Google
"Row(start=datetime.datetime(2022, 2, 19, 5, 44), end=datetime.datetime(2022, 2, 19, 5, 44, 10))",MSFT,454,Microsoft
"Row(start=datetime.datetime(2022, 2, 19, 5, 43, 50), end=datetime.datetime(2022, 2, 19, 5, 44))",AAPL,1776,Apple
"Row(start=datetime.datetime(2022, 2, 19, 5, 43, 50), end=datetime.datetime(2022, 2, 19, 5, 44))",GOOG,1533,Google
"Row(start=datetime.datetime(2022, 2, 19, 5, 43, 50), end=datetime.datetime(2022, 2, 19, 5, 44))",MSFT,2030,Microsoft




In [196]:
debug8.stop()

22/02/19 05:45:07 ERROR TorrentBroadcast: Store broadcast broadcast_4933 fail, remove all pieces of the broadcast
