## Initialize pyspark

In [None]:
import os, sys, json, io
from pyspark.sql import *
from pyspark.sql.utils import StreamingQueryException
import sys
import json

# Tell PySpark which interpreter to use for the workers and for the driver.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# Make /class importable (presumably where initspark lives -- confirm).
# Guarded so that re-running this cell does not keep appending duplicates.
if '/class' not in sys.path:
    sys.path.append('/class')

# Kafka variables
brokers = 'localhost:9092'
kafka_topic = 'stocks-json'
receiver_sleep_time = 4

# Connect to Spark only once; later runs of this cell reuse the existing handles.
if 'sc' not in locals():
    from initspark import initspark
    sc, spark, config = initspark()



## Create a helper function to stream to a memory table.

In [None]:
def write_memory(df, queryname = 'debug', mode = "append"):
    """Start streaming ``df`` into a memory sink and return the started query.

    ``queryname`` is the name given to the streaming query (the notebook
    later reads from a table of the same name via ``spark.sql``).
    ``mode`` is the output mode: complete, update, or append.
    """
    writer = df.writeStream.format("memory")
    writer = writer.queryName(queryname)
    writer = writer.outputMode(mode)
    return writer.start()


## Define a streaming source and create a temp view to receive the results for debugging.

In [None]:
# Streaming Kafka reader for the configured topic, starting from the earliest
# available offset and not failing the query if data has been lost.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": brokers,
        "subscribe": kafka_topic,
        "startingOffsets": "earliest",
        "failOnDataLoss": False,
    })
    .load()
)

# SQL-based alternative, kept for reference:
# df.createOrReplaceTempView('table')
# df1 = spark.sql("""SELECT 'new data' as newfield, * from table""")

# Decode the binary Kafka value to a string and upper-case it.
df1 = df.selectExpr("UPPER(CAST(value AS STRING)) as value")

# Stream the result into the in-memory table 'debug1' for inspection.
debug1 = write_memory(df1, 'debug1')

## Query the memory stream as if it were a temporary view using `spark.sql`

In [None]:
# Fetch up to 10 rows from the table named after the 'debug1' query.
spark.sql("select * from debug1").take(10)

## You can stop and restart a memory stream whenever you like

In [None]:
# Stop the streaming query held in debug1.
debug1.stop()

In [None]:
# Start a new memory stream over df1, reusing the query name 'debug1'.
debug1 = write_memory(df1, 'debug1')

In [None]:
# Read up to 10 rows from 'debug1' again, now fed by the restarted query.
spark.sql("select * from debug1").take(10)

## Spark SQL magic is also quite helpful.

In [None]:
%load_ext sparksql_magic
# pip install sparksql-magic

In [None]:
%%sparksql
select * from debug1 order by value limit 10

## Stop a memory stream when you don't need it, as it can consume a lot of memory.

In [None]:
# Done with 'debug1': stop the streaming query.
debug1.stop()

## Let's try reading AVRO

In [None]:
# Read the Avro schema text from disk; the context manager guarantees the
# file handle is closed (the bare open(...).read() left it open).
with open("stock.avsc", "r") as schema_file:
    stock_schema = schema_file.read()
print('stock_schema', stock_schema)

# Take the Spark schema from an Avro reader configured with that Avro schema
# (load() is called without a path; only .schema is used).
stock_struct = spark.read.format("avro").option("avroSchema", stock_schema).load().schema
print('stock_struct', stock_struct)


In [None]:
# Point the Kafka settings at the Avro topic.
brokers = 'localhost:9092'
kafka_topic = 'stocks-avro'
receiver_sleep_time = 4

# Streaming Kafka reader that starts from the latest offsets.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": brokers,
        "subscribe": kafka_topic,
        "startingOffsets": "latest",
        "failOnDataLoss": False,
    })
    .load()
)
print('df', df)

In [None]:
# Stream the raw (undecoded) Kafka records into the in-memory table 'debug2'.
debug2 = write_memory(df, 'debug2')

In [None]:
%%sparksql
select timestamp, key, value from debug2 order by timestamp desc limit 10

ConsumerRecord(topic='stocks-avro', partition=0, offset=40679, timestamp=1645243362535, timestamp_type=0, key=b'\xd6\x0cgMs<By\xb8\xcaR\x02\xe0\xfa\x93\x14', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J\x02>&2022-02-19 04:02:42\x08MSFT\xa4p\xdfB\xf0\t$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)
ConsumerRecord(topic='stocks-avro', partition=0, offset=40679, timestamp=1645243362535, timestamp_type=0, key=b'\xd6\x0cgMs<By\xb8\xcaR\x02\xe0\xfa\x93\x14', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J\x02>&2022-02-19 04:02:42\x08MSFT\xa4p\xdfB\xf0\t$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)

ConsumerRecord(topic='stocks-avro', partition=0, offset=40814, timestamp=1645243470645, timestamp_type=0, key=b'\xc8\xeb\xc2\xe9O\xaaJ\x86\x83\x85\xb9\xd7\xf46\xea\x8f', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00\xe8\xf5x\r\xbf\x8aC\x98&\xaf\x13iz\x9dp\x13\x02>&2022-02-19 04:04:30\x08MSFT\xc3\xf5\x11C\xac\x04\xe8\xf5x\r\xbf\x8aC\x98&\xaf\x13iz\x9dp\x13', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)


In [None]:
# Stop the streaming query held in debug2.
debug2.stop()

In [None]:
# If this cell ran before, stop its query so it can be started again below.
if 'debug3' in locals():
    debug3.stop()

from pyspark.sql.avro.functions import from_avro, to_avro

# Avro schema used to decode the Kafka value column.
stock_schema = """{
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}"""

# NOTE(review): the sample ConsumerRecord payloads in this notebook start with
# b'Obj\x01' and embed "avro.schema", which looks like Avro container-file
# framing rather than a bare Avro record. Confirm that from_avro can decode
# what the producer sends; in PERMISSIVE mode a failed decode may just yield nulls.
df3 = df.select(
    "timestamp",
    "key",
    from_avro(df["value"], stock_schema, options={"mode": "PERMISSIVE"}).alias("value"),
)
print('df3', df3)

debug3 = write_memory(df3, 'debug3')

In [None]:
%%sparksql
select * from debug3 order by timestamp desc

In [None]:
# Stop the streaming query held in debug3.
debug3.stop()

## Let's read some data from the JSON stream and fix it up to make it more usable.

In [None]:
from pyspark.sql.functions import *
import uuid

# Point the Kafka settings back at the JSON topic.
brokers = 'localhost:9092'
kafka_topic = 'stocks-json'
receiver_sleep_time = 4

# Read the Avro schema text; the context manager guarantees the file handle
# is closed (the bare open(...).read() left it open).
with open("stock.avsc", "r") as schema_file:
    stock_schema = schema_file.read()
print('stock_schema', stock_schema)

# Take the Spark schema from an Avro reader configured with that Avro schema;
# it is reused later in this cell to parse the JSON payloads.
stock_struct = spark.read.format("avro").option("avroSchema", stock_schema).load().schema
print('stock_struct', stock_struct)

# Streaming Kafka reader starting from the earliest available offset.
df = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", brokers)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
#    .option("kafka.group.id", "stock-json-spark-group")
    .load()
    )
print('df', df)


def convert_uuid(value):
    """Convert a 16-byte binary value into its canonical UUID string.

    Args:
        value: Bytes-like object (a bytearray when it comes from Spark), or
            None when there is no value to convert.

    Returns:
        The hyphenated UUID string, or None when ``value`` is None.

    Raises:
        ValueError: If ``value`` is not exactly 16 bytes long.
    """
    if value is None:
        # bytes(None) would raise TypeError; pass a missing value through instead.
        return None
    return str(uuid.UUID(bytes=bytes(value)))

# Import StringType explicitly: this cell otherwise only has it if a star
# import happens to re-export it.
from pyspark.sql.types import StringType

# Wrap convert_uuid as a Spark UDF that returns a string column.
convert_uuid_udf = udf(convert_uuid, StringType())

# keep the timestamp, decode the key with the UDF, and cast the value from bytes to string
# (variant that keeps the key as-is:)
#df1 = df.select(col("key"), "timestamp", expr("CAST(value AS STRING) as value"))
df1 = df.select(convert_uuid_udf(col("key")).alias("key"), "timestamp", expr("CAST(value AS STRING) as value"))
print('df1', df1)

# cast the string json to a struct
# keep all the columns we selected, convert the JSON string into a struct object and remove the string version
df2 = df1.select(*df1.columns, from_json(df1.value, stock_struct).alias("value2")).drop('value')
print('df2', df2)

# flatten the struct to a normal DataFrame
df4 = df2.select(*(df2.columns), col("value2.*")).drop('value2')
print('df4', df4)

# If this cell ran before, stop its query so it can be started again below.
if 'debug4' in locals():
    debug4.stop()

debug4 = write_memory(df4, 'debug4')


In [None]:
%%sparksql 
select * from debug4 order by event_time desc 

In [None]:
# Restart the memory stream: stop the running debug4 query, then start a new one over df4.
debug4.stop()
debug4 = write_memory(df4, 'debug4')


## Now that we have a normal DataFrame, let's manipulate it how we want and write the results out to another stream.

In [None]:
# Build the aggregation step by step: 10-second windows per symbol with a
# 10-second watermark on the timestamp column.
fixed_window = df4.select("timestamp", "symbol", "quantity")
fixed_window = fixed_window.withWatermark("timestamp", "10 seconds")
fixed_window = fixed_window.groupBy(window("timestamp", "10 seconds").alias("window"), "symbol")
# `sum` here comes from the star import of pyspark.sql.functions, not the Python builtin.
fixed_window = fixed_window.agg(sum("quantity").alias("sum"))
print(fixed_window)

# Stream the windowed totals into the in-memory table 'debug5'.
debug5 = write_memory(fixed_window, 'debug5')


## We can see that we get an aggregate per symbol every ten seconds. This data can be written out to a SQL or NoSQL database, or forwarded as a new message to create a streaming aggregation.

In [None]:
%%sparksql
select * from debug5 order by window desc, symbol limit 9

In [None]:
# Stop the streaming query held in debug5.
debug5.stop()

## Sliding windows are similar, except that you pass two durations: the first is the total length of the window and the second is the slide (refresh) interval. In this case the windows overlap.

In [None]:
# Build the aggregation step by step: 30-second windows that slide every
# 10 seconds, per symbol, with a 10-second watermark on the timestamp column.
sliding_window = df4.select("timestamp", "symbol", "quantity")
sliding_window = sliding_window.withWatermark("timestamp", "10 seconds")
sliding_window = sliding_window.groupBy(
    window("timestamp", "30 seconds", "10 seconds").alias("window"),
    "symbol",
)
# `sum` here comes from the star import of pyspark.sql.functions, not the Python builtin.
sliding_window = sliding_window.agg(sum("quantity").alias("sum"))
print(sliding_window)

# Stream the sliding-window totals into the in-memory table 'debug6'.
debug6 = write_memory(sliding_window, 'debug6')


In [None]:
%%sparksql
select * from debug6 order by window desc, symbol limit 21


In [None]:
# Stop the streaming query held in debug6.
debug6.stop()

## Session Window is similar but used to group data that represents a continuous stream of activity. The time specifies a timeout period or period of inactivity that indicates when a session should end.

In [None]:
# If this cell ran before, stop its query so it can be started again below
# (same guard as the debug3/debug4 cells).
if 'debug7' in locals():
    debug7.stop()

# The result is deliberately NOT bound to the name `session_window`: doing so
# would shadow the session_window() function used in groupBy below, and the
# cell would then fail on a second run because a DataFrame is not callable.
session_agg = (df4.select("timestamp", "symbol", "quantity")
        .withWatermark("timestamp", "10 seconds")
        .groupBy(session_window("timestamp", "5 minutes").alias("window"), "symbol")
        .agg(sum("quantity").alias("sum"))
        )
print(session_agg)

# Stream the per-session totals into the in-memory table 'debug7'.
debug7 = write_memory(session_agg, 'debug7')


## Let's join the streaming aggregation with a static reference table.

In [None]:
# Static reference data: (symbol, company name) pairs.
x = sc.parallelize([
    ('AAPL', 'Apple'),
    ('MSFT', 'Microsoft'),
    ('GOOG', 'Google'),
])
stocks = spark.createDataFrame(x, schema='symbol:string, name:string')

# Register both the static table and the streaming aggregate as temp views
# so that they can be joined in SQL.
stocks.createOrReplaceTempView('stocks')
fixed_window.createOrReplaceTempView('trades')

joined_aggregate = spark.sql("""
SELECT t.*, s.name
FROM trades as t
JOIN stocks as s on t.symbol = s.symbol
""")

# Stream the joined result into the in-memory table 'debug8'.
debug8 = write_memory(joined_aggregate, 'debug8')



In [None]:
%%sparksql
select * from debug8 order by window desc, symbol limit 9


In [None]:
# Stop the streaming query held in debug8.
debug8.stop()