## Initialize PySpark.

In [1]:
import os, sys, json, io
from pyspark.sql import *
from pyspark.sql.utils import StreamingQueryException
import sys
import json

# Kafka variables
brokers = 'localhost:9092'
kafka_topic = 'stocks-json'
receiver_sleep_time = 4

# Point both the PySpark workers and the driver at the same Python 3 binary.
for interpreter_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[interpreter_var] = '/usr/bin/python3'

# Make modules under /class importable (presumably where `initspark` lives).
sys.path.append('/class')

# Connect to Spark only when `sc` is not already defined, so re-running this
# cell does not initialize a second time.
if 'sc' not in locals():
    from initspark import initspark
    sc, spark, config = initspark(packages=['kafka', 'kafka-sql', 'spark-avro'])



initializing pyspark
packages ['kafka', 'kafka-sql', 'spark-avro']
--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1,org.apache.spark:spark-avro_2.12:3.2.1 pyspark-shell
pyspark initialized


## Basic batch source example

In [None]:
! hadoop fs -rm -r /territories

In [None]:
# Batch read: load the CSV from the local filesystem, taking column names from
# its first row.
territories = spark.read.csv('file:///class/2-apache-spark/territories.csv', header=True)
print(territories)
territories.show()
# Overwrite any earlier copy so this cell can be re-run on its own instead of
# depending on the HDFS directory having been deleted first.
territories.write.csv('hdfs://localhost:9000/territories', mode='overwrite')
                      
# territories.where('RegionID = 1').show()
# territories.groupby('RegionID').count().show()
# territories.show()
# territories.createOrReplaceTempView('territories')
# spark.sql('SELECT regionid, count(*) as cnt from territories group by regionid order by cnt desc').show()


## Create a helper function to stream to a memory table.

In [2]:
def write_memory(df, queryname = 'debug', mode = "append"):
    """Start streaming `df` into an in-memory sink and return the started query.

    Parameters:
        df: object exposing `writeStream` (a streaming DataFrame in this notebook).
        queryname: name passed to `queryName`; later cells query the results
            under this name with `spark.sql`.
        mode: value passed to `outputMode` (the notebook lists complete,
            update and append).

    Returns:
        Whatever `start()` returns; callers in this notebook call `.stop()` on it.

    An existing temp view with the same name is not dropped here.
    """
    writer = df.writeStream
    writer = writer.format("memory")
    writer = writer.queryName(queryname)
    writer = writer.outputMode(mode)
    return writer.start()


## Define a streaming source and create a temp view to receive the results for debugging.

In [None]:
# Kafka source settings: read the topic from its earliest offset and do not
# fail the query when data is reported lost.
kafka_source_options = {
    "kafka.bootstrap.servers": brokers,
    "subscribe": kafka_topic,
    "startingOffsets": "earliest",
    "failOnDataLoss": False,
}
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

# df.createOrReplaceTempView('table')
# df1 = spark.sql("""SELECT 'new data' as newfield, * from table""")

# Decode the raw Kafka value as text and upper-case it.
df1 = df.selectExpr("UPPER(CAST(value AS STRING)) as value")

# Stop a previous run of this query before starting it again under the same name.
if 'debug1' in locals():
    debug1.stop()

debug1 = write_memory(df1, 'debug1')

## Query from the memory stream like it's a temporary view using `spark.sql`

In [None]:
spark.sql("select * from debug1").take(10)

## You can stop and restart a memory stream whenever you like.

In [None]:
debug1.stop()

In [None]:
debug1 = write_memory(df1, 'debug1')

In [None]:
spark.sql("select * from debug1").take(10)

## Spark SQL magic is also quite helpful.

In [3]:
%load_ext sparksql_magic
# pip install sparksql-magic

In [None]:
%%sparksql
select * from debug1 order by value limit 10

## Stop a memory stream when you don't need it, as it can consume a lot of memory.

In [None]:
debug1.stop()

## Let's try reading AVRO.

In [4]:
# Read the Avro schema text; the context manager closes the file handle
# instead of leaving it open for the garbage collector.
with open("stock.avsc", "r") as schema_file:
    stock_schema = schema_file.read()
print('stock_schema', stock_schema)
# The avro reader is used here only for its `.schema`, which gives the Spark
# struct corresponding to the Avro schema text.
stock_struct = spark.read.format("avro").option("avroSchema", stock_schema).load().schema
print('stock_struct', stock_struct)


stock_schema {
    "namespace": "stock.avro",
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}
stock_struct StructType(List(StructField(event_time,StringType,true),StructField(symbol,StringType,true),StructField(price,FloatType,true),StructField(quantity,IntegerType,true)))


In [5]:
# Kafka settings for the Avro-encoded stock topic.
brokers = 'localhost:9092'
kafka_topic = 'stocks-avro'
receiver_sleep_time = 4

# Same source options as the JSON stream, pointed at the Avro topic.
avro_source_options = {
    "kafka.bootstrap.servers": brokers,
    "subscribe": kafka_topic,
    "startingOffsets": "earliest",
    "failOnDataLoss": False,
}
df = spark.readStream.format("kafka").options(**avro_source_options).load()
print('df', df)

df DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]


In [6]:
if 'debug2' in locals():
    debug2.stop()
debug2 = write_memory(df, 'debug2')

In [7]:
%%sparksql
select timestamp, key, value from debug2 order by timestamp desc limit 10

0,1,2
timestamp,key,value


In [8]:
debug2.stop()

In [None]:
from pyspark.sql.functions import *
import uuid

def convert_uuid(value):
    """Render a 16-byte UUID as its canonical string form.

    Parameters:
        value: bytes-like object (a bytearray when called by Spark), or None
            when the record has no key.

    Returns:
        The UUID string, or None when `value` is None.

    Raises:
        ValueError: if `value` is not exactly 16 bytes long.
    """
    # A missing key would make bytes(None) raise TypeError and fail the
    # whole UDF call, so pass null through instead.
    if value is None:
        return None
    return str(uuid.UUID(bytes = bytes(value)))

# StringType is imported explicitly so this cell does not depend on a wildcard
# import happening to bring it into scope.
from pyspark.sql.types import StringType
from pyspark.sql.avro.functions import from_avro, to_avro

# Wrap the Python helper as a UDF that returns a string column.
convert_uuid_udf = udf(convert_uuid, StringType())

# Avro schema used to decode the message value.
stock_schema = """{
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}"""

#df3 = df.select("timestamp", "key", "value")

# NOTE(review): the sample 'stocks-avro' records captured at the end of this
# notebook start with the Avro container-file header (b'Obj\x01' plus an
# embedded schema). from_avro may not decode that framing and could yield
# nulls under PERMISSIVE mode -- confirm how the producer encodes values.
decoded = df.select("timestamp"
                , convert_uuid_udf(col("key")).alias("key")
                , from_avro(df.value, stock_schema, options = {"mode":"PERMISSIVE"}).alias("value"))

# Flatten the decoded struct into top-level columns and drop the struct itself.
df3 = decoded.select(*(decoded.columns), col("value.*")).drop('value')
#df3 = df3.where("symbol = 'GOOG'")
df3.createOrReplaceTempView('stocks')
#df3 = spark.sql('SELECT symbol, count(*) as cnt, sum(quantity) as qty from stocks group by symbol')

print('df3', df3)
# Stop a previous run of this query before starting it again under the same name.
if 'debug3' in locals():
    debug3.stop()
debug3 = write_memory(df3, 'debug3')

In [None]:
%%sparksql
select * from debug3 order by timestamp desc

In [None]:
debug3.stop()

## Here's the same thing for the JSON stream.

In [9]:
from pyspark.sql.functions import *
import uuid

# Kafka settings for the JSON-encoded stock topic.
brokers = 'localhost:9092'
kafka_topic = 'stocks-json'
receiver_sleep_time = 4

# Read the Avro schema text; the context manager closes the file handle
# instead of leaving it open for the garbage collector.
with open("stock.avsc", "r") as schema_file:
    stock_schema = schema_file.read()
print('stock_schema', stock_schema)

# The avro reader is used here only for its `.schema`; the resulting struct is
# what from_json uses further down to parse the JSON payload.
stock_struct = spark.read.format("avro").option("avroSchema", stock_schema).load().schema
print('stock_struct', stock_struct)

df = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", brokers)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
#    .option("kafka.group.id", "stock-json-spark-group")
    .load()
    )
print('df', df)


def convert_uuid(value):
    """Render a 16-byte UUID as its canonical string form.

    Parameters:
        value: bytes-like object (a bytearray when called by Spark), or None
            when the record has no key.

    Returns:
        The UUID string, or None when `value` is None.

    Raises:
        ValueError: if `value` is not exactly 16 bytes long.
    """
    # A missing key would make bytes(None) raise TypeError and fail the
    # whole UDF call, so pass null through instead.
    if value is None:
        return None
    return str(uuid.UUID(bytes = bytes(value)))

# StringType is imported explicitly so this cell does not depend on a wildcard
# import happening to bring it into scope.
from pyspark.sql.types import StringType

# Wrap the Python helper as a UDF that returns a string column.
convert_uuid_udf = udf(convert_uuid, StringType())

# keep the key and timestamp and convert the value from bytes to string
#df1 = df.select(col("key"), "timestamp", expr("CAST(value AS STRING) as value"))
df1 = df.select(convert_uuid_udf(col("key")).alias("key"), "timestamp", expr("CAST(value AS STRING) as value"))
print('df1', df1)

# cast the string json to a struct
# keep all the columns we selected, convert the JSON string into a struct object and remove the string version
df2 = df1.select(*df1.columns, from_json(df1.value, stock_struct).alias("value2")).drop('value')
print('df2', df2)

# flatten the struct to a normal DataFrame
df4 = df2.select(*(df2.columns), col("value2.*")).drop('value2')
print('df4', df4)

# Stop a previous run of this query before starting it again under the same name.
if 'debug4' in locals():
    debug4.stop()

debug4 = write_memory(df4, 'debug4')


stock_schema {
    "namespace": "stock.avro",
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}
stock_struct StructType(List(StructField(event_time,StringType,true),StructField(symbol,StringType,true),StructField(price,FloatType,true),StructField(quantity,IntegerType,true)))
df DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]
df1 DataFrame[key: string, timestamp: timestamp, value: string]
df2 DataFrame[key: string, timestamp: timestamp, value2: struct<event_time:string,symbol:string,price:float,quantity:int>]
df4 DataFrame[key: string, timestamp: timestamp, event_time: string, symbol: string, price: float, quantity: int]


In [10]:
%%sparksql 
select * from debug4 order by event_time desc 

only showing top 20 row(s)


0,1,2,3,4,5
key,timestamp,event_time,symbol,price,quantity
ffd48443-b110-4207-a1f1-d178dc213f4b,2022-03-07 20:35:44.543000,2022-03-07 20:35:44,AAPL,214.05999755859375,192
c1d3c92e-8226-47b8-8cad-d229c20bfe52,2022-03-07 20:35:44.542000,2022-03-07 20:35:44,GOOG,121.73999786376953,381
972c287e-2c9c-432f-a5ca-4e43ccb98e8f,2022-03-07 20:35:44.536000,2022-03-07 20:35:44,MSFT,148.6699981689453,988
5188cf74-f083-45b0-b629-b2a87253eb89,2022-03-07 20:35:40.537000,2022-03-07 20:35:40,AAPL,181.80999755859375,289
df085c5c-5de3-45d5-bb62-90f99b10de98,2022-03-07 20:35:40.537000,2022-03-07 20:35:40,GOOG,100.87999725341797,990
cf979a9b-7384-43ac-a22f-88041f069a44,2022-03-07 20:35:40.531000,2022-03-07 20:35:40,MSFT,256.04998779296875,883
e2485552-66c9-4a26-b9b9-d083fa523f31,2022-03-07 20:35:36.533000,2022-03-07 20:35:36,AAPL,194.4600067138672,646
f4404ef3-5823-45aa-84a4-4167a2dd2763,2022-03-07 20:35:36.533000,2022-03-07 20:35:36,GOOG,224.5500030517578,774
b4161ac5-4cd0-4eb0-abc6-e98b7145a719,2022-03-07 20:35:36.526000,2022-03-07 20:35:36,MSFT,172.9499969482422,412


In [11]:
debug4.stop()


## Now that we have a normal DataFrame, let's manipulate it how we want and write the results out to another stream.

In [12]:
# Keep only the columns the aggregation needs and set a 10-second watermark
# on the Kafka timestamp.
trades_watermarked = (
    df4.select("timestamp", "symbol", "quantity")
       .withWatermark("timestamp", "10 seconds")
)

# Non-overlapping 10-second buckets; one output row per (bucket, symbol).
# `sum` is applied to a column name here, i.e. the Spark SQL function from the
# wildcard import rather than the Python builtin.
ten_second_bucket = window("timestamp", "10 seconds").alias("window")
fixed_window = (
    trades_watermarked
    .groupBy(ten_second_bucket, "symbol")
    .agg(sum("quantity").alias("sum"))
)
print(fixed_window)

# Stop a previous run of this query before starting it again under the same name.
if 'debug5' in locals():
    debug5.stop()
debug5 = write_memory(fixed_window, 'debug5')


DataFrame[window: struct<start:timestamp,end:timestamp>, symbol: string, sum: bigint]


## We can see that we get an aggregate per symbol every ten seconds. This data can be written out to a SQL or NoSQL database, or forwarded as a new message to create a streaming aggregation.

In [14]:
%%sparksql
select * from debug5 order by window desc, symbol limit 9

0,1,2
window,symbol,sum
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 30), end=datetime.datetime(2022, 3, 7, 20, 35, 40))",AAPL,1608
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 30), end=datetime.datetime(2022, 3, 7, 20, 35, 40))",GOOG,1262
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 30), end=datetime.datetime(2022, 3, 7, 20, 35, 40))",MSFT,638
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 20), end=datetime.datetime(2022, 3, 7, 20, 35, 30))",AAPL,1510
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 20), end=datetime.datetime(2022, 3, 7, 20, 35, 30))",GOOG,1153
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 20), end=datetime.datetime(2022, 3, 7, 20, 35, 30))",MSFT,2096
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 10), end=datetime.datetime(2022, 3, 7, 20, 35, 20))",AAPL,1434
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 10), end=datetime.datetime(2022, 3, 7, 20, 35, 20))",GOOG,1785
"Row(start=datetime.datetime(2022, 3, 7, 20, 35, 10), end=datetime.datetime(2022, 3, 7, 20, 35, 20))",MSFT,1032


In [15]:
debug5.stop()

## Sliding windows are similar, except that you pass two durations: the first is the total length of the window and the second is the slide (refresh) interval. Because the slide is shorter than the window here, the windows overlap.

In [16]:
# Overlapping windows: each one spans 30 seconds and a new one starts every
# 10 seconds, so a single trade contributes to several windows.
sliding_window = (df4.select("timestamp", "symbol","quantity")
        .withWatermark("timestamp", "10 seconds")
        .groupBy(window("timestamp", "30 seconds", "10 seconds").alias("window"), "symbol")
        .agg(sum("quantity").alias("sum"))
        )
print(sliding_window)

# Stop a previous run first, as the other debug cells do, so this cell can be
# re-run while an earlier 'debug6' query is still active.
if 'debug6' in locals():
    debug6.stop()
debug6 = write_memory(sliding_window, 'debug6')


DataFrame[window: struct<start:timestamp,end:timestamp>, symbol: string, sum: bigint]


In [22]:
%%sparksql
select * from debug6 order by window desc, symbol limit 21


only showing top 20 row(s)


0,1,2
window,symbol,sum
"Row(start=datetime.datetime(2022, 3, 7, 20, 36, 20), end=datetime.datetime(2022, 3, 7, 20, 36, 50))",AAPL,4353
"Row(start=datetime.datetime(2022, 3, 7, 20, 36, 20), end=datetime.datetime(2022, 3, 7, 20, 36, 50))",GOOG,3177
"Row(start=datetime.datetime(2022, 3, 7, 20, 36, 20), end=datetime.datetime(2022, 3, 7, 20, 36, 50))",MSFT,3869
"Row(start=datetime.datetime(2022, 3, 7, 20, 36, 10), end=datetime.datetime(2022, 3, 7, 20, 36, 40))",AAPL,3463
"Row(start=datetime.datetime(2022, 3, 7, 20, 36, 10), end=datetime.datetime(2022, 3, 7, 20, 36, 40))",GOOG,2647
"Row(start=datetime.datetime(2022, 3, 7, 20, 36, 10), end=datetime.datetime(2022, 3, 7, 20, 36, 40))",MSFT,3885
"Row(start=datetime.datetime(2022, 3, 7, 20, 36), end=datetime.datetime(2022, 3, 7, 20, 36, 30))",AAPL,3539
"Row(start=datetime.datetime(2022, 3, 7, 20, 36), end=datetime.datetime(2022, 3, 7, 20, 36, 30))",GOOG,4049
"Row(start=datetime.datetime(2022, 3, 7, 20, 36), end=datetime.datetime(2022, 3, 7, 20, 36, 30))",MSFT,4730


In [23]:
debug6.stop()

## A session window is similar, but it groups events that belong to one continuous burst of activity. The duration is the gap timeout: a session ends once that period of inactivity passes with no new events.

In [None]:
# Import the function under an alias. This cell binds the name
# `session_window` to the result DataFrame; without the alias that assignment
# hides the function, and a second run of the cell would try to call a
# DataFrame.
from pyspark.sql.functions import session_window as session_window_fn

# Group each symbol's trades into sessions with a gap duration of 5 minutes.
session_window = (df4.select("timestamp", "symbol","quantity")
        .withWatermark("timestamp", "10 seconds")
        .groupBy(session_window_fn("timestamp", "5 minutes").alias("window"), "symbol")
        .agg(sum("quantity").alias("sum"))
        )
print(session_window)

# Stop a previous run first, as the other debug cells do, so this cell can be
# re-run while an earlier 'debug7' query is still active.
if 'debug7' in locals():
    debug7.stop()
debug7 = write_memory(session_window, 'debug7')


## Let's join the streaming aggregation with a static reference table.

In [24]:
# Static reference table mapping ticker symbol to company name.
x = sc.parallelize([('AAPL', 'Apple'), ('MSFT', 'Microsoft'), ('GOOG','Google')])
stocks = spark.createDataFrame(x, 'symbol:string, name:string')
# Register both sides as temp views so the join can be written in SQL.
# 'stocks' replaces any existing temp view of the same name.
stocks.createOrReplaceTempView('stocks')
fixed_window.createOrReplaceTempView('trades')

# Join the streaming aggregate with the static table to add the company name.
joined_aggregate = spark.sql("""
SELECT t.*, s.name
FROM trades as t
JOIN stocks as s on t.symbol = s.symbol
""")

# Stop a previous run first, as the other debug cells do, so this cell can be
# re-run while an earlier 'debug8' query is still active.
if 'debug8' in locals():
    debug8.stop()
debug8 = write_memory(joined_aggregate, 'debug8')



In [27]:
%%sparksql
select * from debug8 order by window desc, symbol limit 9


0,1,2,3
window,symbol,sum,name
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 30), end=datetime.datetime(2022, 3, 7, 20, 37, 40))",AAPL,1345,Apple
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 30), end=datetime.datetime(2022, 3, 7, 20, 37, 40))",GOOG,265,Google
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 30), end=datetime.datetime(2022, 3, 7, 20, 37, 40))",MSFT,1307,Microsoft
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 20), end=datetime.datetime(2022, 3, 7, 20, 37, 30))",AAPL,1108,Apple
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 20), end=datetime.datetime(2022, 3, 7, 20, 37, 30))",GOOG,971,Google
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 20), end=datetime.datetime(2022, 3, 7, 20, 37, 30))",MSFT,1448,Microsoft
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 10), end=datetime.datetime(2022, 3, 7, 20, 37, 20))",AAPL,1312,Apple
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 10), end=datetime.datetime(2022, 3, 7, 20, 37, 20))",GOOG,511,Google
"Row(start=datetime.datetime(2022, 3, 7, 20, 37, 10), end=datetime.datetime(2022, 3, 7, 20, 37, 20))",MSFT,1078,Microsoft


In [28]:
debug8.stop()

ConsumerRecord(topic='stocks-avro', partition=0, offset=40679, timestamp=1645243362535, timestamp_type=0, key=b'\xd6\x0cgMs<By\xb8\xcaR\x02\xe0\xfa\x93\x14', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J\x02>&2022-02-19 04:02:42\x08MSFT\xa4p\xdfB\xf0\t$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)
ConsumerRecord(topic='stocks-avro', partition=0, offset=40679, timestamp=1645243362535, timestamp_type=0, key=b'\xd6\x0cgMs<By\xb8\xcaR\x02\xe0\xfa\x93\x14', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J\x02>&2022-02-19 04:02:42\x08MSFT\xa4p\xdfB\xf0\t$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)

ConsumerRecord(topic='stocks-avro', partition=0, offset=40814, timestamp=1645243470645, timestamp_type=0, key=b'\xc8\xeb\xc2\xe9O\xaaJ\x86\x83\x85\xb9\xd7\xf46\xea\x8f', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00\xe8\xf5x\r\xbf\x8aC\x98&\xaf\x13iz\x9dp\x13\x02>&2022-02-19 04:04:30\x08MSFT\xc3\xf5\x11C\xac\x04\xe8\xf5x\r\xbf\x8aC\x98&\xaf\x13iz\x9dp\x13', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)
