## Initialize PySpark.

In [1]:
import os, sys, json, io
from pyspark.sql import *
from pyspark.sql.utils import StreamingQueryException
import sys
import json

# Point PySpark (workers and driver) at the same Python 3 interpreter.
os.environ.update({
    'PYSPARK_PYTHON': '/usr/bin/python3',
    'PYSPARK_DRIVER_PYTHON': '/usr/bin/python3',
})
# Make modules under /class importable (presumably where initspark lives - confirm).
sys.path.append('/class')

# Kafka settings shared by the streaming cells further down.
brokers, kafka_topic = 'localhost:9092', 'stocks-json'
receiver_sleep_time = 4

# Start Spark only once per kernel: re-running this cell keeps the existing
# sc / spark / config instead of creating them again.
# NOTE(review): the recorded output shows the 'kafka' package resolving to a
# Scala 2.11 / Spark 2.0.2 artifact while the other two are 2.12 / 3.2.1 - confirm it is needed.
if 'sc' not in locals():
    from initspark import initspark
    sc, spark, config = initspark(packages=['kafka', 'kafka-sql', 'spark-avro'])



initializing pyspark
packages ['kafka', 'kafka-sql', 'spark-avro']
--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1,org.apache.spark:spark-avro_2.12:3.2.1 pyspark-shell
pyspark initialized


## Basic batch source example

In [2]:
! hadoop fs -rm -r /territories

Deleted /territories


In [3]:
help(spark.read.csv)

Help on method csv in module pyspark.sql.readwriter:

csv(path, schema=None, sep=None, encoding=None, quote=None, escape=None, comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, samplingRatio=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None, unescapedQuoteHandling=None) method of pyspark.sql.readwriter.DataFrameReader instance
    Loads a CSV file and returns the result as a  :class:`DataFrame`.
    
    This function will go through the input once to determine the input schema if
    ``inferSchema`` is enabled. To avoid going through the entire data once, di

In [4]:
# Batch-read the sample CSV from the local filesystem, treating the first row
# as column names and letting Spark infer the column types.
territories = spark.read.csv(
    'file:///class/2-apache-spark/territories.csv',
    header=True,
    inferSchema=True,
)
#print(territories)
#territories.show()
#territories.write.csv('hdfs://localhost:9000/territories', sep = '|')
#territories.write.json('hdfs://localhost:9000/territories_json')
#territories.write.parquet('hdfs://localhost:9000/territories_parquet')
                      
# territories.where('RegionID = 1').show()
# territories.groupby('RegionID').count().show()
# t2 = territories.where("TerritoryName like '%a%'")
# t3 = t2.groupby('RegionID').count()
# t4 = t3.filter('count > 5')

# (spark.read.csv('file:///class/2-apache-spark/territories.csv'
#                , header=True, inferSchema = True)
#       .where("TerritoryName like '%a%'")
#       .groupby('RegionID').count()
#       .filter('count > 5')
#       .show()
# )

#t4.show()
# territories.show()
# Register the DataFrame as a SQL view, then count per region the territories
# whose name contains an "a", largest count first.
territories.createOrReplaceTempView('territories')
region_counts_sql = """SELECT regionid, count(*) as cnt 
          from territories 
          where territoryname like '%a%' 
          group by regionid 
          order by cnt desc"""
spark.sql(region_counts_sql).show()


+--------+---+
|regionid|cnt|
+--------+---+
|       2| 11|
|       4|  6|
|       1|  5|
|       3|  5|
+--------+---+



## LAB: ## 
### The folder /class/datasets/northwind/ contains sample data in a variety of formats. CSV contains comma-separated data without headers; CSVHeaders is the same data with headers. The JSON, AVRO, ORC, and PARQUET folders have the data in those formats. 
1. Read the CSVHeaders version of Categories into a DataFrame variable called categories. Print and show it to see what the data looks like.
2. Read the JSON version of Products into a DataFrame variable called products. Print and show it to see what the data looks like.
3. Using spark sql, turn each DataFrame variable into a temporary view and write a SQL statement to join the two into a new DataFrame variable that shows the ProductID, ProductName, CategoryID and CategoryName.
4. Using dot syntax take the joined DataFrame and count how many items are in each category.
5. Write the results to HDFS in a folder called /category_count
<p></p>

<details><summary>Click for <b>hint</b></summary>
<p>1. Use spark.read.csv and tell it that the file has headers and to infer the schema. Use the file:/// prefix to point to the files.</p>
    <p>2. Use spark.read.json</p>
    <p>3. Turn both DataFrames into a temporary view and write a standard SQL JOIN</p>
    <p>4. Use .groupby and .count</p>
    <p>5. Take the DataFrame and call .write.csv and save the results using hdfs:// prefix</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
categories = spark.read.csv('file:///class/datasets/northwind/CSVHeaders/categories', header = True, inferSchema = True)
products = spark.read.json('file:///class/datasets/northwind/JSON/products')
categories.createOrReplaceTempView('categories')
products.createOrReplaceTempView('products')
prod_cat = spark.sql("""SELECT c.CategoryID, c.CategoryName, p.ProductID, p.ProductName
FROM categories AS c
JOIN Products AS p ON c.CategoryID = p.CategoryID
""")
category_count = prod_cat.groupby('CategoryName').count()
category_count.show()
category_count.write.csv('hdfs://localhost:9000/category_count')
```
</p>
</details>

## Create a helper function to stream to a memory table.

In [5]:
def write_memory(df, queryname = 'debug', mode = "append"):
    """Start streaming ``df`` into an in-memory table and return the running query.

    The sink is registered under ``queryname``, which is the name later cells
    use to read the results with ``spark.sql``.

    Args:
        df: a streaming DataFrame (anything exposing ``writeStream``).
        queryname: name assigned to the query via ``queryName``.
        mode: value passed to ``outputMode``: "complete", "update" or "append".

    Returns:
        The object returned by ``start()``; callers stop it with ``.stop()``.
    """
    memory_sink = df.writeStream.format("memory")
    named_sink = memory_sink.queryName(queryname)
    return named_sink.outputMode(mode).start()


## Define a streaming source and create a temp view to receive the results for debugging.

In [6]:
# Configure the Kafka source first, then materialise the streaming DataFrame.
stream_reader = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", brokers)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
)
df = stream_reader.load()

# Alternative using SQL on a temp view instead of selectExpr:
# df.createOrReplaceTempView('table')
# df1 = spark.sql("""SELECT 'new data' as newfield, * from table""")

# Cast the message body to a string and upper-case it.
df1 = df.selectExpr("UPPER(CAST(value AS STRING)) as value")

# When the cell is re-run, stop the query started by the previous run first.
if 'debug1' in locals():
    debug1.stop()
debug1 = write_memory(df1, 'debug1')

## Query from the memory stream like it's a temporary view using `spark.sql`

In [7]:
spark.sql("select * from debug1").take(10)

[]

## You can stop and restart a memory stream whenever you like.

In [8]:
debug1.stop()

In [9]:
debug1 = write_memory(df1, 'debug1')

In [10]:
spark.sql("select * from debug1").take(10)

[]

## Spark SQL magic is also quite helpful.

In [11]:
%load_ext sparksql_magic
# pip install sparksql-magic

In [12]:
%%sparksql
select * from debug1 order by value limit 10

0
value


## Stop a memory stream when you don't need it, as it can consume a lot of memory.

In [13]:
debug1.stop()

## Let's try reading AVRO. Because we are using schemaless AVRO messages, we first need to read a schema from a file or repository and apply it to the message body to parse it into a structured format. This trick reads an AVRO schema file as a JSON string, which can then be converted into a Spark struct object suitable for use in the deserializing process.

In [14]:
# Read the Avro schema text; the context manager closes the file handle,
# which the bare open(...).read() form left open.
with open("stock.avsc", "r") as schema_file:
    stock_schema = schema_file.read()
print('stock_schema', stock_schema)

# load() is called without a path: only the resulting .schema is used, which
# is the Spark struct equivalent of the Avro schema text.
stock_struct = spark.read.format("avro").option("avroSchema", stock_schema).load().schema
print('stock_struct', stock_struct)


stock_schema {
    "namespace": "stock.avro",
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}
stock_struct StructType(List(StructField(event_time,StringType,true),StructField(symbol,StringType,true),StructField(price,FloatType,true),StructField(quantity,IntegerType,true)))


## LAB: ## 
### Using the stocks-json example do the following to start up and read an AVRO stream instead:
1. Open a new terminal window and cd /class/1-producers-and-consumers
2. Run the 3-python-kafka-avro-producer.py to start making messages
3. Open another terminal window and cd /class/1-producers-and-consumers
4. Run 4-python-kafka-avro-consumer.py to show the messages are being created and sent.
5. In the cells below, write Spark code based on the JSON example that can read the AVRO stream and simply display it. 

<p></p>

<details><summary>Click for <b>code</b></summary>
<p>

```python
brokers = 'localhost:9092'
kafka_topic = 'stocks-avro'
receiver_sleep_time = 4

df = (spark.readStream 
    .format("kafka") 
    .option("kafka.bootstrap.servers", brokers) 
    .option("subscribe", kafka_topic) 
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
    .load()
    )
print('df', df)

if 'debug2' in locals():
    debug2.stop()
debug2 = write_memory(df, 'debug2')

# In a new cell
%%sparksql
select timestamp, key, value from debug2 order by timestamp desc limit 10    
```
</details>

0,1,2
timestamp,key,value
2022-03-07 20:24:37.801000,"bytearray(b'qo3F,\xb5C\xc3\x84\x8e\x8ay\xf9\x05G\xbb')",bytearray(b'&2022-03-07 20:24:37\x08GOOG\xae\x87ZC\xf0\x04')
2022-03-07 20:24:37.798000,bytearray(b'&(\xcef\xdd\tF\x1b\xb4\xb1\x99qkV\xfb\xa7'),bytearray(b'&2022-03-07 20:24:37\x08MSFT\xae\x07\x17C\xc2\n')
2022-03-07 20:24:37.797000,bytearray(b'R?\xdb\xebp?E\xd7\x828\xb0h\x13\xbcK)'),bytearray(b'&2022-03-07 20:24:37\x08AAPL\x00@tC\x9e\x0b')
2022-03-07 20:24:33.797000,bytearray(b'\xa0\x07Z\x87M!K\xf3\x80\xdb\x18\xd6\xe0`\xb5\x0b'),bytearray(b'&2022-03-07 20:24:33\x08GOOG\x1fEeC\xca\x0f')
2022-03-07 20:24:33.794000,bytearray(b'\xa7$\x0f}\xe7\x83E\x08\x997\x9eT\xa9\xfb\x08\x91'),bytearray(b'&2022-03-07 20:24:33\x08AAPL=\x8a\xd1B\xa2\x08')
2022-03-07 20:24:33.793000,bytearray(b'n\x14\xf1Xf\xa8O\xcc\xa7\x89\xbe`\x03\xbd\x86\x04'),bytearray(b'&2022-03-07 20:24:33\x08MSFT\xf6\xa8fC\xca\r')
2022-03-07 20:24:29.793000,bytearray(b'N4\x1f\xb8\xf1\x8cKS\xb3\x03\xed\x1e\xb5C\x8c\x07'),bytearray(b'&2022-03-07 20:24:29\x08GOOG\xcd\x0c(C\x84\r')
2022-03-07 20:24:29.790000,bytearray(b'A\xd8o\xed\xf7eE\x9c\xb6\xebVw{\x9a\x04\xf0'),bytearray(b'&2022-03-07 20:24:29\x08MSFT\xcdL\x13C\xb6\x08')
2022-03-07 20:24:29.789000,bytearray(b'B\xe7\x9b\xa9\x1f2I\xf8\x99\x9e@\x12n\xf1\x06\xeb'),bytearray(b'&2022-03-07 20:24:29\x08AAPL\xcd\xcc\x87C\xe6\x06')


### Stop the memory stream object you created for the AVRO lab.

In [37]:
debug2.stop()

## The next steps are to parse the message body, keep whatever metadata from the message we're interested in, and turn that into a DataFrame object we can work with however we want. In this case it involves converting the key back to a UUID and the message body into a Python dictionary.

In [19]:
from pyspark.sql.functions import *
import uuid

def convert_uuid(value):
    """Convert a 16-byte message key into its canonical UUID string.

    Args:
        value: the raw key as ``bytes`` or ``bytearray`` (Spark hands the UDF a
            bytearray), or ``None`` for a message that has no key.

    Returns:
        The UUID as text, or ``None`` when ``value`` is ``None``.

    Raises:
        ValueError: if ``value`` is not exactly 16 bytes long.
    """
    if value is None:
        # A null key must not raise inside the UDF; pass it through as null.
        return None
    return str(uuid.UUID(bytes = bytes(value)))

convert_uuid_udf = udf(convert_uuid, StringType())

from pyspark.sql.avro.functions import from_avro, to_avro

# Could have read the schema from a file as shown earlier but just put it here so it's easier to see it.
stock_schema = """{
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}"""

# Read the Avro topic explicitly rather than relying on whatever `df` was last
# assigned: in the cells above `df` is the stocks-json stream unless the lab
# solution was run first, and from_avro cannot decode JSON payloads.
avro_df = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", brokers)
    .option("subscribe", 'stocks-avro')
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
    .load()
    )

# Select the three columns we're interested in
# df3 = avro_df.select("timestamp", "key", "value")

# Do some manipulating on the columns to make them into something meaningful:
# the binary key becomes a UUID string and the Avro body becomes a struct.
df3 = avro_df.select("timestamp"
                , convert_uuid_udf(col("key")).alias("key")
                , from_avro(avro_df.value, stock_schema, options = {"mode":"PERMISSIVE"}).alias("value"))

# We end up with three columns called timestamp, key and value, but value is a single column of the datatype
# struct, so this trick will flatten it out so we end up with six normal columns.
df3 = df3.select(*(df3.columns), col("value.*")).drop('value')

#df3 = df3.where("symbol = 'GOOG'")
df3.createOrReplaceTempView('stocks')

print('df3', df3)
# When the cell is re-run, stop the query started by the previous run first.
if 'debug3' in locals():
    debug3.stop()
debug3 = write_memory(df3, 'debug3')

df3 DataFrame[timestamp: timestamp, key: string, event_time: string, symbol: string, price: float, quantity: int]


In [20]:
%%sparksql
select * from debug3 order by timestamp desc

0,1,2,3,4,5
timestamp,key,event_time,symbol,price,quantity


In [21]:
debug3.stop()

## Here's the same thing for the JSON stream using the from_json function instead of from_avro.

In [46]:
from pyspark.sql.functions import *
import uuid

# Kafka settings for the JSON topic (restated so this cell does not depend on
# earlier cells that may have changed them).
brokers = 'localhost:9092'
kafka_topic = 'stocks-json'
receiver_sleep_time = 4

# Read the Avro schema text; the context manager closes the file handle,
# which the bare open(...).read() form left open.
with open("stock.avsc", "r") as schema_file:
    stock_schema = schema_file.read()
print('stock_schema', stock_schema)

# load() is called without a path: only the resulting .schema is used, which
# is the Spark struct handed to from_json further down.
stock_struct = spark.read.format("avro").option("avroSchema", stock_schema).load().schema
print('stock_struct', stock_struct)

df = (spark.readStream 
    .format("kafka") 
    .option("kafka.bootstrap.servers", brokers) 
    .option("subscribe", kafka_topic) 
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
#    .option("kafka.group.id", "stock-json-spark-group")
    .load()
    )
print('df', df)


def convert_uuid(value):
    """Convert a 16-byte message key into its canonical UUID string.

    Args:
        value: the raw key as ``bytes`` or ``bytearray`` (Spark hands the UDF a
            bytearray), or ``None`` for a message that has no key.

    Returns:
        The UUID as text, or ``None`` when ``value`` is ``None``.

    Raises:
        ValueError: if ``value`` is not exactly 16 bytes long.
    """
    if value is None:
        # A null key must not raise inside the UDF; pass it through as null.
        return None
    return str(uuid.UUID(bytes = bytes(value)))

# Wrap the plain Python function as a Spark UDF that returns a string column.
convert_uuid_udf = udf(convert_uuid, StringType())

# keep the key and timestamp and convert the value from bytes to string
#df1 = df.select(col("key"), "timestamp", expr("CAST(value AS STRING) as value"))
df1 = df.select(convert_uuid_udf(col("key")).alias("key"), "timestamp", expr("CAST(value AS STRING) as value"))
print('df1', df1)

# cast the string json to a struct
# keep all the columns we selected, convert the JSON string into a struct object (value2) and remove the string version
df2 = df1.select(*df1.columns, from_json(df1.value, stock_struct).alias("value2")).drop('value')
print('df2', df2)

# flatten the struct to a normal DataFrame: value2.* expands into one column per struct field
df4 = df2.select(*(df2.columns), col("value2.*")).drop('value2')
print('df4', df4)

# When the cell is re-run, stop the query started by the previous run first.
if 'debug4' in locals():
    debug4.stop()
    
debug4 = write_memory(df4, 'debug4')


stock_schema {
    "namespace": "stock.avro",
    "type": "record",
    "name": "Stock",
    "fields": [
        {"name": "event_time", "type": "string"},
        {"name": "symbol",  "type": "string"},
        {"name": "price", "type": "float"},
        {"name": "quantity", "type": "int"}
    ]
}
stock_struct StructType(List(StructField(event_time,StringType,true),StructField(symbol,StringType,true),StructField(price,FloatType,true),StructField(quantity,IntegerType,true)))
df DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]
df1 DataFrame[key: string, timestamp: timestamp, value: string]
df2 DataFrame[key: string, timestamp: timestamp, value2: struct<event_time:string,symbol:string,price:float,quantity:int>]
df4 DataFrame[key: string, timestamp: timestamp, event_time: string, symbol: string, price: float, quantity: int]


In [47]:
%%sparksql 
select * from debug4 order by event_time desc 

only showing top 20 row(s)


0,1,2,3,4,5
key,timestamp,event_time,symbol,price,quantity
efefebba-940c-4d09-9ebd-7b369d23a389,2022-03-09 12:12:54.881000,2022-03-09 12:12:54,AAPL,272.2699890136719,279
8a9229ac-2b31-4877-b56a-12f265b4fd25,2022-03-09 12:12:54.833000,2022-03-09 12:12:54,MSFT,146.0800018310547,557
96e822b1-dfd9-4d24-97e3-282578ff1062,2022-03-09 12:12:54.797000,2022-03-09 12:12:54,GOOG,151.11000061035156,629
a1cf5931-cc05-47e0-8aa6-87ee634e6b12,2022-03-09 12:12:50.793000,2022-03-09 12:12:50,GOOG,237.11000061035156,514
9d6d5588-537d-4cbd-b58c-4fb828b588aa,2022-03-09 12:12:50.877000,2022-03-09 12:12:50,AAPL,151.05999755859375,332
c9babdd2-d3d0-4453-ae8a-863b654bc45d,2022-03-09 12:12:50.829000,2022-03-09 12:12:50,MSFT,140.60000610351562,998
b1aa0482-e60d-4007-94c5-cbd5cba2fc3a,2022-03-09 12:12:46.825000,2022-03-09 12:12:46,MSFT,264.82000732421875,712
4d84d6c9-42b2-48dc-a151-cbcc35d437f0,2022-03-09 12:12:46.874000,2022-03-09 12:12:46,AAPL,291.510009765625,572
908c9580-0d95-43d3-bb9d-0f208a51ff04,2022-03-09 12:12:46.789000,2022-03-09 12:12:46,GOOG,249.1300048828125,22


In [48]:
debug4.stop()


## Now that we have a normal DataFrame, let's manipulate it how we want and write the results out to another stream. Try the following, it will fail. Read why.

In [49]:
from pyspark.sql.utils import AnalysisException

print(df4)
df4.createOrReplaceTempView('stocks2')

# This batch-style aggregate over a streaming source is expected to fail.
# The error is the point of the demo, so catch and display it rather than
# letting it abort a "Restart & Run All".
try:
    spark.sql('SELECT symbol, count(*) as cnt, sum(quantity) as qty from stocks2 group by symbol').show()
except AnalysisException as err:
    print(f"AnalysisException: {err}")

DataFrame[key: string, timestamp: timestamp, event_time: string, symbol: string, price: float, quantity: int]


AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka

### Streaming sources can't be aggregated unless we add a window to them.

In [50]:
# Sum the traded quantity per symbol in fixed 10-second windows over the
# Kafka message timestamp, with a 10-second watermark on that same column.
# `window` and `sum` take column names here, so they are presumably the Spark
# SQL functions from the earlier `from pyspark.sql.functions import *`
# (which shadows the builtin sum) - confirm.
fixed_window = (df4.select("timestamp", "symbol", "quantity")
        .withWatermark("timestamp", "10 seconds") 
        .groupBy(window("timestamp", "10 seconds").alias("window"), "symbol") 
        .agg(sum("quantity").alias("sum"))
        )
print(fixed_window)

# When the cell is re-run, stop the query started by the previous run first.
if 'debug5' in locals():
    debug5.stop()
debug5 = write_memory(fixed_window, 'debug5')


DataFrame[window: struct<start:timestamp,end:timestamp>, symbol: string, sum: bigint]


## We can see that we get an aggregate by symbol every ten seconds. This data can be written off somewhere like a SQL or NoSQL database or forwarded as a new message to create a streaming aggregation.

In [57]:
%%sparksql
select * from debug5 order by window desc, symbol limit 9

0,1,2
window,symbol,sum


In [27]:
debug5.stop()

## Sliding windows are similar except you give it two parameters, the first is the total length of the window and the second is the refresh interval. In this case, the windows will overlap.

In [28]:
# Sum the traded quantity per symbol over 30-second windows that start every
# 10 seconds, so consecutive windows overlap.
sliding_window = (df4.select("timestamp", "symbol","quantity")
        .withWatermark("timestamp", "10 seconds") 
        .groupBy(window("timestamp", "30 seconds", "10 seconds").alias("window"), "symbol") 
        .agg(sum("quantity").alias("sum"))
        )
print(sliding_window)

# Same guard as the other streaming cells: stop the query from a previous run,
# otherwise re-running this cell fails because a query named 'debug6' is
# still active.
if 'debug6' in locals():
    debug6.stop()
debug6 = write_memory(sliding_window, 'debug6')


DataFrame[window: struct<start:timestamp,end:timestamp>, symbol: string, sum: bigint]


In [29]:
%%sparksql
select * from debug6 order by window desc, symbol limit 21


0,1,2
window,symbol,sum


In [30]:
debug6.stop()

## Session Window is similar but used to group data that represents a continuous stream of activity. The time specifies a timeout period or period of inactivity that indicates when a session should end.

In [31]:
from pyspark.sql import functions as F

# Sum the traded quantity per symbol within sessions that end after a
# 5-minute gap.
# The DataFrame keeps its original name `session_window`, which rebinds the
# star-imported function of the same name. Calling the function through the
# module alias F keeps this cell re-runnable: the unqualified call failed on a
# second run with "'DataFrame' object is not callable".
session_window = (df4.select("timestamp", "symbol","quantity")
        .withWatermark("timestamp", "10 seconds") 
        .groupBy(F.session_window("timestamp", "5 minutes").alias("window"), "symbol") 
        .agg(F.sum("quantity").alias("sum"))
        )
print(session_window)

# Same guard as the other streaming cells: stop the query from a previous run
# before starting a new one under the same name.
if 'debug7' in locals():
    debug7.stop()
debug7 = write_memory(session_window, 'debug7')


DataFrame[window: struct<start:timestamp,end:timestamp>, symbol: string, sum: bigint]


## Let's join the streaming aggregation with a static reference table.

In [32]:
# Static reference data: ticker symbol -> company name.
x = sc.parallelize([('AAPL', 'Apple'), ('MSFT', 'Microsoft'), ('GOOG','Google')])
stocks = spark.createDataFrame(x, 'symbol:string, name:string')

# Register both sides as views so the join can be written in SQL.
# Note: this replaces any existing temp view named 'stocks'.
stocks.createOrReplaceTempView('stocks')
fixed_window.createOrReplaceTempView('trades')

# Enrich each windowed aggregate row with the company name.
joined_aggregate = spark.sql("""
SELECT t.*, s.name
FROM trades as t
JOIN stocks as s on t.symbol = s.symbol
""")

# Same guard as the other streaming cells: stop the query from a previous run
# before starting a new one under the same name.
if 'debug8' in locals():
    debug8.stop()
debug8 = write_memory(joined_aggregate, 'debug8')



In [33]:
%%sparksql
select * from debug8 order by window desc, symbol limit 9


0,1,2,3
window,symbol,sum,name


In [34]:
debug8.stop()

### Sample of what a consumer record in AVRO looks like

ConsumerRecord(topic='stocks-avro', partition=0, offset=40679, timestamp=1645243362535, timestamp_type=0, key=b'\xd6\x0cgMs<By\xb8\xcaR\x02\xe0\xfa\x93\x14', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J\x02>&2022-02-19 04:02:42\x08MSFT\xa4p\xdfB\xf0\t$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)
ConsumerRecord(topic='stocks-avro', partition=0, offset=40679, timestamp=1645243362535, timestamp_type=0, key=b'\xd6\x0cgMs<By\xb8\xcaR\x02\xe0\xfa\x93\x14', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J\x02>&2022-02-19 04:02:42\x08MSFT\xa4p\xdfB\xf0\t$\x7f#w\xeaD\xdchK\xc5!\xf85\x10\xc8J', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)

ConsumerRecord(topic='stocks-avro', partition=0, offset=40814, timestamp=1645243470645, timestamp_type=0, key=b'\xc8\xeb\xc2\xe9O\xaaJ\x86\x83\x85\xb9\xd7\xf46\xea\x8f', value=b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xc6\x03{"type": "record", "name": "Stock", "namespace": "stock.avro", "fields": [{"type": "string", "name": "event_time"}, {"type": "string", "name": "symbol"}, {"type": "float", "name": "price"}, {"type": "int", "name": "quantity"}]}\x00\xe8\xf5x\r\xbf\x8aC\x98&\xaf\x13iz\x9dp\x13\x02>&2022-02-19 04:04:30\x08MSFT\xc3\xf5\x11C\xac\x04\xe8\xf5x\r\xbf\x8aC\x98&\xaf\x13iz\x9dp\x13', headers=[], checksum=None, serialized_key_size=16, serialized_value_size=328, serialized_header_size=-1)
