In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window

spark = SparkSession.builder.appName("Chapter3").getOrCreate()

staticDF = spark.read.format("csv").option("header", "true").load("retail-data/by-day/*.csv")
static_schema = staticDF.schema

print(static_schema)



StructType(List(StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,StringType,true),StructField(InvoiceDate,StringType,true),StructField(UnitPrice,StringType,true),StructField(CustomerID,StringType,true),StructField(Country,StringType,true)))


In [3]:
staticDF.selectExpr("CustomerID", "Quantity * UnitPrice as TotalCost", "InvoiceDate").groupBy(col("CustomerID"), window(col("InvoiceDate"), "1 day")).sum("TotalCost").show(10)

+----------+--------------------+------------------+
|CustomerID|              window|    sum(TotalCost)|
+----------+--------------------+------------------+
|   15274.0|{2011-12-05 05:30...|            332.58|
|   14719.0|{2011-12-08 05:30...|406.41999999999985|
|   16794.0|{2011-12-08 05:30...|100.66000000000003|
|   12464.0|{2011-11-29 05:30...|             281.9|
|   15269.0|{2011-11-16 05:30...|             408.8|
|   12720.0|{2011-11-16 05:30...|            409.02|
|   15900.0|{2011-11-16 05:30...| 351.1099999999999|
|   12600.0|{2011-11-11 05:30...|379.58000000000004|
|   16161.0|{2010-12-06 05:30...|125.20000000000002|
|   14646.0|{2011-11-23 05:30...|10078.640000000003|
+----------+--------------------+------------------+
only showing top 10 rows



In [4]:
streamingDF = spark.readStream.format("csv").option("header", "true").schema(static_schema).load("retail-data/by-day/*.csv")

purchaseByHourDF = streamingDF.selectExpr("CustomerID", "Quantity * UnitPrice as TotalCost", "InvoiceDate").groupBy("CustomerID").sum("TotalCost")


purchaseByHourDF.writeStream.format("memory").queryName("customer_purchased").outputMode("complete").start()




<pyspark.sql.streaming.StreamingQuery at 0x7fc9bef5ce80>

In [7]:
spark.sql("select * from customer_purchased").show(5)

+----------+------------------+
|CustomerID|    sum(TotalCost)|
+----------+------------------+
|   14349.0|133.50000000000006|
|   16553.0|           5664.57|
|   12535.0| 716.3500000000001|
|   17966.0|           1098.43|
|   13514.0|152.20000000000002|
+----------+------------------+
only showing top 5 rows

