In [0]:
# Start from a clean slate: recursively delete the streaming input directory left by earlier runs.
# (The output table and its warehouse directory are cleaned up in the following cells.)
stale_input_dir = "dbfs:/FileStore/streaming_input/input1"
dbutils.fs.rm(stale_input_dir, recurse=True)



Out[24]: True

In [0]:
# Drop the target table left over from a previous run.
# `if exists` keeps this cell from raising when the table is not there yet
# (e.g. on the very first run), so the notebook survives a fresh "Run All".
spark.sql("drop table if exists orders_result_final")


Out[25]: DataFrame[]

In [0]:
# Recursively remove the dropped table's directory under the Hive warehouse location as well.
warehouse_dir = "dbfs:/user/hive/warehouse/orders_result_final"
dbutils.fs.rm(warehouse_dir, recurse=True)

Out[26]: False

In [0]:
# Recreate the (now empty) directory that the streaming reader watches for input files.
fresh_input_dir = "dbfs:/FileStore/streaming_input/input1"
dbutils.fs.mkdirs(fresh_input_dir)

Out[27]: True

In [0]:
# Delete the checkpoint of the previous run so the stream reprocesses the input from scratch.
# The directory must be the one the writeStream cell passes as `checkpointLocation`
# ("Checkpointlocation117"). The earlier value "Checkpointlocation105" named a different
# directory, so the checkpoint that is actually used was never cleared.
# NOTE(review): assumes the relative checkpoint path resolves under the DBFS root — confirm.
dbutils.fs.rm("dbfs:/Checkpointlocation117", True)

Out[28]: True

In [0]:
# Create the empty target table that the streaming aggregation writes into.
create_target_table_sql = (
    "create table orders_result_final "
    "(customer_id long, orders_placed long, products_purchased long, amount_spent double)"
)
spark.sql(create_target_table_sql)

Out[29]: DataFrame[]

In [0]:
# DDL-style schema string for the incoming order JSON: scalar order/customer columns plus an
# array of line-item structs. Despite the name, the value is a DDL string, not JSON.
# NOTE(review): `line_iems` looks like a misspelling of `line_items`; it has to match the key
# used in the input files, so confirm against the data before renaming.
schema_json = (
    "order_id long, customer_id long, customer_fname string, customer_lname string,  "
    "city string, state string, pincode long, "
    "line_iems array<struct<"
    "order_item_id:long, order_item_product_id:long, order_item_quantity:long, "
    "order_item_subtotal:float, order_item_product_price:float"
    ">>"
)

In [0]:
# Streaming reader over JSON files in the input directory, parsed with the explicit `schema_json`.
order_data = (
    spark.readStream
    .format("json")
    .schema(schema_json)
    .option("path", "dbfs:/FileStore/streaming_input/input1")
    .load()
)


In [0]:
# Expose the streaming DataFrame to Spark SQL under the name `orders`.
orders_view_name = "orders"
order_data.createOrReplaceTempView(orders_view_name)

In [0]:
# One output row per element of the `line_iems` array; each element is kept as a struct
# column named `lines` alongside the order-level columns.
exploded_orders = spark.sql(
    "select order_id, customer_id, city, state, pincode, explode(line_iems) as lines from orders"
)

In [0]:
# Register the exploded rows as the temp view `exploded_orders` for the next SQL step.
exploded_view_name = "exploded_orders"
exploded_orders.createOrReplaceTempView(exploded_view_name)

In [0]:
# Flatten the `lines` struct: promote each line-item field to a top-level column with a
# shorter alias, keeping the order-level columns unchanged.
flatten_sql = """select order_id, customer_id, city, state, pincode, 
                            lines.order_item_id as item_id,
                            lines.order_item_product_id as product_id, 
                            lines.order_item_quantity as quantity,
                            lines.order_item_product_price as product_price, 
                            lines.order_item_subtotal as subtotal
                            from exploded_orders"""
flattend_orders = spark.sql(flatten_sql)

In [0]:
# Register the flattened rows as the temp view `flattened_orders` used by the aggregation.
flattened_view_name = "flattened_orders"
flattend_orders.createOrReplaceTempView(flattened_view_name)

In [0]:
# Per-customer summary: approximate number of distinct orders, number of purchased line items
# and total amount spent.
# NOTE(review): approx_count_distinct is presumably used because exact distinct aggregation
# is not supported on streaming DataFrames — confirm.
aggregation_sql = """select customer_id, approx_count_distinct(order_id) as orders_placed, count(product_id) as products_purchased, sum(subtotal) 
                              as amount_spent
                              from flattened_orders
                              group By customer_id
                              """
aggregated_orders = spark.sql(aggregation_sql)

In [0]:
# Available-now trigger (not a fixed-interval trigger): process the input that is available
# when the query starts, then stop. "complete" output mode rewrites the whole aggregated
# result into the Delta table `orders_result_final`.
streaming_query = (
    aggregated_orders
    .writeStream
    .format("delta")
    .outputMode("complete")
    .trigger(availableNow=True)
    .option("checkpointLocation", "Checkpointlocation117")
    .toTable("orders_result_final")
)

# toTable() starts the query in the background and returns immediately. Block until the
# available-now run has finished so that later cells read the fully written result table.
streaming_query.awaitTermination()

In [0]:
# Display the per-customer aggregates that the stream wrote to the target table.
orders_result = spark.sql("select * from orders_result_final")
orders_result.show()

+-----------+-------------+------------------+------------------+
|customer_id|orders_placed|products_purchased|      amount_spent|
+-----------+-------------+------------------+------------------+
|      11318|            1|                 5|1129.8600387573242|
|      11599|            1|                 1| 299.9800109863281|
|        256|            1|                 3| 579.9800109863281|
|       8827|            1|                 4| 699.8500099182129|
+-----------+-------------+------------------+------------------+



In [0]:
# Print the plan of the streaming query (non-extended form, which is the default).
streaming_query.explain(extended=False)

