In [0]:
# Display the `spark` session object; the cells below read data and run SQL through it.
spark

<pyspark.sql.connect.session.SparkSession at 0xffefe0808910>

In [0]:
# Load the cleaned order data into a Spark DataFrame.
# The first CSV row is treated as the header and column types are inferred.
cleaned_orders_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/csv_data/customersordersinsights/cleaned_orders.csv")
)

# Preview the loaded rows.
cleaned_orders_df.show()

+--------+-----------+-------------+----------+-------------+---------+-----------------+----------+-------+
|order_id|customer_id|customer_name|order_date|delivery_date|   status|            issue|delay_days|delayed|
+--------+-----------+-------------+----------+-------------+---------+-----------------+----------+-------+
|       1|        101|  Amit Sharma|05-01-2024|   05-05-2024|Delivered|         No issue|       414|      1|
|       2|        102|  Priya Singh|05-03-2024|         NULL|  Delayed|    Late delivery|         0|      0|
|       3|        101|  Amit Sharma|05-10-2024|   05-09-2024|Delivered|         No issue|       291|      1|
|       4|        103|   Ravi Kumar|05-12-2024|         NULL|  Pending|Address not found|         0|      0|
|       5|        104|  Anita Desai|      NULL|         NULL|Delivered|         No issue|         0|      0|
|       6|        102|  Priya Singh|      NULL|         NULL|  Delayed|    Late delivery|         0|      0|
|       7|        1

In [0]:
# Pipeline to derive the latest delivery status for every order.
from pyspark.sql.functions import when, col, isnull, to_date

# order_date / delivery_date appear to be loaded as text (the preview prints
# them as 05-01-2024 rather than ISO dates), so comparing the raw columns is a
# lexicographic string comparison and mis-orders dates whenever the leading
# field differs (e.g. "03-02-2024" sorts before "25-01-2024").
# NOTE(review): dd-MM-yyyy is assumed because the previewed delay_days values
# (414 vs 291, i.e. 123 days apart) match deliveries on 5 May and 5 Sep;
# confirm against the step that produced cleaned_orders.csv.
ORDER_DATE_FORMAT = "dd-MM-yyyy"

order_dt = to_date(col("order_date"), ORDER_DATE_FORMAT)
delivery_dt = to_date(col("delivery_date"), ORDER_DATE_FORMAT)

updated_orders_df = cleaned_orders_df.withColumn(
    "delivery_status",
    # No delivery date recorded -> the order is still pending.
    when(col("delivery_date").isNull(), "Pending")
    # Compare parsed dates, not strings.
    # NOTE(review): this labels any delivery after the order date as late;
    # confirm that is the intended business rule.
    .when(delivery_dt > order_dt, "Delivered Late")
    # Everything else, including rows where the comparison is null because
    # order_date is missing.
    .otherwise("Delivered On Time")
)

updated_orders_df.show()

+--------+-----------+-------------+----------+-------------+---------+-----------------+----------+-------+-----------------+
|order_id|customer_id|customer_name|order_date|delivery_date|   status|            issue|delay_days|delayed|  delivery_status|
+--------+-----------+-------------+----------+-------------+---------+-----------------+----------+-------+-----------------+
|       1|        101|  Amit Sharma|05-01-2024|   05-05-2024|Delivered|         No issue|       414|      1|   Delivered Late|
|       2|        102|  Priya Singh|05-03-2024|         NULL|  Delayed|    Late delivery|         0|      0|          Pending|
|       3|        101|  Amit Sharma|05-10-2024|   05-09-2024|Delivered|         No issue|       291|      1|Delivered On Time|
|       4|        103|   Ravi Kumar|05-12-2024|         NULL|  Pending|Address not found|         0|      0|          Pending|
|       5|        104|  Anita Desai|      NULL|         NULL|Delivered|         No issue|         0|      0|   

In [0]:
# Persist the status-enriched orders in two formats, replacing earlier output.

# 1) CSV with a header row; coalesce(1) reduces the data to a single partition
#    before writing.
(
    updated_orders_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("/Volumes/workspace/csv_data/customersordersinsights/output/updated_orders_csv")
)

# 2) Delta format at the sibling output path.
(
    updated_orders_df
    .write
    .mode("overwrite")
    .format("delta")
    .save("/Volumes/workspace/csv_data/customersordersinsights/output/updated_orders_delta")
)

In [0]:
# SQL query to show the top 5 customers by number of late deliveries.
# Register the DataFrame under a view name so the SQL below can reference it.
updated_orders_df.createOrReplaceTempView("orders_view")

# customer_name is a secondary sort key so customers tied on delayed_orders
# come back in a stable order; without it, which tied rows survive LIMIT 5 is
# not guaranteed to be the same from run to run.
spark.sql("""
    SELECT customer_name, COUNT(*) AS delayed_orders
    FROM orders_view
    WHERE delivery_status = 'Delivered Late'
    GROUP BY customer_name
    ORDER BY delayed_orders DESC, customer_name ASC
    LIMIT 5
""").show()

+-------------+--------------+
|customer_name|delayed_orders|
+-------------+--------------+
|  Amit Sharma|             1|
+-------------+--------------+

