In [0]:
spark

<pyspark.sql.connect.session.SparkSession at 0xffff2c335e90>

In [0]:
from pyspark.sql.functions import col, to_date, datediff, current_date, when

#Data
df = spark.read.option("header", "true").csv("/Volumes/workspace/supplychain/csv_data/orders_week4.csv", inferSchema=True)
df.show()

+--------+-----------+-------------+
|order_id|supplier_id|delivery_date|
+--------+-----------+-------------+
|       1|        101|   2025-05-01|
|       2|        102|   2025-04-30|
|       3|        103|   2025-05-20|
+--------+-----------+-------------+



In [0]:
#clean and filter the data
# Convert delivery_date to date type
df = df.withColumn("delivery_date", to_date(col("delivery_date")))
df.show()

# Calculate delay_days
df = df.withColumn("delay_days", datediff(current_date(), col("delivery_date")))
df.show()

# Mark delayed orders
df = df.withColumn("is_delayed", when(col("delay_days") > 0, 1).otherwise(0))
df.show()

# Preview cleaned data
df.select("order_id", "supplier_id", "delivery_date", "delay_days", "is_delayed")
df.show()

+--------+-----------+-------------+
|order_id|supplier_id|delivery_date|
+--------+-----------+-------------+
|       1|        101|   2025-05-01|
|       2|        102|   2025-04-30|
|       3|        103|   2025-05-20|
+--------+-----------+-------------+

+--------+-----------+-------------+----------+
|order_id|supplier_id|delivery_date|delay_days|
+--------+-----------+-------------+----------+
|       1|        101|   2025-05-01|        53|
|       2|        102|   2025-04-30|        54|
|       3|        103|   2025-05-20|        34|
+--------+-----------+-------------+----------+

+--------+-----------+-------------+----------+----------+
|order_id|supplier_id|delivery_date|delay_days|is_delayed|
+--------+-----------+-------------+----------+----------+
|       1|        101|   2025-05-01|        53|         1|
|       2|        102|   2025-04-30|        54|         1|
|       3|        103|   2025-05-20|        34|         1|
+--------+-----------+-------------+----------+--

In [0]:
#Save cleaned output as Delta or CSV
#Delta
df.write.mode("overwrite").format("delta") \
    .save("/Volumes/workspace/supplychain/supply_output/week4_cleaned_orders_delta")

#CSV
df.write.mode("overwrite").option("header", "true") \
    .csv("/Volumes/workspace/supplychain/supply_output/week4_cleaned_orders_csv")

In [0]:
#Run basic analysis queries using SQL or PySpark
df.createOrReplaceTempView("week4_cleaned_orders")

#SQL query for delayed orders count by supplier
spark.sql("""
SELECT supplier_id, COUNT(*) AS delayed_count
FROM week4_cleaned_orders
WHERE is_delayed = 1
GROUP BY supplier_id
ORDER BY delayed_count DESC
""").show()

+-----------+-------------+
|supplier_id|delayed_count|
+-----------+-------------+
|        102|            1|
|        101|            1|
|        103|            1|
+-----------+-------------+

