In [0]:
# Display the active SparkSession object as the cell output.
# NOTE(review): `spark` is not created in any visible cell; presumably the
# notebook runtime injects it -- confirm before running in another environment.
spark

<pyspark.sql.connect.session.SparkSession at 0xfffe20601dd0>

In [0]:
# Read the cleaned supply-chain order CSV into a Spark DataFrame.
# The first row is treated as the header and column types are inferred.
orders_csv_path = "/Volumes/workspace/supplychain/csv_data/cleaned_supply_chain_data.csv"
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(orders_csv_path)
)
df.show()

+--------+-----------+-------------+----------+----------+
|order_id|supplier_id|delivery_date|delay_days|is_delayed|
+--------+-----------+-------------+----------+----------+
|       1|        101|   2025-05-01|        32|         1|
|       2|        102|   2025-04-30|        33|         1|
|       3|        103|   2025-05-20|        13|         1|
+--------+-----------+-------------+----------+----------+



In [0]:
# Keep only the orders flagged as delayed (is_delayed == 1).
from pyspark.sql.functions import col

is_delayed_flag = col("is_delayed") == 1
delayed_df = df.where(is_delayed_flag)
delayed_df.show()

+--------+-----------+-------------+----------+----------+
|order_id|supplier_id|delivery_date|delay_days|is_delayed|
+--------+-----------+-------------+----------+----------+
|       1|        101|   2025-05-01|        32|         1|
|       2|        102|   2025-04-30|        33|         1|
|       3|        103|   2025-05-20|        13|         1|
+--------+-----------+-------------+----------+----------+



In [0]:
# Count delayed orders per supplier and expose the tally as "delayed_orders".
per_supplier_counts = delayed_df.groupBy("supplier_id").count()
grouped_df = per_supplier_counts.withColumnRenamed("count", "delayed_orders")
grouped_df.show()

+-----------+--------------+
|supplier_id|delayed_orders|
+-----------+--------------+
|        102|             1|
|        101|             1|
|        103|             1|
+-----------+--------------+



In [0]:
# Persist the per-supplier delayed-order counts as both CSV and Parquet.

# Ensure the destination volume exists; issued through spark.sql so it works
# from a Python notebook as well as a SQL notebook.
spark.sql("""
    CREATE VOLUME IF NOT EXISTS workspace.supplychain.supply_output
    COMMENT 'For storing processed supply chain data'
""")

csv_output_path = "/Volumes/workspace/supplychain/supply_output/week3_delayed_orders_csv"
parquet_output_path = "/Volumes/workspace/supplychain/supply_output/week3_delayed_orders_parquet"

# CSV output: replace any previous run's output and include a header row.
(
    grouped_df.write
    .mode("overwrite")
    .option("header", "true")
    .csv(csv_output_path)
)

# Parquet output: replace any previous run's output.
(
    grouped_df.write
    .mode("overwrite")
    .parquet(parquet_output_path)
)