In [0]:
from pyspark.sql.functions import col, when, to_date

# Inline sample of supplier orders in CSV form: one header row, ten orders.
csv_data = """order_id,supplier_id,product_name,quantity,expected_date,delivery_date,status
1001,SUP001,Laptop,10,2025-10-01,2025-10-03,Delivered
1002,SUP002,Mouse,50,2025-10-02,2025-10-02,Delivered
1003,SUP001,Keyboard,20,2025-10-05,2025-10-07,Delivered
1004,SUP003,Monitor,15,2025-10-06,2025-10-05,Delivered
1005,SUP002,USB Cable,100,2025-10-03,2025-10-08,Delivered
1006,SUP004,SSD,25,2025-10-07,2025-10-06,Delivered
1007,SUP001,Headset,30,2025-10-05,2025-10-09,Delivered
1008,SUP005,Printer,5,2025-10-01,2025-10-10,Delivered
1009,SUP003,Router,12,2025-10-04,2025-10-04,Delivered
1010,SUP004,Camera,8,2025-10-02,2025-10-05,Delivered
"""

# Parse the CSV text into a pandas DataFrame, then hand it over to Spark.
import io
import pandas as pd

# read_csv expects a path or file-like object, so wrap the string in a buffer.
csv_buffer = io.StringIO(csv_data)
pdf = pd.read_csv(csv_buffer)

# `spark` is the session object supplied by the notebook environment.
df = spark.createDataFrame(pdf)

# Convert both date columns from "yyyy-MM-dd" text into Spark dates.
parsed_delivery = to_date(col("delivery_date"), "yyyy-MM-dd")
parsed_expected = to_date(col("expected_date"), "yyyy-MM-dd")
df = df.withColumn("delivery_date", parsed_delivery)
df = df.withColumn("expected_date", parsed_expected)

# Flag an order with 1 when it was delivered after its expected date;
# every other case (including a comparison that is not true) gets 0.
late_flag = when(col("delivery_date") > col("expected_date"), 1).otherwise(0)
df = df.withColumn("is_delayed", late_flag)

# Keep only the columns needed for the per-supplier delay report.
report_columns = [
    "order_id",
    "supplier_id",
    "expected_date",
    "delivery_date",
    "is_delayed",
]
cleaned_df = df.select(*report_columns)

# Persist the cleaned orders twice under dbfs:/FileStore, replacing any
# previous output: once as a Delta table, once as CSV with a header row.
(
    cleaned_df.write
    .format("delta")
    .mode("overwrite")
    .save("dbfs:/FileStore/cleaned_orders_delta")
)
(
    cleaned_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("dbfs:/FileStore/cleaned_orders_csv")
)

# Expose the cleaned orders to Spark SQL under the view name "orders".
cleaned_df.createOrReplaceTempView("orders")

# Per-supplier summary: total number of orders and how many of them were
# flagged as delayed (is_delayed is 1 for a late order and 0 otherwise, so
# SUM counts the late ones).
# supplier_id is a secondary sort key: several suppliers can share the same
# delayed_orders value, and without a tie-breaker their relative order is
# not guaranteed to be the same from run to run.
result = spark.sql("""
    SELECT supplier_id,
           COUNT(*) AS total_orders,
           SUM(is_delayed) AS delayed_orders
    FROM orders
    GROUP BY supplier_id
    ORDER BY delayed_orders DESC, supplier_id ASC
""")

# `display` is the notebook's rich table renderer.
display(result)

supplier_id,total_orders,delayed_orders
SUP001,3,3
SUP002,2,1
SUP004,2,1
SUP005,1,1
SUP003,2,0
