In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
import os

# Supplier delay summary pipeline.
#
# Reads the order CSV from mounted Google Drive, flags orders whose
# delivery_date is later than expected_date, counts the flagged orders per
# supplier_id, shows the result, and writes it out as CSV and Parquet.

# Input and output locations, named once so the existence check below and the
# Spark reader/writers cannot drift apart.
INPUT_PATH = "/content/drive/My Drive/orders_data.csv"
SUMMARY_CSV_PATH = "output/supplier_delay_summary.csv"
SUMMARY_PARQUET_PATH = "output/supplier_delay_summary.parquet"

# List files in the /content/ directory (shows whether "drive" is mounted)
print("Files in /content/:", os.listdir("/content/"))

# Listing /content/ does not prove the CSV is present, because the file is read
# from Google Drive. Check the real input path and fail fast, before starting
# Spark, with a message that says how to fix it.
if not os.path.exists(INPUT_PATH):
    raise FileNotFoundError(
        f"Order data not found at {INPUT_PATH!r}. Mount Google Drive with "
        "drive.mount('/content/drive') and place orders_data.csv in 'My Drive' "
        "before running this cell."
    )

# Initialize Spark session
spark = SparkSession.builder \
    .appName("SupplyChain_Week3") \
    .getOrCreate()

try:
    # Load order data from CSV (first row is the header; no schema inference,
    # so every column is read as a string)
    df = spark.read.option("header", True).csv(INPUT_PATH)

    # Convert date columns to date type
    df = df.withColumn("delivery_date", col("delivery_date").cast("date")) \
           .withColumn("expected_date", col("expected_date").cast("date"))

    # Calculate delay: 1 when delivered after the expected date, else 0.
    # A NULL on either side makes the comparison NULL, which falls through to
    # otherwise(0), so rows with a missing date are counted as not delayed.
    # NOTE(review): depending on the Spark ANSI setting, unparseable date strings
    # may also become NULL after the cast above - confirm the input is clean.
    df = df.withColumn("is_delayed", when(col("delivery_date") > col("expected_date"), 1).otherwise(0))

    # Filter delayed shipments
    delayed_df = df.filter(col("is_delayed") == 1)

    # Group by supplier and count delayed orders. Suppliers with no delayed
    # orders do not appear in the summary because they were filtered out above.
    supplier_delay_summary = delayed_df.groupBy("supplier_id").agg(count("*").alias("delayed_orders"))

    # Show results
    supplier_delay_summary.show()

    # Save processed data to CSV and Parquet (existing output is overwritten)
    supplier_delay_summary.write.mode("overwrite").csv(SUMMARY_CSV_PATH, header=True)
    supplier_delay_summary.write.mode("overwrite").parquet(SUMMARY_PARQUET_PATH)
finally:
    # Stop the session even when a step above fails, so a failed run does not
    # leave the Spark session running.
    spark.stop()

Files in /content/: ['.config', 'drive', 'sample_data']
+-----------+--------------+
|supplier_id|delayed_orders|
+-----------+--------------+
|     SUP004|             1|
|     SUP001|             3|
|     SUP005|             1|
|     SUP002|             1|
+-----------+--------------+



In [3]:
# Attach Google Drive to the Colab filesystem so that files stored in Drive
# are reachable under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


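**Note:** Make sure your `orders_data.csv` file is available in the Colab environment before running the next cell. The order-processing cell in this notebook reads it from Google Drive at `/content/drive/My Drive/orders_data.csv`, so mount Drive and place the file in `My Drive` first.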