In [0]:
# COMMAND ----------
# Language: Python (PySpark)

# DBFS folder that holds the supplier metrics output.
# Set this to the folder name you found in Step 1 if it differs in your workspace.
# For example: dbfs_path = "/FileStore/tables/metrics_run_2025/"
dbfs_path = "/FileStore/tables/supplier_metrics_output.txt/"

# Load the CSV data from DBFS: the first row supplies the column names and
# Spark infers each column's type from the data.
orders_summary_df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(dbfs_path)
)

# Quick sanity check: print the inferred schema and preview the first rows.
print("--- Data Loaded Successfully ---")
orders_summary_df.printSchema()
orders_summary_df.show(5)

--- Data Loaded Successfully ---
root
 |-- supplier_id: string (nullable = true)
 |-- Delayed_Order_Count: integer (nullable = true)
 |-- Total_Delayed_Value: double (nullable = true)

+-----------+-------------------+-------------------+
|supplier_id|Delayed_Order_Count|Total_Delayed_Value|
+-----------+-------------------+-------------------+
|       S001|                  1|             2500.0|
+-----------+-------------------+-------------------+



In [0]:
# COMMAND ----------
# Language: Python (PySpark)

from pyspark.sql.functions import col

# Minimum number of delayed orders a supplier must have to be kept as "critical".
# Tune the cut-off here instead of editing the filter expression below.
MIN_DELAYED_ORDERS = 2

print("--- 2. Transformation: Cleaning and Filtering ---")

# Keep only suppliers whose Delayed_Order_Count is at or above the threshold.
# This helps identify critical suppliers for optimization.
# The result is a new DataFrame; orders_summary_df itself is not modified.
cleaned_df = orders_summary_df.filter(
    col("Delayed_Order_Count") >= MIN_DELAYED_ORDERS
)

print("\nCritical Suppliers (Filtered Output):")
cleaned_df.show()

--- 2. Transformation: Cleaning and Filtering ---

Critical Suppliers (Filtered Output):
+-----------+-------------------+-------------------+
|supplier_id|Delayed_Order_Count|Total_Delayed_Value|
+-----------+-------------------+-------------------+
+-----------+-------------------+-------------------+



In [0]:
# COMMAND ----------
# Language: Python (PySpark)

# Define a persistent path for the Delta Table in your data lake
delta_output_path = "/mnt/data_lake/critical_supplier_metrics_delta"

# Save the final, filtered DataFrame as a Delta table.
# mode("overwrite") replaces whatever data already exists at this path on every run.
(
    cleaned_df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_output_path)
)

# Plain string literal: this message has no placeholders, so no f-prefix is needed.
print("--- 3. Load: Cleaned Output Saved to Delta ---")
print(f"Path: {delta_output_path}")

# DELIVERABLE: Cleaned output stored in Delta format

--- 3. Load: Cleaned Output Saved to Delta ---
Path: /mnt/data_lake/critical_supplier_metrics_delta


In [0]:
# COMMAND ----------
# Language: Python (PySpark)

# Location of the Delta table to analyse.
# Redeclared in this cell; it must match the path the table was written to.
delta_output_path = "/mnt/data_lake/critical_supplier_metrics_delta"

# Query run against the temporary view: all three columns, highest
# Total_Delayed_Value first.
analysis_sql = """
SELECT 
    supplier_id,
    Delayed_Order_Count,
    Total_Delayed_Value
FROM critical_metrics_vw
ORDER BY Total_Delayed_Value DESC
"""

# Load the Delta table and expose it to Spark SQL under a temporary view name.
delta_df = spark.read.format("delta").load(delta_output_path)
delta_df.createOrReplaceTempView("critical_metrics_vw")

print("--- Analysis Query Results ---")
spark.sql(analysis_sql).show()

--- Analysis Query Results ---
+-----------+-------------------+-------------------+
|supplier_id|Delayed_Order_Count|Total_Delayed_Value|
+-----------+-------------------+-------------------+
+-----------+-------------------+-------------------+

