In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("InventoryAlertingSystem") \
    .getOrCreate()

spark

In [0]:
# Scenario 1: Inventory Alerting System
# Tasks:
# 1. Load the data using PySpark.
df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/Coding Assessment Datasets/inventory_supply.csv")
df.printSchema()
df.show()

# 2. Create a new column NeedsReorder = StockQty < ReorderLevel .
from pyspark.sql.functions import col

df_with_alerts = df.withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel"))
df_with_alerts.show()

# 3. Create a view of all items that need restocking.
items_to_reorder = df_with_alerts.filter(col("NeedsReorder") == True)
items_to_reorder.createOrReplaceTempView("RestockView")

spark.sql("SELECT * FROM RestockView").show()

# 4. Highlight warehouses with more than 2 such items.
from pyspark.sql.functions import count

critical_warehouses = items_to_reorder.groupBy("Warehouse") \
    .agg(count("*").alias("RestockItemCount")) \
    .filter(col("RestockItemCount") > 1)  

critical_warehouses.show()

root
 |-- ItemID: string (nullable = true)
 |-- ItemName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Warehouse: string (nullable = true)
 |-- StockQty: integer (nullable = true)
 |-- ReorderLevel: integer (nullable = true)
 |-- LastRestocked: date (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Supplier: string (nullable = true)

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|
|  I004|Refrigerat

In [0]:
#Scenario 2: Supplier Price Optimization
# Tasks:
# 1. Group items by Supplier and compute average price.
from pyspark.sql.functions import avg

supplier_avg_price = df.groupBy("Supplier").agg(avg("UnitPrice").alias("AvgPricePerSupplier"))
supplier_avg_price.show()

# 2. Find which suppliers offer items below average price in their category.
category_avg_price = df.groupBy("Category").agg(avg("UnitPrice").alias("AvgCategoryPrice"))
category_avg_price.show()

df_with_category_avg = df.join(category_avg_price, on="Category", how="left")

from pyspark.sql.functions import col

df_deals = df_with_category_avg.withColumn(
    "IsBelowCategoryAvg", col("UnitPrice") < col("AvgCategoryPrice")
)
df_deals.select("ItemID", "ItemName", "Category", "UnitPrice", "AvgCategoryPrice", "Supplier", "IsBelowCategoryAvg").show()

# 3. Tag suppliers with Good Deal if >50% of their items are below market average.
from pyspark.sql.functions import sum as _sum, count as _count, when

supplier_deal_stats = df_deals.groupBy("Supplier").agg(
    _count("*").alias("TotalItems"),
    _sum(when(col("IsBelowCategoryAvg") == True, 1).otherwise(0)).alias("BelowAvgItems")
)
supplier_deal_tagged = supplier_deal_stats.withColumn(
    "GoodDeal",
    (col("BelowAvgItems") / col("TotalItems") > 0.5)
)
supplier_deal_tagged.show()

+---------+-------------------+
| Supplier|AvgPricePerSupplier|
+---------+-------------------+
|   AVTech|            30000.0|
|TechWorld|            70000.0|
|PrintFast|             8000.0|
| FreezeIt|            25000.0|
|  ChairCo|             6000.0|
+---------+-------------------+

+-----------+----------------+
|   Category|AvgCategoryPrice|
+-----------+----------------+
|Electronics|         36000.0|
| Appliances|         25000.0|
|  Furniture|          6000.0|
+-----------+----------------+

+------+------------+-----------+---------+----------------+---------+------------------+
|ItemID|    ItemName|   Category|UnitPrice|AvgCategoryPrice| Supplier|IsBelowCategoryAvg|
+------+------------+-----------+---------+----------------+---------+------------------+
|  I001|      LED TV|Electronics|    30000|         36000.0|   AVTech|              true|
|  I002|      Laptop|Electronics|    70000|         36000.0|TechWorld|             false|
|  I003|Office Chair|  Furniture|     6000|

In [0]:
# Scenario 3: Cost Forecasting
# Tasks:
# 1. Calculate TotalStockValue = StockQty * UnitPrice .
from pyspark.sql.functions import col

df_with_stock_value = df.withColumn(
    "TotalStockValue", col("StockQty") * col("UnitPrice")
)

df_with_stock_value.select("ItemID", "ItemName", "StockQty", "UnitPrice", "TotalStockValue").show()

# 2. Identify top 3 highest-value items.
top_3_items = df_with_stock_value.orderBy(col("TotalStockValue").desc()).limit(3)
top_3_items.show()

# 3. Export the result as a Parquet file partitioned by Warehouse .
df_with_stock_value.write \
    .partitionBy("Warehouse") \
    .mode("overwrite") \
    .parquet("output/inventory_stock_value/")

+------+------------+--------+---------+---------------+
|ItemID|    ItemName|StockQty|UnitPrice|TotalStockValue|
+------+------------+--------+---------+---------------+
|  I001|      LED TV|      50|    30000|        1500000|
|  I002|      Laptop|      10|    70000|         700000|
|  I003|Office Chair|      40|     6000|         240000|
|  I004|Refrigerator|       5|    25000|         125000|
|  I005|     Printer|       3|     8000|          24000|
+------+------------+--------+---------+---------------+

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|TotalStockValue|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|        1500000|
|  I002|      Lapt

In [0]:
# Scenario 4: Warehouse Utilization
# Tasks:
# 1. Count items stored per warehouse.
from pyspark.sql.functions import countDistinct

items_per_warehouse = df.groupBy("Warehouse").agg(
    countDistinct("ItemID").alias("ItemCount")
)
items_per_warehouse.show()

# 2. Average stock per category in each warehouse.
from pyspark.sql.functions import avg

avg_stock_per_category_warehouse = df.groupBy("Warehouse", "Category").agg(
    avg("StockQty").alias("AvgStock")
)
avg_stock_per_category_warehouse.show()

# 3. Determine underutilized warehouses ( total stock < 100 ).
from pyspark.sql.functions import sum as _sum

total_stock_per_warehouse = df.groupBy("Warehouse").agg(
    _sum("StockQty").alias("TotalStock")
)

underutilized_warehouses = total_stock_per_warehouse.filter(col("TotalStock") < 100)
underutilized_warehouses.show()

+----------+---------+
| Warehouse|ItemCount|
+----------+---------+
|WarehouseA|        2|
|WarehouseC|        1|
|WarehouseB|        2|
+----------+---------+

+----------+-----------+--------+
| Warehouse|   Category|AvgStock|
+----------+-----------+--------+
|WarehouseB|Electronics|     6.5|
|WarehouseC| Appliances|     5.0|
|WarehouseA|  Furniture|    40.0|
|WarehouseA|Electronics|    50.0|
+----------+-----------+--------+

+----------+----------+
| Warehouse|TotalStock|
+----------+----------+
|WarehouseA|        90|
|WarehouseC|         5|
|WarehouseB|        13|
+----------+----------+



In [0]:
# Scenario 5: Delta Audit Trail
# Tasks:
# 1. Save as Delta table retail_inventory .
df.write.format("delta").mode("overwrite").saveAsTable("retail_inventory")

# 2. Update stock of 'Laptop' to 20.
spark.sql("""
UPDATE retail_inventory
SET StockQty = 20
WHERE ItemName = 'Laptop'
""").show()

# 3. Delete any item with StockQty = 0 .
spark.sql("""
DELETE FROM retail_inventory
WHERE StockQty = 0
""").show()

# 4. Run DESCRIBE HISTORY and query VERSION AS OF previous state.
spark.sql("DESCRIBE HISTORY retail_inventory").show(truncate=False)

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+

+-----------------+
|num_affected_rows|
+-----------------+
|                0|
+-----------------+

+-------+-------------------+----------------+----------------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp          |userId          |userName               

In [0]:
# Scenario 6: Alerts from Restock Logs (Join Task)
# restock_logs.csv :
# ItemID,RestockDate,QuantityAdded
# I002,2024-04-20,10
# I005,2024-04-22,5
# I001,2024-04-25,20
# Tasks:
# 1. Join with inventory table to update StockQty.
restock_df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/Coding Assessment Datasets/restock_logs.csv")
inventory_df = spark.table("retail_inventory")

joined_df = inventory_df.alias("inv") \
    .join(restock_df.alias("log"), on="ItemID", how="left")

# 2. Calculate new stock and flag RestockedRecently = true for updated items.
from pyspark.sql.functions import when, col

updated_df = joined_df.withColumn(
    "NewStockQty",
    when(col("QuantityAdded").isNotNull(), col("StockQty") + col("QuantityAdded")).otherwise(col("StockQty"))
).withColumn(
    "RestockedRecently",
    col("QuantityAdded").isNotNull()
)

updated_df.select("ItemID", "ItemName", "StockQty", "QuantityAdded", "NewStockQty", "RestockedRecently").show()

# 3. Use MERGE INTO to update in Delta.
from pyspark.sql.functions import lit
df_existing = spark.table("retail_inventory")
df_extended = df_existing.withColumn("RestockedRecently", lit(False))
df_extended.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("retail_inventory")

spark.sql("""
MERGE INTO retail_inventory AS target
USING restock_updates AS source
ON target.ItemID = source.ItemID
WHEN MATCHED THEN UPDATE SET 
    StockQty = source.NewStockQty,
    LastRestocked = current_date(),
    RestockedRecently = source.RestockedRecently
""").show()

+------+------------+--------+-------------+-----------+-----------------+
|ItemID|    ItemName|StockQty|QuantityAdded|NewStockQty|RestockedRecently|
+------+------------+--------+-------------+-----------+-----------------+
|  I002|      Laptop|      30|           10|         40|             true|
|  I001|      LED TV|      70|           20|         90|             true|
|  I005|     Printer|       8|            5|         13|             true|
|  I003|Office Chair|      40|         NULL|         40|            false|
|  I004|Refrigerator|       5|         NULL|          5|            false|
+------+------------+--------+-------------+-----------+-----------------+

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|                5|               5|               0|                0|
+-----------------+----------------+

In [0]:
# Scenario 7: Report Generation with SQL Views
# Tasks:
# 1. Create SQL view inventory_summary with: ItemName, Category, StockQty, NeedsReorder, TotalStockValue
spark.sql("""
CREATE OR REPLACE TEMP VIEW inventory_summary AS
SELECT
  ItemName,
  Category,
  StockQty,
  CASE WHEN StockQty < ReorderLevel THEN true ELSE false END AS NeedsReorder,
  (StockQty * UnitPrice) AS TotalStockValue
FROM retail_inventory
""")

spark.sql("SELECT * FROM inventory_summary").show()

# 2. Create view supplier_leaderboard sorted by average price
spark.sql("""
CREATE OR REPLACE TEMP VIEW supplier_leaderboard AS
SELECT
  Supplier,
  ROUND(AVG(UnitPrice), 2) AS AvgUnitPrice
FROM retail_inventory
GROUP BY Supplier
ORDER BY AvgUnitPrice ASC
""")

spark.sql("SELECT * FROM supplier_leaderboard").show()

+------------+-----------+--------+------------+---------------+
|    ItemName|   Category|StockQty|NeedsReorder|TotalStockValue|
+------------+-----------+--------+------------+---------------+
|      Laptop|Electronics|      40|       false|        2800000|
|      LED TV|Electronics|      90|       false|        2700000|
|Office Chair|  Furniture|      40|       false|         240000|
|Refrigerator| Appliances|       5|        true|         125000|
|     Printer|Electronics|      13|       false|         104000|
+------------+-----------+--------+------------+---------------+

+---------+------------+
| Supplier|AvgUnitPrice|
+---------+------------+
|  ChairCo|      6000.0|
|PrintFast|      8000.0|
| FreezeIt|     25000.0|
|   AVTech|     30000.0|
|TechWorld|     70000.0|
+---------+------------+



In [0]:
# Scenario 8: Advanced Filtering
# Tasks:
# 1. Use when / otherwise to categorize items:
# "Overstocked" (>2x ReorderLevel)
# "LowStock"
from pyspark.sql.functions import when, col

categorized_df = df.withColumn(
    "StockStatus",
    when(col("StockQty") > 2 * col("ReorderLevel"), "Overstocked")
    .when(col("StockQty") < col("ReorderLevel"), "LowStock")
    .otherwise("Normal")
)

categorized_df.select("ItemID", "ItemName", "StockQty", "ReorderLevel", "StockStatus").show()

# 2. Use .filter() and .where() for the same and compare.
low_stock_items = categorized_df.filter(col("StockStatus") == "LowStock")
low_stock_items.show()

low_stock_items_alt = categorized_df.where(col("StockStatus") == "LowStock")
low_stock_items_alt.show()

+------+------------+--------+------------+-----------+
|ItemID|    ItemName|StockQty|ReorderLevel|StockStatus|
+------+------------+--------+------------+-----------+
|  I001|      LED TV|      50|          20|Overstocked|
|  I002|      Laptop|      10|          15|   LowStock|
|  I003|Office Chair|      40|          10|Overstocked|
|  I004|Refrigerator|       5|          10|   LowStock|
|  I005|     Printer|       3|           5|   LowStock|
+------+------------+--------+------------+-----------+

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+-----------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|StockStatus|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+-----------+
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|   LowStock|
|  I004|Refrigerator| Appliances|WarehouseC

In [0]:
# Scenario 9: Feature Engineering
# Tasks:
# 1. Extract RestockMonth from LastRestocked .
from pyspark.sql.functions import month, date_format

# Use month name (e.g., "March")
df_with_month = df.withColumn("RestockMonth", date_format("LastRestocked", "MMMM"))
df_with_month.select("ItemID", "LastRestocked", "RestockMonth").show()

# 2. Create feature: StockAge = CURRENT_DATE - LastRestocked
from pyspark.sql.functions import datediff, current_date

df_with_age = df_with_month.withColumn(
    "StockAge", datediff(current_date(), "LastRestocked")
)

df_with_age.select("ItemID", "LastRestocked", "StockAge").show()

# 3. Bucket StockAge into: New, Moderate, Stale
from pyspark.sql.functions import when

df_bucketted = df_with_age.withColumn(
    "StockAgeCategory",
    when(col("StockAge") < 30, "New")
    .when((col("StockAge") >= 30) & (col("StockAge") <= 90), "Moderate")
    .otherwise("Stale")
)

df_bucketted.select("ItemID", "LastRestocked", "StockAge", "StockAgeCategory").show()

+------+-------------+------------+
|ItemID|LastRestocked|RestockMonth|
+------+-------------+------------+
|  I001|   2024-03-15|       March|
|  I002|   2024-04-01|       April|
|  I003|   2024-03-25|       March|
|  I004|   2024-02-20|    February|
|  I005|   2024-03-30|       March|
+------+-------------+------------+

+------+-------------+--------+
|ItemID|LastRestocked|StockAge|
+------+-------------+--------+
|  I001|   2024-03-15|     461|
|  I002|   2024-04-01|     444|
|  I003|   2024-03-25|     451|
|  I004|   2024-02-20|     485|
|  I005|   2024-03-30|     446|
+------+-------------+--------+

+------+-------------+--------+----------------+
|ItemID|LastRestocked|StockAge|StockAgeCategory|
+------+-------------+--------+----------------+
|  I001|   2024-03-15|     461|           Stale|
|  I002|   2024-04-01|     444|           Stale|
|  I003|   2024-03-25|     451|           Stale|
|  I004|   2024-02-20|     485|           Stale|
|  I005|   2024-03-30|     446|           S

In [0]:
# Scenario 10: Export Options
# Tasks:
data = [
    {"Item": "item1", "StockAgeCategory": "Fresh", "Warehouse": "A"},
    {"Item": "item2", "StockAgeCategory": "Stale", "Warehouse": "B"},
    {"Item": "item3", "StockAgeCategory": "Stale", "Warehouse": "A"},
    {"Item": "item4", "StockAgeCategory": "Fresh", "Warehouse": "B"}
]
final_df = spark.createDataFrame(data)
# 1. Write full DataFrame to:
# CSV for analysts
final_df.write \
  .mode("overwrite") \
  .option("header", True) \
  .csv("/export/inventory/csv_full/")

# JSON for integration
final_df.write \
  .mode("overwrite") \
  .json("/export/inventory/json_full/")

# Delta for pipelines
final_df.write \
  .format("delta") \
  .mode("overwrite") \
  .save("/export/inventory/delta_full/")


# 2. Save with meaningful file and partition names like /export/inventory/stale_items/
stale_df = final_df.filter(final_df.StockAgeCategory == "Stale")

stale_df.write \
  .mode("overwrite") \
  .partitionBy("Warehouse") \
  .parquet("/export/inventory/stale_items/")