In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

# 1. Setup
# Make sure the target schema exists, then drop any tables left over from a
# previous run so the writes further down start from a clean slate.
for setup_statement in (
    "CREATE SCHEMA IF NOT EXISTS supply_chain_opt",
    "DROP TABLE IF EXISTS supply_chain_opt.inventory",
    "DROP TABLE IF EXISTS supply_chain_opt.vendor_logs",
):
    spark.sql(setup_statement)

# 2. Generate Inventory Data (100k rows)
# Reproducibility: every random column gets its own fixed seed, and the range is
# pinned to a fixed partition count. Spark's seeded rand() also depends on how
# rows are laid out across partitions, so both are needed for a re-run to yield
# the same data. Distinct seeds per column keep the columns from being driven by
# the same random sequence (which would make them perfectly correlated).
INVENTORY_SEED = 42
INVENTORY_PARTITIONS = 8

inventory_df = spark.range(1, 100001, numPartitions=INVENTORY_PARTITIONS).select(
    F.col("id").alias("product_id"),
    F.concat(F.lit("Item_"), F.col("id")).alias("product_name"),
    # Whole units in 0..999.
    F.floor(F.rand(seed=INVENTORY_SEED) * 1000).alias("current_stock"),
    # Whole units in 50..249.
    F.floor(F.rand(seed=INVENTORY_SEED + 1) * 200 + 50).alias("reorder_point"),
    # Between 10.00 and 510.00, rounded to 2 decimals.
    F.round(F.rand(seed=INVENTORY_SEED + 2) * 500 + 10, 2).alias("unit_cost"),
    # Whole days in 2..21.
    F.floor(F.rand(seed=INVENTORY_SEED + 3) * 20 + 2).alias("lead_time_days"),
)

# 3. Generate Vendor Logs (100k rows)
# Reproducibility: every random column gets its own fixed seed, and the range is
# pinned to a fixed partition count. Spark's seeded rand() also depends on how
# rows are laid out across partitions, so both are needed for a re-run to yield
# the same data.
VENDOR_LOGS_SEED = 1042
VENDOR_LOGS_PARTITIONS = 8

vendor_logs_df = (
    spark.range(1, 100001, numPartitions=VENDOR_LOGS_PARTITIONS)
    .select(
        F.concat(F.lit("ORD_"), F.col("id")).alias("order_id"),
        # Vendor number in 1..50.
        F.concat(
            F.lit("VND_"),
            F.floor(F.rand(seed=VENDOR_LOGS_SEED) * 50 + 1),
        ).alias("vendor_id"),
        # Random day within 2025 (offset 0..364 from Jan 1st).
        # date_add() needs an INT day count but floor() yields BIGINT, hence the cast.
        F.date_add(
            F.to_date(F.lit("2025-01-01")),
            F.floor(F.rand(seed=VENDOR_LOGS_SEED + 1) * 365).cast("int"),
        ).alias("order_date"),
        # Whole units in 10..509.
        F.floor(F.rand(seed=VENDOR_LOGS_SEED + 2) * 500 + 10).alias("quantity_ordered"),
        # Score between 0.60 and 1.00, rounded to 2 decimals.
        F.round(F.rand(seed=VENDOR_LOGS_SEED + 3) * 0.4 + 0.6, 2).alias("quality_score"),
    )
    # delivery_date is derived from order_date, so it has to be added after the
    # select that creates order_date. Delivery lands 2..16 days after the order;
    # the INT cast is needed for the same reason as above.
    .withColumn(
        "delivery_date",
        F.date_add(
            F.col("order_date"),
            F.floor(F.rand(seed=VENDOR_LOGS_SEED + 4) * 15 + 2).cast("int"),
        ),
    )
)

# 4. Save to Delta Tables
# Persist each generated DataFrame under its table name, replacing any existing
# contents. (No explicit format is given, so the tables are Delta only if that
# is the workspace's default table format.)
for generated_df, target_table in (
    (inventory_df, "supply_chain_opt.inventory"),
    (vendor_logs_df, "supply_chain_opt.vendor_logs"),
):
    generated_df.write.mode("overwrite").saveAsTable(target_table)

print("Success! 100,000 rows generated with correct INT types for all date operations.")

Success! 100,000 rows generated with correct INT types for all date operations.


In [0]:
# Preview 10 rows of the saved inventory table. No ordering is applied, so which
# 10 rows come back is arbitrary.
inventory_preview = spark.table("supply_chain_opt.inventory").limit(10)
display(inventory_preview)

product_id,product_name,current_stock,reorder_point,unit_cost,lead_time_days
37501,Item_37501,106,223,36.06,13
37502,Item_37502,613,100,14.81,16
37503,Item_37503,298,140,193.1,5
37504,Item_37504,792,140,464.02,14
37505,Item_37505,122,199,11.7,8
37506,Item_37506,964,201,258.47,10
37507,Item_37507,246,170,50.63,10
37508,Item_37508,68,225,51.94,21
37509,Item_37509,564,137,498.54,14
37510,Item_37510,732,191,360.12,11


In [0]:
# Sanity check: total number of rows written to the inventory table.
inventory_row_count = spark.sql("SELECT count(*) FROM supply_chain_opt.inventory")
display(inventory_row_count)

count(*)
100000


In [0]:
# Preview 10 rows of the saved vendor_logs table. No ordering is applied, so
# which 10 rows come back is arbitrary.
vendor_logs_preview = spark.table("supply_chain_opt.vendor_logs").limit(10)
display(vendor_logs_preview)

order_id,vendor_id,order_date,quantity_ordered,quality_score,delivery_date
ORD_75001,VND_26,2025-05-12,427,0.79,2025-05-27
ORD_75002,VND_40,2025-02-12,247,0.84,2025-02-24
ORD_75003,VND_28,2025-11-03,43,0.73,2025-11-18
ORD_75004,VND_12,2025-10-08,314,1.0,2025-10-19
ORD_75005,VND_10,2025-12-26,249,0.8,2026-01-03
ORD_75006,VND_16,2025-05-27,497,0.6,2025-06-09
ORD_75007,VND_2,2025-01-23,91,0.64,2025-02-03
ORD_75008,VND_10,2025-11-19,14,0.64,2025-11-25
ORD_75009,VND_22,2025-06-08,304,0.82,2025-06-23
ORD_75010,VND_49,2025-05-15,243,0.99,2025-05-29


In [0]:
# Sanity check: total number of rows written to the vendor_logs table.
vendor_logs_row_count = spark.sql("SELECT count(*) FROM supply_chain_opt.vendor_logs")
display(vendor_logs_row_count)

count(*)
100000
