In [0]:
%python

from pyspark.sql import functions as F
from pyspark.sql import types as T

# ---------------- 1) Load & bound dimension sizes early ----------------
# Row caps per dimension. The cross join further down multiplies these
# together, so keep them small: 500 * 500 * 50 * 10 * 120 is already
# 15 billion combinations.
# NOTE: limit() is applied without an ordering, so which rows are kept is not
# guaranteed to be the same from run to run.
MAX_CUSTOMERS = 500
MAX_PRODUCTS = 500
MAX_STORES = 50
MAX_PAYMENTS = 10
MAX_DATES = 120

# Inclusive order-date window.
ORDER_DATE_FROM = "2026-01-01"
ORDER_DATE_TO = "2026-12-31"

cust = (
    spark.table("dev.silver.dim_customer")
    .select("customer_id")
    .limit(MAX_CUSTOMERS)
)

prod = (
    spark.table("dev.silver.dim_product")
    .select(
        "product_id",
        # Normalise the price to decimal(18,2) once, at the source.
        F.col("price").cast("decimal(18,2)").alias("list_price"),
    )
    .limit(MAX_PRODUCTS)
)

store = (
    spark.table("dev.silver.dim_store")
    .select("store_id")
    .limit(MAX_STORES)
)

pay = (
    spark.table("dev.silver.dim_payments")
    .select("payment_id")
    .limit(MAX_PAYMENTS)
)

# Parenthesised chain instead of backslash continuations: the original ended
# with a dangling "\" that continued into a whitespace-only line.
dt = (
    spark.table("dev.silver.dim_date")
    .select(F.col("date").alias("order_date"))
    .where(
        (F.col("order_date") >= F.to_date(F.lit(ORDER_DATE_FROM)))
        & (F.col("order_date") <= F.to_date(F.lit(ORDER_DATE_TO)))
    )
    .limit(MAX_DATES)
)

# Optional: adjust shuffle partitions to match cluster cores
# spark.conf.set("spark.sql.shuffle.partitions", "200")

# ---------------- 2) CROSS JOIN with probabilistic thinning ----------------
# Avoid a global sort: keep each combination independently with probability p
# (rand() < p), then cap the result with LIMIT.
#
# p is derived from target_rows rather than hard-coded. A fixed p = 0.002 on
# dimensions at their caps (500 * 500 * 50 * 10 * 120 = 15 billion
# combinations) lets ~30 million rows through, and LIMIT then keeps only the
# first target_rows of them in scan order -- so the output clusters on
# whichever customers/products happen to be produced first instead of
# spreading over the whole product.
#
# Trade-off: with a small p every combination has to be evaluated (LIMIT no
# longer cuts the scan short). Lower the dimension caps if this is too slow.
target_rows = 1000  # desired (approximate) number of generated rows

# The dimension frames are row-capped by limit() above, so counting is cheap.
total_combinations = (
    cust.count() * prod.count() * store.count() * pay.count() * dt.count()
)

# Expected survivors = p * total_combinations ~= target_rows.
# min() keeps p a valid probability when the product is smaller than the
# target; an empty dimension makes the product empty, so p is irrelevant then.
p = min(1.0, target_rows / total_combinations) if total_combinations else 0.0

# A frame produced by limit() typically sits in a single partition, which
# would run the whole cross join as one task. Spreading the streamed side
# lets the scan run in parallel; tune to the cluster's core count.
scan_partitions = 64

cross_df = (
    cust.repartition(scan_partitions)
        .crossJoin(prod.hint("broadcast"))
        .crossJoin(store.hint("broadcast"))
        .crossJoin(pay.hint("broadcast"))
        .crossJoin(dt.hint("broadcast"))
    # Thin BEFORE measure computation to reduce work
    .where(F.rand() < p)
)

# LIMIT is now only a hard upper bound: thinning already lands close to
# target_rows, so at most a small tail is trimmed. Expect roughly target_rows
# rows (binomial noise around the target), never more.
cross_sampled = cross_df.limit(target_rows)

# ---------------- 3) Derive identifiers & measures (vectorized ops) ----------------
MAX_QUANTITY = 4      # quantity is floor(rand() * MAX_QUANTITY) + 1, i.e. 1..MAX_QUANTITY
PRICE_JITTER = 0.10   # effective price = list price * (1 +/- PRICE_JITTER / 2)
DISCOUNT_RATE = 0.05  # flat discount applied to gross_amount
TAX_RATE = 0.18       # tax applied to net_amount
MONEY = "decimal(18,2)"  # money type declared by the target table

sales_tx = (
    cross_sampled
        # One order per customer per day; lines of the same customer/day share it.
        .withColumn(
            "order_id",
            F.concat_ws("|", F.col("customer_id"), F.date_format(F.col("order_date"), "yyyy-MM-dd")),
        )
        .withColumn("order_line_id", F.expr("uuid()"))  # unique per run, no sort required
        .withColumn("quantity", (F.floor(F.rand() * MAX_QUANTITY) + F.lit(1)).cast("int"))
        .withColumn("unit_price", F.col("list_price").cast(MONEY))
        # decimal * double yields double, so round and cast back to the money type.
        .withColumn(
            "unit_price_effective",
            F.round(F.col("unit_price") * (1 + (F.rand() - 0.5) * PRICE_JITTER), 2).cast(MONEY),
        )
        .withColumn("gross_amount", F.round(F.col("quantity") * F.col("unit_price_effective"), 2).cast(MONEY))
        .withColumn("discount_amount", F.round(F.col("gross_amount") * F.lit(DISCOUNT_RATE), 2).cast(MONEY))
        .withColumn("net_amount", F.round(F.col("gross_amount") - F.col("discount_amount"), 2).cast(MONEY))
        .withColumn("tax_amount", F.round(F.col("net_amount") * F.lit(TAX_RATE), 2).cast(MONEY))
        .withColumn("total_amount", F.round(F.col("net_amount") + F.col("tax_amount"), 2).cast(MONEY))
        .select(
            "order_line_id", "order_id",
            # The target table declares the keys as STRING and order_date as
            # DATE. The dimension column types are not visible here, so pin
            # them explicitly instead of relying on the dims to match (the
            # casts are no-ops when the types already agree).
            F.col("customer_id").cast("string").alias("customer_id"),
            F.col("product_id").cast("string").alias("product_id"),
            F.col("store_id").cast("string").alias("store_id"),
            F.col("payment_id").cast("string").alias("payment_id"),
            F.col("order_date").cast("date").alias("order_date"),
            "quantity", "unit_price", "unit_price_effective",
            "gross_amount", "discount_amount", "net_amount", "tax_amount", "total_amount"
        )
)


# ---------------- 4) Append to a Delta table with partitioning ----------------
# Single source of truth for the target, used by both the DDL and the write.
TARGET_TABLE = "dev.silver.sales_transactions_tbl"

# Partition by order_date to improve write/read performance.
# CREATE TABLE IF NOT EXISTS is a no-op when the table already exists, so this
# DDL does not alter an existing table.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {TARGET_TABLE} (
  order_line_id STRING,
  order_id STRING,
  customer_id STRING,
  product_id STRING,
  store_id STRING,
  payment_id STRING,
  order_date DATE,
  quantity INT,
  unit_price DECIMAL(18,2),
  unit_price_effective DECIMAL(18,2),
  gross_amount DECIMAL(18,2),
  discount_amount DECIMAL(18,2),
  net_amount DECIMAL(18,2),
  tax_amount DECIMAL(18,2),
  total_amount DECIMAL(18,2)
) USING DELTA
PARTITIONED BY (order_date)
""")

# mergeSchema is deliberately NOT enabled: the schema is fixed by the DDL
# above, and schema merging would let an upstream change silently add columns
# to the table instead of failing the write.
(
    sales_tx
    # All rows of a given order_date land in the same task, so each table
    # partition receives a single file per write instead of many small ones.
    .repartition(F.col("order_date"))
    .write
    .format("delta")
    .mode("append")
    .saveAsTable(TARGET_TABLE)
)

# ---------------- 5) Optional post-write maintenance (Delta Lake) ----------------
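# If the table grows, run OPTIMIZE + ZORDER periodically (not every batch).
# order_date is the partition column and cannot be used in ZORDER BY; Z-order on data columns only.
# spark.sql("OPTIMIZE dev.silver.sales_transactions_tbl ZORDER BY (product_id, customer_id)")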


In [0]:
-- Show the contents of the table the generator cell appends to.
SELECT *
FROM dev.silver.sales_transactions_tbl