In [0]:
from pyspark.sql.functions import sum, count, countDistinct, col

# ------------------------------
# READ ONLY REQUIRED COLUMNS
# ------------------------------
# Project the sales fact table down to the four columns that the
# aggregation step consumes: the two grouping keys, the order key and
# the revenue measure.
fact_sales_df = spark.table(
    "real_time_projects.ecommerce_historical.fact_sales"
).select("product_id", "product_category_name", "order_id", "revenue")

# ------------------------
# TRANSFORM & AGGREGATE
# ------------------------
# One output row per (product_id, product_category_name) pair with:
#   total_revenue - sum of `revenue` over the group
#   total_orders  - number of distinct `order_id` values in the group
#   total_items   - number of input rows in the group
# `sum` and `count` here are the pyspark.sql.functions imports from the top
# of the cell, not the Python builtins.
gold_top_products_df = (
    fact_sales_df.groupBy("product_id", "product_category_name")
    .agg(
        sum("revenue").alias("total_revenue"),
        countDistinct("order_id").alias("total_orders"),
        count("*").alias("total_items"),
    )
    # Highest revenue first.
    # NOTE(review): this DataFrame is subsequently written to a partitioned
    # table; confirm the sort is still needed, since readers of the table
    # should not rely on stored row order.
    .orderBy("total_revenue", ascending=False)
)

# -------------------------
# WRITE GOLD DELTA TABLE
# -------------------------
# Replace the gold table with the freshly aggregated result. The table is
# stored as Delta, partitioned by product category, and `overwriteSchema`
# lets the overwrite replace the existing table schema as well as the data.
gold_top_products_df.write.saveAsTable(
    "real_time_projects.ecommerce_historical.gold_top_products",
    format="delta",
    mode="overwrite",
    partitionBy="product_category_name",
    overwriteSchema="true",
)

# ---------------------
# DELTA OPTIMIZATION
# ---------------------

# ----------------------
# ENABLE AUTO OPTIMIZE
# ----------------------
# Set the two `delta.autoOptimize.*` table properties on the gold table.
# This runs after the table has been written, so the properties are in
# place for writes that happen after this statement.
enable_auto_optimize_sql = """
ALTER TABLE real_time_projects.ecommerce_historical.gold_top_products
SET TBLPROPERTIES (
  delta.autoOptimize.optimizeWrite = true,
  delta.autoOptimize.autoCompact = true
)
"""
spark.sql(enable_auto_optimize_sql)

# -------------------------------------------
# Z-ORDER FOR FAST FILTERING BY product_id
# -------------------------------------------
# Run OPTIMIZE on the gold table with Z-ordering on `product_id`, the
# column that lookups on this table are expected to filter by.
zorder_by_product_sql = """
OPTIMIZE real_time_projects.ecommerce_historical.gold_top_products
ZORDER BY (product_id)
"""
spark.sql(zorder_by_product_sql)

In [0]:
%sql
-- Preview the ten highest-revenue products in the gold table.
-- Without an explicit ORDER BY, LIMIT returns an arbitrary set of rows that
-- can differ between runs; stored row order is not guaranteed on read.
SELECT *
FROM real_time_projects.ecommerce_historical.gold_top_products
ORDER BY total_revenue DESC
LIMIT 10;