In [0]:
# DAILY SALES TREND

from pyspark.sql.functions import sum, count, countDistinct, col

# ------------------------------
# READ ONLY REQUIRED COLUMNS
# ------------------------------
# Project just the three columns the daily aggregation below consumes.
fact_sales_df = spark.table(
    "real_time_projects.ecommerce_historical.fact_sales"
).select("order_id", "order_date", "revenue")

# ------------------------
# TRANSFORM & AGGREGATE
# ------------------------
# No explicit repartition("order_date") before the aggregation:
# groupBy("order_date") already plans its own shuffle on that key, and an
# up-front repartition would move every raw fact row across the cluster
# before partial aggregation has a chance to shrink the data.
#
# Output: one row per order_date with
#   daily_revenue - sum of revenue
#   daily_orders  - exact number of distinct order_id values
#   daily_items   - number of fact rows for the date
#                   (presumably one row per order line -- TODO confirm)
gold_daily_sales_df = (
    fact_sales_df
    .groupBy("order_date")
    .agg(
        sum("revenue").alias("daily_revenue"),
        countDistinct("order_id").alias("daily_orders"),
        count("*").alias("daily_items"),
    )
    # Sort the (small) aggregated result chronologically.
    .orderBy("order_date")
)

# -------------------------
# WRITE GOLD DELTA TABLE
# -------------------------
# Full refresh: the table is overwritten (schema included) on every run.
# NOTE(review): gold_daily_sales_df is grouped by order_date, so partitioning
# the table by order_date yields one row per partition -- confirm that this
# layout is intended.
daily_sales_writer = (
    gold_daily_sales_df.write
    .format("delta")
    .partitionBy("order_date")
    .option("overwriteSchema", "true")
    .mode("overwrite")
)
daily_sales_writer.saveAsTable(
    "real_time_projects.ecommerce_historical.gold_daily_sales"
)

# ---------------------------------------------
# DELTA OPTIMIZATION: ENABLE AUTO OPTIMIZE
# ---------------------------------------------
# Sets the optimizeWrite and autoCompact table properties on the gold table.
# NOTE(review): this statement runs after the write above, so presumably the
# properties only influence subsequent writes -- confirm.
daily_auto_optimize_sql = """
ALTER TABLE real_time_projects.ecommerce_historical.gold_daily_sales
SET TBLPROPERTIES (
  delta.autoOptimize.optimizeWrite = true,
  delta.autoOptimize.autoCompact = true
)
"""
spark.sql(daily_auto_optimize_sql)

In [0]:
# MONTHLY SALES TREND

from pyspark.sql.functions import sum, count, countDistinct, col, year, month, approx_count_distinct

# -------------------------------
# READ ONLY REQUIRED COLUMNS
# -------------------------------
# Project just the three columns the monthly aggregation below consumes.
fact_sales_df = spark.table(
    "real_time_projects.ecommerce_historical.fact_sales"
).select("order_id", "order_date", "revenue")

# ------------------------
# TRANSFORM & AGGREGATE
# ------------------------
# No explicit repartition("order_date") before the aggregation: the grouping
# keys here are (year, month), so data hashed by order_date would have to be
# shuffled again for the groupBy -- the extra repartition only adds a full
# shuffle of the raw fact rows.
#
# Output: one row per (year, month) with
#   monthly_revenue - sum of revenue
#   monthly_orders  - estimated number of distinct order_id values
# NOTE(review): approx_count_distinct returns an estimate (PySpark's default
# rsd is 0.05), whereas the daily table in this notebook uses the exact
# countDistinct -- confirm the approximation is acceptable for this table.
gold_monthly_sales_df = (
    fact_sales_df
    .withColumn("year", year("order_date"))
    .withColumn("month", month("order_date"))
    .groupBy("year", "month")
    .agg(
        sum("revenue").alias("monthly_revenue"),
        approx_count_distinct("order_id").alias("monthly_orders"),
    )
    # Sort the (small) aggregated result chronologically.
    .orderBy("year", "month")
)

# --------------------------
# WRITE GOLD DELTA TABLE
# --------------------------
# Full refresh: the table is overwritten (schema included) on every run.
# NOTE(review): gold_monthly_sales_df is grouped by (year, month), so
# partitioning by the same columns yields one row per partition -- confirm
# that this layout is intended.
monthly_sales_writer = (
    gold_monthly_sales_df.write
    .format("delta")
    .partitionBy("year", "month")
    .option("overwriteSchema", "true")
    .mode("overwrite")
)
monthly_sales_writer.saveAsTable(
    "real_time_projects.ecommerce_historical.gold_monthly_sales"
)

# ---------------------------------------------
# DELTA OPTIMIZATION: ENABLE AUTO OPTIMIZE
# ---------------------------------------------
# Sets the optimizeWrite and autoCompact table properties on the gold table.
# NOTE(review): this statement runs after the write above, so presumably the
# properties only influence subsequent writes -- confirm.
monthly_auto_optimize_sql = """
ALTER TABLE real_time_projects.ecommerce_historical.gold_monthly_sales
SET TBLPROPERTIES (
  delta.autoOptimize.optimizeWrite = true,
  delta.autoOptimize.autoCompact = true
)
"""
spark.sql(monthly_auto_optimize_sql)