In [0]:
# Volumes roots
ROOT   = "/Volumes/tabular/dataexpert/benchmarking_capstone"
BRONZE = f"{ROOT}"
SILVER = f"{ROOT}/silver"

# Bronze inputs
TX_BRONZE_PATH   = f"{BRONZE}/raw_transactions_daily"
CUST_BRONZE_PATH = f"{BRONZE}/raw_customers"

# Silver outputs
TX_SILVER_PATH   = f"{SILVER}/transactions_clean"
CUST_SILVER_PATH = f"{SILVER}/customers_clean"

# --- Widgets for flexibility ---
dbutils.widgets.dropdown("WRITE_MODE", "append", ["append","overwrite_full","overwrite_range"])
dbutils.widgets.text("DATE_FROM", "")   # e.g. 2024-03-01 (inclusive)
dbutils.widgets.text("DATE_TO", "")     # e.g. 2024-04-01 (exclusive); leave blank for open-ended

WRITE_MODE = dbutils.widgets.get("WRITE_MODE")
DATE_FROM  = dbutils.widgets.get("DATE_FROM").strip() or None
DATE_TO    = dbutils.widgets.get("DATE_TO").strip() or None

# Shuffle for 60M+ scale; tune if needed
spark.conf.set("spark.sql.shuffle.partitions", "1600")


In [0]:
from pyspark.sql import functions as F

cust_bronze = spark.read.format("delta").load(CUST_BRONZE_PATH)

cust_silver = (
    cust_bronze
      .dropDuplicates(["customer_id"])
      .filter(F.col("customer_id").isNotNull())
)

# Atomic full refresh of customers (dimension table is small)
(cust_silver.write
   .format("delta")
   .mode("overwrite")
   .save(CUST_SILVER_PATH))

print("✅ Wrote customers →", CUST_SILVER_PATH)
print("Rows:", spark.read.format("delta").load(CUST_SILVER_PATH).count())


✅ Wrote customers → /Volumes/tabular/dataexpert/benchmarking_capstone/silver/customers_clean
Rows: 200000


In [0]:
if WRITE_MODE == "append":
    # ✅ Normal daily ingestion
    (tx_silver_df.write
       .format("delta")
       .mode("append")
       .partitionBy("ingest_day")
       .save(TX_SILVER_PATH))
    print("✅ APPEND complete →", TX_SILVER_PATH)

elif WRITE_MODE == "overwrite_full":
    # ✅ Atomic full refresh of entire table (same path, no rm/mv)
    (tx_silver_df.write
       .format("delta")
       .mode("overwrite")
       .partitionBy("ingest_day")
       .save(TX_SILVER_PATH))
    print("✅ FULL OVERWRITE complete →", TX_SILVER_PATH)

elif WRITE_MODE == "overwrite_range":
    # ✅ Replace only the specified window (DATE_FROM/DATE_TO must define the range you’re writing)
    if not DATE_FROM and not DATE_TO:
        raise ValueError("overwrite_range requires DATE_FROM and/or DATE_TO")
    # Build Delta predicate string to match what we filtered
    pred = []
    if DATE_FROM: pred.append(f"ingest_day >= '{DATE_FROM}'")
    if DATE_TO:   pred.append(f"ingest_day < '{DATE_TO}'")
    replace_where = " AND ".join(pred)

    # Filter the DataFrame to match the replaceWhere condition
    filtered_df = tx_silver_df.filter(replace_where)

    (filtered_df.write
       .format("delta")
       .mode("overwrite")
       .option("replaceWhere", replace_where)
       .partitionBy("ingest_day")
       .save(TX_SILVER_PATH))
    print(f"✅ PARTITION OVERWRITE where {replace_where} → {TX_SILVER_PATH}")

else:
    raise ValueError(f"Unknown WRITE_MODE: {WRITE_MODE}")

✅ PARTITION OVERWRITE where ingest_day >= '2024-01-11' AND ingest_day < '2024-05-31' → /Volumes/tabular/dataexpert/benchmarking_capstone/silver/transactions_clean


In [0]:
tx_silver = spark.read.format("delta").load(TX_SILVER_PATH)
cust_silver = spark.read.format("delta").load(CUST_SILVER_PATH)

print("Silver tx rows:", tx_silver.count())
print("Silver cust rows:", cust_silver.count())

# Partition health
display(tx_silver.groupBy("ingest_day").count().orderBy("ingest_day"))

# Basic stats (useful to spot anomalies)
display(tx_silver.select("qty","price","amount").summary("count","mean","stddev","min","25%","50%","75%","max"))


Silver tx rows: 270000000
Silver cust rows: 200000


ingest_day,count
2024-01-11,1000000
2024-01-16,1000000
2024-01-31,1000000
2024-03-01,1000000
2024-03-02,1000000
2024-03-03,1000000
2024-03-04,1500000
2024-03-05,1500000
2024-03-06,1500000
2024-03-07,1500000


summary,qty,price,amount
count,270000000.0,270000000.0,270000000.0
mean,3.4988466444444444,20.932150134442605,73.23765674844637
stddev,1.7087857024454165,5.994443161305232,42.71738401180422
min,1.0,12.18,12.18
25%,2.0,15.64,38.28
50%,3.0,20.09,66.5
75%,5.0,25.79,99.09
max,6.0,33.12,198.72


In [0]:
%sql
select max(ingest_day) from cust_bronze;
select max(ingest_day) from cust_silver

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-5221284682473625>, line 1[0m
[0;32m----> 1[0m get_ipython()[38;5;241m.[39mrun_cell_magic([38;5;124m'[39m[38;5;124msql[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mselect max(ingest_day) from cust_bronze;[39m[38;5;130;01m\n[39;00m[38;5;124mselect max(ingest_day) from cust_silver[39m[38;5;130;01m\n[39;00m[38;5;124m'[39m)

File [0;32m/databricks/python/lib/python3.11/site-packages/IPython/core/interactiveshell.py:2493[0m, in [0;36mInteractiveShell.run_cell_magic[0;34m(self, magic_name, line, cell)[0m
[1;32m   2491[0m [38;5;28;01mwith[39;00m [38;5;28mself[39m[38;5;241m.[39mbuiltin_trap:
[1;32m   2492[0m     args [38;5;241m=[39m (magic_arg_s, cell)
[0;32m-> 2493[0m     result [38;5;241m=[39m fn([38;5;241m*[39margs, [3