In [0]:
# Volumes paths
bronze_root = "/Volumes/tabular/dataexpert/benchmarking_capstone"
tx_path     = f"{bronze_root}/raw_transactions_daily"   # single Delta table, daily partitions
cust_path   = f"{bronze_root}/raw_customers"

# sizes
CUSTOMER_ROWS  = 200_000

from pyspark.sql import functions as F

# ---- customers builder (same as before) ----
def make_customers(n_rows: int):
    states = F.array(*[F.lit(s) for s in ["CA","TX","NY","FL","WA","IL","GA","NC","PA","OH"]])
    seg = (F.when(F.rand(31) < 0.2,"VIP")
             .when(F.rand(32) < 0.6,"LOYAL")
             .otherwise("CASUAL"))
    df = (spark.range(1, n_rows+1).toDF("customer_id")
            .withColumn("signup_date", F.expr("date_add(to_date('2023-01-01'), cast(rand(41)*365 as int))"))
            .withColumn("segment", seg)
            .withColumn("state", F.element_at(states, (F.rand(33)*10 + 1).cast("int")))
            .withColumn("city", F.concat(F.lit("City_"), F.floor(F.rand(34)*500))))
    return df

# write customers ONCE (overwrite is fine here)
cust_df = make_customers(CUSTOMER_ROWS)
(cust_df.write.mode("overwrite").format("delta").save(cust_path))
print("Wrote customers ->", cust_path)


Wrote customers -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_customers


In [0]:
from pyspark.sql import functions as F
from datetime import date, timedelta

# transactions builder that pins all rows to a single 'ingest_day'
def make_transactions_for_day(n_rows: int, customer_upper: int, ingest_day: str):
    """
    n_rows: number of tx for that day
    ingest_day: 'YYYY-MM-DD' partition (arrival date)
    """
    # Spread event timestamps within the day; link to customers
    df = (spark.range(0, n_rows).toDF("txn_id")
            .withColumn("store_id", (F.rand(7)*500 + 1).cast("int"))
            .withColumn("customer_id", (F.rand(11)*customer_upper + 1).cast("int"))
            .withColumn("sku", (F.rand(13)*90000 + 10000).cast("int"))
            .withColumn("qty", (F.rand(17)*6 + 1).cast("int"))
            .withColumn("price", F.round(F.exp(F.rand(19)*1.0 + 2.5), 2))
            # random minute in the day
            .withColumn("minutes_offset", (F.rand(23)*24*60).cast("int"))
            .withColumn("ts", F.expr(f"timestampadd(MINUTE, minutes_offset, to_timestamp('{ingest_day} 00:00:00'))"))
            .drop("minutes_offset")
            .withColumn("amount", F.col("qty")*F.col("price"))
            .withColumn("ingest_day", F.lit(ingest_day))  # partition column
         )
    return df

def write_daily_batch(n_rows:int, ingest_day:str):
    df = make_transactions_for_day(n_rows=n_rows, customer_upper=CUSTOMER_ROWS, ingest_day=ingest_day)
    # APPEND and PARTITION BY day → new files per day, Delta log tracks versions
    (df.write
       .format("delta")
       .mode("append")
       .partitionBy("ingest_day")
       .save(tx_path))
    print(f"Appended {n_rows:,} rows for day={ingest_day} -> {tx_path}")


In [0]:


ROWS_PER_DAY = 1_500_000   # adjust up/down for final volume
days = [f"2024-06-{d:02d}" for d in range(4, 32)] + \
       [f"2024-07-{d:02d}" for d in range(1, 31)] + \
       [f"2024-08-{d:02d}" for d in range(1, 31)]

for d in days:
    write_daily_batch(ROWS_PER_DAY, d)



Appended 1,500,000 rows for day=2024-06-04 -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_transactions_daily
Appended 1,500,000 rows for day=2024-06-05 -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_transactions_daily
Appended 1,500,000 rows for day=2024-06-06 -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_transactions_daily
Appended 1,500,000 rows for day=2024-06-07 -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_transactions_daily
Appended 1,500,000 rows for day=2024-06-08 -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_transactions_daily
Appended 1,500,000 rows for day=2024-06-09 -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_transactions_daily
Appended 1,500,000 rows for day=2024-06-10 -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_transactions_daily
Appended 1,500,000 rows for day=2024-06-11 -> /Volumes/tabular/dataexpert/benchmarking_capstone/raw_transactions_daily
Appended 1,500,000 rows for day=2024-06-12 -> /V

In [0]:
tx = spark.read.format("delta").load(tx_path)

print("Total rows:", tx.count())
display(tx.groupBy("ingest_day").count().orderBy("ingest_day"))

# Check that all customer_ids are valid (no orphans)
cust = spark.read.format("delta").load(cust_path).select("customer_id")
orphans = tx.join(cust, on="customer_id", how="left_anti").count()
print("Orphan transactions:", orphans)

# Show Delta history (should see APPEND writes)
display(spark.sql(f"DESCRIBE HISTORY delta.`{tx_path}`"))


Total rows: 270000000


ingest_day,count
2024-01-11,1000000
2024-01-16,1000000
2024-01-31,1000000
2024-03-01,1000000
2024-03-02,1000000
2024-03-03,1000000
2024-03-04,1500000
2024-03-05,1500000
2024-03-06,1500000
2024-03-07,1500000


Orphan transactions: 0


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
181,2025-08-16T02:58:51Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,180.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27317715)",,Databricks-Runtime/15.4.x-scala2.12
180,2025-08-16T02:58:47Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,179.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27318412)",,Databricks-Runtime/15.4.x-scala2.12
179,2025-08-16T02:58:43Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,178.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27317704)",,Databricks-Runtime/15.4.x-scala2.12
178,2025-08-16T02:58:39Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,177.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27318413)",,Databricks-Runtime/15.4.x-scala2.12
177,2025-08-16T02:58:35Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,176.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27317670)",,Databricks-Runtime/15.4.x-scala2.12
176,2025-08-16T02:58:31Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,175.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27318410)",,Databricks-Runtime/15.4.x-scala2.12
175,2025-08-16T02:58:27Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,174.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27317702)",,Databricks-Runtime/15.4.x-scala2.12
174,2025-08-16T02:58:23Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,173.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27318404)",,Databricks-Runtime/15.4.x-scala2.12
173,2025-08-16T02:58:18Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,172.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27318410)",,Databricks-Runtime/15.4.x-scala2.12
172,2025-08-16T02:58:14Z,78016171715504,rohithkumar0955@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""ingest_day""])",,List(1323301528906842),0120-034800-q4ixh8v8,171.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1500000, numOutputBytes -> 27317703)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
cust = spark.read.format("delta").load(cust_path)

print("Customer rows:", cust.count())
print("Distinct customer_id:", cust.select("customer_id").distinct().count())

display(cust.orderBy("customer_id").limit(5))
display(cust.groupBy("segment").count().orderBy(F.desc("count")))
display(cust.groupBy("state").count().orderBy(F.desc("count")))


Customer rows: 200000
Distinct customer_id: 200000


customer_id,signup_date,segment,state,city
1,2023-12-13,CASUAL,WA,City_285
2,2023-11-15,CASUAL,PA,City_352
3,2023-07-19,CASUAL,CA,City_26
4,2023-06-16,LOYAL,GA,City_19
5,2023-05-17,VIP,TX,City_206


segment,count
LOYAL,96042
CASUAL,63949
VIP,40009


state,count
GA,20199
WA,20187
FL,20167
PA,20144
TX,20011
CA,19963
NC,19951
IL,19883
OH,19785
NY,19710


In [0]:
tx = spark.read.format("delta").load(tx_path).select("customer_id","ingest_day","amount","ts")

# Orphans (tx with no matching customer)
orphans = (tx.join(cust.select("customer_id"), on="customer_id", how="left_anti")).count()
print("Orphan transactions:", orphans)

# Basic joined sample
joined = tx.join(cust, "customer_id", "left")
display(joined.select("customer_id","segment","state","amount","ingest_day","ts").limit(10))

# Useful distributions for later KPIs
display(joined.groupBy("segment").agg(F.count("*").alias("txn_cnt"),
                                      F.sum("amount").alias("revenue")).orderBy(F.desc("revenue")))


Orphan transactions: 0


customer_id,segment,state,amount,ingest_day,ts
73093,LOYAL,GA,66.57000000000001,2024-03-01,2024-03-01T09:27:00Z
186252,LOYAL,WA,107.15,2024-03-01,2024-03-01T11:40:00Z
140693,CASUAL,OH,31.15,2024-03-01,2024-03-01T07:01:00Z
32719,VIP,OH,29.16,2024-03-01,2024-03-01T08:00:00Z
1266,LOYAL,FL,30.4,2024-03-01,2024-03-01T09:33:00Z
91772,CASUAL,PA,46.64,2024-03-01,2024-03-01T07:42:00Z
44595,LOYAL,WA,27.62,2024-03-01,2024-03-01T00:49:00Z
188066,VIP,IL,66.48,2024-03-01,2024-03-01T21:45:00Z
65399,LOYAL,GA,38.22,2024-03-01,2024-03-01T10:20:00Z
86407,LOYAL,TX,36.93,2024-03-01,2024-03-01T17:56:00Z


segment,txn_cnt,revenue
LOYAL,66218674,4850496454.019671
CASUAL,44136764,3231684651.4396777
VIP,27644562,2024527583.259989
