In [0]:
# Load the raw invoices parquet file and keep only the columns used in this notebook.
df = (
    spark.read
    .parquet("abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_201_99457.parquet")
    .select("customer_id", "category", "price", "quantity", "invoice_date")
)

In [0]:
# Show a five-row sample of the projected invoices DataFrame.
display(
    df.limit(5)
)

customer_id,category,price,quantity,invoice_date
201,Clothing,900.24,3,2021-07-04
202,Clothing,900.24,3,2022-01-14
203,Toys,35.84,1,2022-02-20
204,Clothing,1500.4,5,2022-06-18
205,Souvenir,35.19,3,2022-04-27


In [0]:
# Inflate the dataset by repeatedly unioning it with itself until it holds more
# than `expected_rows` rows, so the later Z-order comparison runs on a big table.
df_union = df
expected_rows = 20_000_000  # 20,000,000

# Count once up front. union() keeps duplicates (UNION ALL semantics), so every
# self-union exactly doubles the row count; tracking it arithmetically avoids
# launching a full Spark count job over an ever-growing plan on each iteration.
row_count = df_union.count()
if row_count == 0:
    # Doubling zero rows never reaches the target; fail fast instead of looping forever.
    raise ValueError("Source DataFrame is empty; cannot grow it to the expected row count.")

while row_count <= expected_rows:
    df_union = df_union.union(df_union)
    row_count *= 2
    print(f"count: {row_count}")

print(f"final count: {row_count}")

count: 198514
count: 397028
count: 794056
count: 1588112
count: 3176224
count: 6352448
count: 12704896
count: 25409792
final count: 25409792


In [0]:
# Persist the inflated DataFrame as a managed table, replacing any previous contents.
(
    df_union.write
    .mode("overwrite")
    .saveAsTable("deltacatalog.deltadb.zorder_ex1")
)

In [0]:
%%time
spark.sql(
    """
    SELECT category,
        SUM(price * quantity) as total_sales
    FROM deltacatalog.deltadb.zorder_ex1
    WHERE customer_id = 201
    GROUP BY category
    """
)

CPU times: user 3.18 ms, sys: 510 µs, total: 3.69 ms
Wall time: 785 ms


DataFrame[category: string, total_sales: double]

In [0]:
%sql
-- Compact the table's files and Z-order them on customer_id.
OPTIMIZE deltacatalog.deltadb.zorder_ex1 ZORDER BY (customer_id);

In [0]:
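# Equivalent Python (DeltaTable) API for the SQL `OPTIMIZE ... ZORDER BY` statement.
# Kept for reference only; not executed in this notebook.
# from delta.tables import DeltaTable

# table = DeltaTable.forName(spark, "deltacatalog.deltadb.zorder_ex1")
# table.optimize().executeZOrderBy("customer_id")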

In [0]:
%%time
spark.sql(
    """
    SELECT category,
        SUM(price * quantity) as total_sales
    FROM deltacatalog.deltadb.zorder_ex1
    WHERE customer_id = 201
    GROUP BY category
    """
)

CPU times: user 2.24 ms, sys: 345 µs, total: 2.58 ms
Wall time: 144 ms


DataFrame[category: string, total_sales: double]

In [0]:
# Write the original invoices as a table partitioned by invoice_date,
# replacing any previous contents.
(
    df.write
    .mode("overwrite")
    .partitionBy("invoice_date")
    .saveAsTable("deltacatalog.deltadb.zorder_ex2")
)

- Hive-style partition column: `invoice_date`
- ZORDER BY column: `customer_id`
- Z-ordering on `customer_id` is applied separately within each `invoice_date` partition

In [0]:
import pyspark.sql.functions as F
# Take the rows dated 2023-01-01 and re-stamp them with 2025-05-04 so that they
# land in a brand-new invoice_date partition when appended.
df_new_partition = (
    df
    .filter(F.col("invoice_date") == "2023-01-01")
    .withColumn("invoice_date", F.to_date(F.lit("2025-05-04")))
)

In [0]:
# Show a five-row sample of the rows destined for the new partition.
display(
    df_new_partition.limit(5)
)

customer_id,category,price,quantity,invoice_date
986,Clothing,1500.4,5,2025-05-04
1127,Souvenir,58.65,5,2025-05-04
3271,Shoes,1800.51,3,2025-05-04
3485,Technology,4200.0,4,2025-05-04
4648,Clothing,1200.32,4,2025-05-04


In [0]:
# Append the re-dated rows to the partitioned table, adding a new invoice_date partition.
(
    df_new_partition.write
    .mode("append")
    .partitionBy("invoice_date")
    .saveAsTable("deltacatalog.deltadb.zorder_ex2")
)

In [0]:
%sql
-- Latest invoice_date currently present in the partitioned table.
SELECT
  MAX(invoice_date)
FROM
  deltacatalog.deltadb.zorder_ex2;

max(invoice_date)
2025-05-04


In [0]:
%sql
-- Z-order only yesterday's partition instead of rewriting the whole table.
-- A %sql cell performs no Python string formatting, so the date must be computed
-- in SQL rather than written as a '{...}' placeholder.
-- To target the demo partition appended earlier in this notebook, use the literal
-- '2025-05-04' in place of the date_sub(...) expression.
OPTIMIZE deltacatalog.deltadb.zorder_ex2
WHERE invoice_date = date_sub(current_date(), 1)
ZORDER BY (customer_id);