Data Preparation

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, expr, current_timestamp

spark = SparkSession.builder.getOrCreate()

# `df` is lazy and is re-evaluated by every action that consumes it. With an
# unseeded rand() each evaluation draws fresh values, so tables written from
# the same `df` would hold different data. Fixed seeds keep the generated
# columns repeatable across evaluations and across notebook runs (this
# presumes spark.range partitions the ids the same way each time).
# The two seeds differ on purpose: an identical seed would give both columns
# the same underlying random draw and make them perfectly correlated.
USER_ID_SEED = 42
EVENT_TYPE_SEED = 43

# Synthetic event log: 1,000,000 rows with
#   id         - 0..999_999 from spark.range
#   user_id    - pseudo-random int in [0, 10_000)
#   event_time - query-time "now" plus (id % 100000) seconds
#   event_type - pseudo-random int in [0, 100)
df = (
    spark.range(0, 1_000_000)
    .withColumn("user_id", (rand(seed=USER_ID_SEED) * 10_000).cast("int"))
    # Build the timestamp from epoch seconds via timestamp_seconds rather than
    # by adding an interval to a timestamp.
    # NOTE(review): current_timestamp() is resolved when a query runs, so the
    # base instant moves between separate actions on `df` - confirm that is
    # acceptable for this demo.
    .withColumn("event_time", expr("timestamp_seconds(id % 100000 + unix_timestamp(current_timestamp()))"))
    .withColumn("event_type", (rand(seed=EVENT_TYPE_SEED) * 100).cast("int"))
)

Catalog and Schema Setup

In [0]:
# Make sure the demo catalog and schema exist, then select them as the
# session defaults. The statements run in the listed order.
setup_statements = (
    "CREATE CATALOG IF NOT EXISTS lakeflow_demo",
    "CREATE SCHEMA IF NOT EXISTS lakeflow_demo.bronze",
    "USE CATALOG lakeflow_demo",
    "USE SCHEMA bronze",
)

for statement in setup_statements:
    spark.sql(statement)

Save Partitioned Table (Managed)

In [0]:
# Persist the synthetic events as a Delta table partitioned on event_type,
# replacing any previous contents of the table.
partitioned_writer = (
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("event_type")
)
partitioned_writer.saveAsTable("events_partitioned")

Save ZORDER Table (Managed) and Optimize

In [0]:
# Use one fully-qualified name for both the write and the OPTIMIZE so that
# the two statements always address the same table, regardless of which
# catalog/schema happens to be current in the session.
ZORDER_TABLE = "lakeflow_demo.bronze.events_zorder"

# Write the events as a plain (unpartitioned) Delta table, replacing any
# previous contents.
(
    df.write.format("delta")
    .mode("overwrite")
    .saveAsTable(ZORDER_TABLE)
)

# Compact the table and Z-order its files on the two columns.
spark.sql(f"OPTIMIZE {ZORDER_TABLE} ZORDER BY (user_id, event_time)")


Save Liquid Clustering Table (Managed) and Enable Clustering

In [0]:
# Use one fully-qualified name for the write, the ALTER TABLE and the
# OPTIMIZE so that all three always address the same table, regardless of
# which catalog/schema happens to be current in the session.
LIQUID_TABLE = "lakeflow_demo.bronze.events_liquid_clustered"

# Write the events as a plain (unpartitioned) Delta table, replacing any
# previous contents.
(
    df.write.format("delta")
    .mode("overwrite")
    .saveAsTable(LIQUID_TABLE)
)

# Declare the clustering columns on the existing table.
spark.sql(f"ALTER TABLE {LIQUID_TABLE} CLUSTER BY (user_id, event_time)")

# Run OPTIMIZE FULL after declaring the clustering columns.
# NOTE(review): presumably FULL is what reclusters the rows written before
# CLUSTER BY was set - confirm against the runtime version in use.
spark.sql(f"OPTIMIZE {LIQUID_TABLE} FULL")


Benchmark Full Table Scan

In [0]:
import time

def benchmark(table_name, query_template="SELECT COUNT(*) FROM {table_name}"):
    """Time one query against ``table_name`` and report the elapsed seconds.

    Args:
        table_name: Name of the table to query; substituted into
            ``query_template``.
        query_template: SQL text containing a ``{table_name}`` placeholder.
            Defaults to a row count.
            NOTE(review): a bare COUNT(*) may be answered from table
            metadata rather than by reading data files - confirm, and pass a
            different template if a real scan is what should be measured.

    Returns:
        Elapsed time in seconds, as a float. The same figure is also printed.
    """
    query = query_template.format(table_name=table_name)

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock is
    # adjusted.
    start = time.perf_counter()
    spark.sql(query).collect()  # collect() forces the query to execute
    elapsed = time.perf_counter() - start

    print(f"{table_name} scanned in {elapsed:.2f} seconds")
    return elapsed

# Fully-qualified names of the three table layouts to compare.
tables = [
    "lakeflow_demo.bronze.events_partitioned",
    "lakeflow_demo.bronze.events_zorder",
    "lakeflow_demo.bronze.events_liquid_clustered",
]

# Time each layout in turn; benchmark() prints one line per table.
for table_name in tables:
    benchmark(table_name)