Define Paths (Once)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp

spark = SparkSession.builder.getOrCreate()

base_path = "/Volumes/main/default/sales_volume/delta/events"
updates_path = "/Volumes/main/default/sales_volume/delta/incoming_updates"


Implement Incremental MERGE 
Step 1.1: Create Base Delta Table (Initial Load)

In [0]:
data = [
    ("sess_1", "click", 101, "2024-01-01 10:00:00"),
    ("sess_2", "view", 102, "2024-01-01 10:05:00"),
    ("sess_3", "purchase", 103, "2024-01-01 10:10:00")
]

columns = ["user_session", "event_type", "user_id", "event_time"]

df = spark.createDataFrame(data, columns) \
    .withColumn("event_time", to_timestamp("event_time"))

df.write.format("delta") \
    .mode("overwrite") \
    .save(base_path)


In [0]:
spark.read.format("delta").load(base_path).printSchema()


root
 |-- user_id: long (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- user_session: string (nullable = true)



Create Incremental Update Data

In [0]:
updates_data = [
    # UPDATE existing row
    ("sess_2", "click", 102, "2024-01-01 10:05:00"),

    # INSERT new row
    ("sess_4", "view", 104, "2024-01-01 10:15:00")
]

updates_df = spark.createDataFrame(updates_data, columns) \
    .withColumn("event_time", to_timestamp("event_time"))

updates_df.write \
    .format("delta") \
    .mode("overwrite") \
    .save(updates_path)


In [0]:
spark.read.format("delta").load(updates_path).printSchema()


root
 |-- user_session: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- event_time: timestamp (nullable = true)



Perform Incremental MERGE (UPSERT)

In [0]:
from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, base_path)

updates = spark.read.format("delta").load(updates_path)

deltaTable.alias("t").merge(
    updates.alias("s"),
    "t.user_session = s.user_session AND t.event_time = s.event_time"
).whenMatchedUpdateAll() \
 .whenNotMatchedInsertAll() \
 .execute()


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
spark.read.format("delta").load(base_path).show()


+-------+----------+-------------------+------------+
|user_id|event_type|         event_time|user_session|
+-------+----------+-------------------+------------+
|    101|     click|2024-01-01 10:00:00|      sess_1|
|    103|  purchase|2024-01-01 10:10:00|      sess_3|
|    102|     click|2024-01-01 10:05:00|      sess_2|
|    104|      view|2024-01-01 10:15:00|      sess_4|
+-------+----------+-------------------+------------+



View Delta History

In [0]:
%sql
DESCRIBE HISTORY delta.`/Volumes/main/default/sales_volume/delta/events`


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
8,2026-01-13T11:57:03.000Z,77363117283784,testwest1221@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(3212715136256620),0113-115049-hqootdsd-v2n,7.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 2720, p25FileSize -> 1420, numDeletionVectorsRemoved -> 1, minFileSize -> 1420, numAddedFiles -> 1, maxFileSize -> 1420, p75FileSize -> 1420, p50FileSize -> 1420, numAddedBytes -> 1420)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
7,2026-01-13T11:57:00.000Z,77363117283784,testwest1221@gmail.com,MERGE,"Map(predicate -> [""((user_session#13670 = user_session#13671) AND (event_time#13669 = event_time#13674))""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(3212715136256620),0113-115049-hqootdsd-v2n,6.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 1345, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 1, numTargetRowsMatchedUpdated -> 1, executionTimeMs -> 5199, materializeSourceTimeMs -> 4, numTargetRowsInserted -> 1, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 2721, numTargetRowsUpdated -> 1, numOutputRows -> 2, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 2, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 2313)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
6,2026-01-13T11:55:46.000Z,77363117283784,testwest1221@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,List(3212715136256620),0113-115049-hqootdsd-v2n,5.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 1350, numDeletionVectorsRemoved -> 0, numOutputRows -> 3, numOutputBytes -> 1375)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
5,2026-01-13T10:58:12.000Z,77363117283784,testwest1221@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,List(3212715136256620),0113-103913-4cqnptgd-v2n,4.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 1350, numDeletionVectorsRemoved -> 0, numOutputRows -> 3, numOutputBytes -> 1350)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
4,2026-01-13T10:54:22.000Z,77363117283784,testwest1221@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,List(3212715136256620),0113-103913-4cqnptgd-v2n,3.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 1350, numDeletionVectorsRemoved -> 0, numOutputRows -> 3, numOutputBytes -> 1350)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
3,2026-01-13T10:50:54.000Z,77363117283784,testwest1221@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,List(3212715136256620),0113-103913-4cqnptgd-v2n,2.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 1350, numDeletionVectorsRemoved -> 0, numOutputRows -> 3, numOutputBytes -> 1350)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
2,2026-01-13T10:44:56.000Z,77363117283784,testwest1221@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,List(3212715136256620),0113-103913-4cqnptgd-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 1350, numDeletionVectorsRemoved -> 0, numOutputRows -> 3, numOutputBytes -> 1350)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2026-01-13T10:44:05.000Z,77363117283784,testwest1221@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,List(3212715136256620),0113-103913-4cqnptgd-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 1350, numDeletionVectorsRemoved -> 0, numOutputRows -> 3, numOutputBytes -> 1350)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2026-01-13T10:43:39.000Z,77363117283784,testwest1221@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,List(3212715136256620),0113-103913-4cqnptgd-v2n,,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 3, numOutputBytes -> 1350)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13


: Query Old Version (Before MERGE)

In [0]:
# Version 0 (before MERGE)
spark.read.format("delta") \
    .option("versionAsOf", 0) \
    .load(base_path) \
    .show()


+-------+----------+-------------------+------------+
|user_id|event_type|         event_time|user_session|
+-------+----------+-------------------+------------+
|      1|     login|2024-01-01 10:00:00|          A1|
|      2|     click|2024-01-01 10:05:00|          A2|
|      3|    logout|2024-01-01 10:10:00|          A3|
+-------+----------+-------------------+------------+



- Optimize Tables (OPTIMIZE + ZORDER)
Register Table in Unity Catalog

In [0]:
%sql
CREATE TABLE IF NOT EXISTS main.default.events_table
USING DELTA
AS
SELECT *
FROM delta.`/Volumes/main/default/sales_volume/delta/events`


num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM main.default.events_table;


user_id,event_type,event_time,user_session
101,click,2024-01-01T10:00:00.000Z,sess_1
103,purchase,2024-01-01T10:10:00.000Z,sess_3
102,click,2024-01-01T10:05:00.000Z,sess_2
104,view,2024-01-01T10:15:00.000Z,sess_4


OPTIMIZE with ZORDER

In [0]:
%sql
OPTIMIZE main.default.events_table
ZORDER BY (event_type, user_id)


path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 1420), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1768305794465, 1768305795260, 8, 0, null, List(0, 0), null, 4, 4, 0, 0, null)"


Run VACUUM (Safe Retention)

In [0]:
%sql
VACUUM main.default.events_table RETAIN 168 HOURS


path


In [0]:
%sql
DESCRIBE HISTORY main.default.events_table


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2026-01-13T12:03:43.000Z,77363117283784,testwest1221@gmail.com,VACUUM END,Map(status -> COMPLETED),,List(3212715136256620),0113-115049-hqootdsd-v2n,1.0,SnapshotIsolation,True,"Map(numDeletedFiles -> 0, numVacuumedDirectories -> 1)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2026-01-13T12:03:42.000Z,77363117283784,testwest1221@gmail.com,VACUUM START,"Map(retentionCheckEnabled -> true, defaultRetentionMillis -> 604800000, specifiedRetentionMillis -> 604800000)",,List(3212715136256620),0113-115049-hqootdsd-v2n,0.0,SnapshotIsolation,True,"Map(numFilesToDelete -> 0, sizeOfDataToDelete -> 0)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2026-01-13T12:00:06.000Z,77363117283784,testwest1221@gmail.com,CREATE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(3212715136256620),0113-115049-hqootdsd-v2n,,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 4, numOutputBytes -> 1420)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
