### Objective

In this notebook, I have explored **Advanced Delta Lake** capabilities by implementing incremental data updates, tracking historical changes, optimizing query performance, and managing storage cleanup.

### Load Source Data

In [0]:
# Source data: the raw e-commerce transactions table registered in the metastore.
events = spark.read.table("default.ecommerce_transactions")

### Create Managed Delta Table

In [0]:
# Persist the source data as a managed Delta table.
# mode("overwrite") replaces the table contents on every run of this cell.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("events_table")
)

In [0]:
# Preview a handful of rows from the newly written Delta table.
display(spark.read.table("events_table").limit(5))

Transaction_ID,User_Name,Age,Country,Product_Category,Purchase_Amount,Payment_Method,Transaction_Date
1,Ava Hall,63,Mexico,Clothing,780.69,Debit Card,2023-04-14
2,Sophia Hall,59,India,Beauty,738.56,PayPal,2023-07-30
3,Elijah Thompson,26,France,Books,178.34,Credit Card,2023-09-17
4,Elijah White,43,Mexico,Sports,401.09,UPI,2023-06-21
5,Ava Harris,48,Germany,Beauty,594.83,Net Banking,2024-10-29


In [0]:
# Re-bind `events` to the managed Delta table and inspect its column types.
events = spark.read.table("events_table")
events.printSchema()

root
 |-- Transaction_ID: long (nullable = true)
 |-- User_Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Country: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Purchase_Amount: double (nullable = true)
 |-- Payment_Method: string (nullable = true)
 |-- Transaction_Date: date (nullable = true)



### Create Incremental Updates DataFrame

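The cell below simulates an incremental batch that contains two kinds of rows:

- Existing `Transaction_ID`s (1001, 1002) → matched by the MERGE and **updated**
- New `Transaction_ID`s (999001, 999002) → not matched and **inserted**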

In [0]:
from pyspark.sql import Row
from pyspark.sql import functions as F

# Simulated incremental batch for the upsert into events_table:
#   - 1001 and 1002 are meant to match existing rows   -> UPDATE
#   - 999001 and 999002 are new Transaction_IDs        -> INSERT
updates_data = [
    Row(1001, "Rahul_UPDATED", 29, "India", "Electronics", 1200.50, "Credit Card", "2026-01-10"),
    Row(1002, "Anita_UPDATED", 34, "USA", "Fashion", 250.75, "Debit Card", "2026-01-10"),
    Row(999001, "New_User_1", 26, "Canada", "Books", 89.99, "UPI", "2026-01-11"),
    Row(999002, "New_User_2", 41, "UK", "Home Appliances", 560.00, "Credit Card", "2026-01-11")
]

# (column name, Spark SQL type) pairs mirroring the events_table schema
# (long / string / double). The types are declared explicitly instead of
# being inferred from the Python literals, so the batch cannot silently drift
# away from the target schema (for example if an amount is written as `560`
# rather than `560.00`). Transaction_Date is loaded as STRING and converted
# to DATE below.
update_fields = [
    ("Transaction_ID", "BIGINT"),
    ("User_Name", "STRING"),
    ("Age", "BIGINT"),
    ("Country", "STRING"),
    ("Product_Category", "STRING"),
    ("Purchase_Amount", "DOUBLE"),
    ("Payment_Method", "STRING"),
    ("Transaction_Date", "STRING"),
]

# Plain list of column names, kept for any cell that refers to `columns`.
columns = [name for name, _ in update_fields]

# DDL-formatted schema string, e.g. "Transaction_ID BIGINT, User_Name STRING, ...".
updates_schema = ", ".join(f"{name} {dtype}" for name, dtype in update_fields)

updates_df = (
    spark.createDataFrame(updates_data, schema=updates_schema)
    .withColumn("Transaction_Date", F.to_date("Transaction_Date"))
)

display(updates_df)
updates_df.printSchema()


Transaction_ID,User_Name,Age,Country,Product_Category,Purchase_Amount,Payment_Method,Transaction_Date
1001,Rahul_UPDATED,29,India,Electronics,1200.5,Credit Card,2026-01-10
1002,Anita_UPDATED,34,USA,Fashion,250.75,Debit Card,2026-01-10
999001,New_User_1,26,Canada,Books,89.99,UPI,2026-01-11
999002,New_User_2,41,UK,Home Appliances,560.0,Credit Card,2026-01-11


### Incremental MERGE (Upserts)

`Transaction_ID` is used as the unique business key for matching rows in the incremental batch (source) against rows in `events_table` (target).

In [0]:
from delta.tables import DeltaTable

# Handle to the managed Delta table that receives the upsert.
deltaTable = DeltaTable.forName(spark, "events_table")

# Build the upsert: target rows ("t") are matched to batch rows ("s") on
# Transaction_ID. Matched rows take every column from the batch; unmatched
# batch rows are inserted as new rows.
upsert = (
    deltaTable.alias("t")
    .merge(updates_df.alias("s"), "t.Transaction_ID = s.Transaction_ID")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
)

# Run the MERGE; kept as the last expression so the cell shows its result.
upsert.execute()

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

### Verifying that the MERGE is idempotent and does not create duplicates

In [0]:
# The two new Transaction_IDs from the batch should each appear exactly once.
display(
    spark.read.table("events_table").where("Transaction_ID IN (999001, 999002)")
)

Transaction_ID,User_Name,Age,Country,Product_Category,Purchase_Amount,Payment_Method,Transaction_Date
999002,New_User_2,41,UK,Home Appliances,560.0,Credit Card,2026-01-11
999001,New_User_1,26,Canada,Books,89.99,UPI,2026-01-11


In [0]:
# The two pre-existing Transaction_IDs should now carry the values from the batch.
display(
    spark.read.table("events_table").where("Transaction_ID IN (1001, 1002)")
)

Transaction_ID,User_Name,Age,Country,Product_Category,Purchase_Amount,Payment_Method,Transaction_Date
1001,Rahul_UPDATED,29,India,Electronics,1200.5,Credit Card,2026-01-10
1002,Anita_UPDATED,34,USA,Fashion,250.75,Debit Card,2026-01-10


### Time Travel (Version History)

Check the Delta transaction log to see which operations were committed to the table and when each one occurred.

In [0]:
%sql
-- List the commits recorded for events_table (one row per table version),
-- including the operation type and its metrics.
DESCRIBE HISTORY events_table;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
11,2026-01-13T04:06:55.000Z,76787938685907,saitejaswikondapally@gmail.com,MERGE,"Map(predicate -> [""(Transaction_ID#14219L = Transaction_ID#14196L)""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> true, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(1772620888573772),0113-034239-elud51y1-v2n,10.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 4, numTargetBytesAdded -> 8829, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 1, numTargetRowsMatchedUpdated -> 2, executionTimeMs -> 5913, materializeSourceTimeMs -> 398, numTargetRowsInserted -> 2, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 2369, numTargetRowsUpdated -> 2, numOutputRows -> 4, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 4, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 2970)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
10,2026-01-13T03:48:21.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1772620888573772),0113-034239-elud51y1-v2n,9.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
9,2026-01-13T03:46:34.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1772620888573772),0113-034239-elud51y1-v2n,8.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 499384, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
8,2026-01-12T14:39:35.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,7.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 499384, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 499384)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
7,2026-01-12T14:38:38.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,6.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 499384, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 499384)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
6,2026-01-12T14:38:25.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,5.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 499384)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
5,2026-01-12T14:35:22.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,4.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
4,2026-01-12T14:30:09.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,3.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
3,2026-01-12T14:29:51.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,2.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
2,2026-01-12T13:08:57.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13


### Time Travel: Viewing Record Before MERGE (Version 10)


In [0]:
from delta.tables import DeltaTable

# Find the most recent MERGE in the table history rather than hard-coding its
# version number: every re-run of the overwrite and MERGE cells adds new
# versions, so a fixed number stops pointing at "the version right before the
# MERGE" (and does not exist at all on a freshly created table).
events_history = DeltaTable.forName(spark, "events_table").history()
last_merge = (
    events_history
    .filter("operation = 'MERGE'")
    .orderBy("version", ascending=False)
    .select("version")
    .first()
)
if last_merge is None:
    raise ValueError(
        "events_table has no MERGE in its history; run the upsert cell first"
    )

# The snapshot immediately preceding the MERGE commit.
pre_merge_version = last_merge["version"] - 1

# Show the record as it looked before the upsert changed it.
# The interpolated value is an integer taken from the table history.
display(
    spark.sql(
        f"SELECT * FROM events_table VERSION AS OF {pre_merge_version} "
        "WHERE Transaction_ID = 1001"
    )
)

Transaction_ID,User_Name,Age,Country,Product_Category,Purchase_Amount,Payment_Method,Transaction_Date
1001,Olivia Thompson,67,Australia,Clothing,961.19,Debit Card,2023-12-26


### OPTIMIZE & ZORDER (Performance)

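`OPTIMIZE` compacts the small files left behind by the MERGE into a single larger file (here: 5 files removed, 1 file added), and `ZORDER BY (Transaction_ID)` co-locates rows by the key used for lookups and merges.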

In [0]:
%sql
-- Compact the table's data files and Z-order them by Transaction_ID, the
-- column used by the MERGE predicate and the lookup filters in this notebook.
OPTIMIZE events_table
ZORDER BY (Transaction_ID);


path,metrics
,"List(1, 5, List(661401, 661401, 661401.0, 1, 661401), List(2155, 460193, 93804.4, 5, 469022), 0, List(minCubeSize(107374182400), List(0, 0), List(5, 469022), 0, List(5, 469022), 1, null), null, 0, 1, 5, 0, false, 0, 0, 1768278725284, 1768278728090, 8, 1, null, List(1, 2), null, 8, 8, 609, 0, null)"


### VACUUM (Storage Cleanup)

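Removes data files that are no longer referenced by the current table version and are older than the retention threshold (7 days by default). Time travel keeps working only for versions inside that retention window. In this run no files were deleted, because every unreferenced file is still newer than the threshold.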

In [0]:
%sql
-- Clean up data files that the table no longer references.
-- No RETAIN clause is given, so the table's default retention period applies.
VACUUM events_table;


path


#### Re-check Version History After OPTIMIZE and VACUUM

In [0]:
%sql
-- Re-list the commit history to confirm that the OPTIMIZE and VACUUM
-- operations above were recorded as new table versions.
DESCRIBE HISTORY events_table;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
14,2026-01-13T04:33:15.000Z,76787938685907,saitejaswikondapally@gmail.com,VACUUM END,Map(status -> COMPLETED),,List(1772620888573772),0113-034239-elud51y1-v2n,13.0,SnapshotIsolation,True,"Map(numDeletedFiles -> 0, numVacuumedDirectories -> 1)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
13,2026-01-13T04:33:14.000Z,76787938685907,saitejaswikondapally@gmail.com,VACUUM START,"Map(retentionCheckEnabled -> true, defaultRetentionMillis -> 604800000)",,List(1772620888573772),0113-034239-elud51y1-v2n,12.0,SnapshotIsolation,True,"Map(numFilesToDelete -> 0, sizeOfDataToDelete -> 0)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
12,2026-01-13T04:32:08.000Z,76787938685907,saitejaswikondapally@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> false, clusterBy -> [], zOrderBy -> [""Transaction_ID""], batchId -> 0)",,List(1772620888573772),0113-034239-elud51y1-v2n,11.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 5, numRemovedBytes -> 469022, p25FileSize -> 661401, numDeletionVectorsRemoved -> 1, minFileSize -> 661401, numAddedFiles -> 1, maxFileSize -> 661401, p75FileSize -> 661401, p50FileSize -> 661401, numAddedBytes -> 661401)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
11,2026-01-13T04:06:55.000Z,76787938685907,saitejaswikondapally@gmail.com,MERGE,"Map(predicate -> [""(Transaction_ID#14219L = Transaction_ID#14196L)""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> true, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(1772620888573772),0113-034239-elud51y1-v2n,10.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 4, numTargetBytesAdded -> 8829, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 1, numTargetRowsMatchedUpdated -> 2, executionTimeMs -> 5913, materializeSourceTimeMs -> 398, numTargetRowsInserted -> 2, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 2369, numTargetRowsUpdated -> 2, numOutputRows -> 4, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 4, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 2970)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
10,2026-01-13T03:48:21.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1772620888573772),0113-034239-elud51y1-v2n,9.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
9,2026-01-13T03:46:34.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1772620888573772),0113-034239-elud51y1-v2n,8.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 499384, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
8,2026-01-12T14:39:35.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,7.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 499384, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 499384)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
7,2026-01-12T14:38:38.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,6.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 499384, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 499384)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
6,2026-01-12T14:38:25.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,5.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 499384)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
5,2026-01-12T14:35:22.000Z,76787938685907,saitejaswikondapally@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(367210358311360),0112-130823-elv6wqmg-v2n,4.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 460193, numDeletionVectorsRemoved -> 0, numOutputRows -> 50000, numOutputBytes -> 460193)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
