# DAY 5 â€“ Delta Lake Advanced Operations (Incremental Loads)

## Learn
- Time travel (version history)
- MERGE operations (upserts)
- OPTIMIZE & ZORDER
- VACUUM for cleanup

## Table Schema (events_delta)

- event_time (timestamp)
- event_type (string)
- product_id (int)
- category_id (bigint)
- category_code (string)
- brand (string)
- price (double)
- user_id (int)
- user_session (string)

In [0]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F

delta_table = DeltaTable.forName(spark, "events_delta")

## Load October CSV (Initial Load)

In [0]:
oct_events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    header=True,
    inferSchema=True
)

oct_events = oct_events.withColumn(
    "event_time",
    F.to_timestamp("event_time")
)

oct_events.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("events_delta")

## Load November CSV (Incremental Data)

In [0]:
nov_events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    header=True,
    inferSchema=True
)

nov_events = nov_events.withColumn(
    "event_time",
    F.to_timestamp("event_time")
)

nov_events.show(5)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:01|      view|   1004775|2053013555631882655|electronics.smart...|xiaomi

## MERGE (UPSERT) November Data

Composite Key:
- user_session
- event_time
- event_type

In [0]:
delta_table.alias("t").merge(
    nov_events.alias("s"),
    """
    t.user_session = s.user_session
    AND t.event_time = s.event_time
    AND t.event_type = s.event_type
    """
).whenMatchedUpdateAll() \
 .whenNotMatchedInsertAll() \
 .execute()

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

## Delta History & Time Travel

In [0]:
display(spark.sql("DESCRIBE HISTORY events_delta"))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
5,2026-01-13T15:35:28.000Z,76783030137163,sreekavya198@gmail.com,VACUUM END,Map(status -> COMPLETED),,List(3092126651394337),0113-152955-d8n0iznv-v2n,4.0,SnapshotIsolation,True,"Map(numDeletedFiles -> 0, numVacuumedDirectories -> 1)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
4,2026-01-13T15:35:27.000Z,76783030137163,sreekavya198@gmail.com,VACUUM START,"Map(retentionCheckEnabled -> true, defaultRetentionMillis -> 604800000, specifiedRetentionMillis -> 604800000)",,List(3092126651394337),0113-152955-d8n0iznv-v2n,3.0,SnapshotIsolation,True,"Map(numFilesToDelete -> 0, sizeOfDataToDelete -> 0)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
3,2026-01-13T15:35:17.000Z,76783030137163,sreekavya198@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> false, clusterBy -> [], zOrderBy -> [""event_type"",""user_id""], batchId -> 0)",,List(3092126651394337),0113-152955-d8n0iznv-v2n,2.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 111, numRemovedBytes -> 1899446986, p25FileSize -> 69397775, numDeletionVectorsRemoved -> 0, minFileSize -> 50555389, numAddedFiles -> 26, maxFileSize -> 91950859, p75FileSize -> 81339114, p50FileSize -> 76094549, numAddedBytes -> 1948428958)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
2,2026-01-13T15:34:29.000Z,76783030137163,sreekavya198@gmail.com,MERGE,"Map(predicate -> [""(((user_session#13838 = user_session#13775) AND (event_time#13830 = event_time#13819)) AND (event_type#13831 = event_type#13768))""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> true, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(3092126651394337),0113-152955-d8n0iznv-v2n,1.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 68, numTargetBytesAdded -> 1171886398, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 94332, materializeSourceTimeMs -> 46463, numTargetRowsInserted -> 67501979, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 27228, numTargetRowsUpdated -> 0, numOutputRows -> 67501979, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 67501979, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 20465)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2026-01-13T15:32:16.000Z,76783030137163,sreekavya198@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(3092126651394337),0113-152955-d8n0iznv-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 43, numRemovedFiles -> 16, numRemovedBytes -> 1947232774, numDeletionVectorsRemoved -> 0, numOutputRows -> 42448764, numOutputBytes -> 727560588)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2026-01-12T14:21:25.000Z,76783030137163,sreekavya198@gmail.com,CREATE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(4031764343762533),0112-141809-8bbm01wz-v2n,,WriteSerializable,True,"Map(numFiles -> 16, numOutputRows -> 109950743, numOutputBytes -> 1947232774)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13


In [0]:
spark.sql(
    "SELECT COUNT(*) FROM events_delta VERSION AS OF 0"
).show()

+---------+
| COUNT(*)|
+---------+
|109950743|
+---------+



## OPTIMIZE & VACUUM

In [0]:
spark.sql("OPTIMIZE events_delta ZORDER BY (event_type, user_id)")
spark.sql("VACUUM events_delta RETAIN 168 HOURS")

DataFrame[path: string]

## Key Takeaways
- MERGE enables reliable monthly incremental loads
- Composite keys replace missing event_id
- Time travel supports audit and rollback
- OPTIMIZE and VACUUM improve performance