In [0]:

# Step 1: Load raw events CSV
events = (spark.read
          .option("header", True)
          .option("inferSchema", True)
          .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv"))


In [0]:
# Step 2: Deduplicate data
deduped_events = events.dropDuplicates(["event_time", "user_id", "product_id"])

In [0]:

# Step 3: Write deduplicated data to Delta
deduped_events.write.format("delta")\
.mode("overwrite")\
.save("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")

In [0]:
# Step 4: Create SQL table for easy querying
deduped_events.write.format("delta")\
.mode("overwrite")\
.saveAsTable("events_table")

In [0]:
# Step 5: Verify
print("Total rows:", deduped_events.count())
display(deduped_events.limit(5))

Total rows: 67351679


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:06:37.000Z,view,1005099,2053013555631882655,electronics.smartphone,samsung,139.64,532572658,b1df0fce-6e03-47cc-b777-782b77b1b100
2019-11-01T00:18:49.000Z,view,53300006,2141355068383822734,,perilla,25.71,515474976,222c370b-0fac-4287-982b-e340f5eaf3a1
2019-11-01T00:25:18.000Z,view,3601406,2053013563810775923,appliances.kitchen.washer,beko,215.75,549757937,3c486d91-a01c-480b-a3c8-9dd75ab2e138
2019-11-01T00:28:16.000Z,view,13400614,2053013557066334713,,,131.53,538802610,7c2e7628-6ac8-4cb6-8eee-934b989d02a7
2019-11-01T00:39:06.000Z,view,14700105,2053013557133443581,furniture.living_room.cabinet,,385.85,527859779,c695882a-1a47-48fa-9e73-b35bc5d9ee5a


In [0]:
from delta.tables import DeltaTable
from pyspark.sql import functions as f

# Step 1: Attach Delta table
events_delta = DeltaTable.forPath(spark,
    "/Volumes/workspace/ecommerce/ecommerce_data/events_delta"
)
   

In [0]:

# Step 2: Load deduplicated existing events
deduped_events = events_delta.toDF()
     

In [0]:
# Step 3: Simulate incremental data
incremental_df = deduped_events.limit(100)\
.withColumn("price", f.col("price") + 10)

display(incremental_df.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:06:37.000Z,view,1005099,2053013555631882655,electronics.smartphone,samsung,149.64,532572658,b1df0fce-6e03-47cc-b777-782b77b1b100
2019-11-01T00:18:49.000Z,view,53300006,2141355068383822734,,perilla,35.71,515474976,222c370b-0fac-4287-982b-e340f5eaf3a1
2019-11-01T00:25:18.000Z,view,3601406,2053013563810775923,appliances.kitchen.washer,beko,225.75,549757937,3c486d91-a01c-480b-a3c8-9dd75ab2e138
2019-11-01T00:28:16.000Z,view,13400614,2053013557066334713,,,141.53,538802610,7c2e7628-6ac8-4cb6-8eee-934b989d02a7
2019-11-01T00:39:06.000Z,view,14700105,2053013557133443581,furniture.living_room.cabinet,,395.85,527859779,c695882a-1a47-48fa-9e73-b35bc5d9ee5a


In [0]:

# Step 4: Handle NULLs before MERGE
incremental_df_clean = incremental_df \
.dropna(subset=["user_session", "event_time"])\
.fillna({
    "price": 0.0,
    "brand": "unknown",
    "category_code": "unknown",
    "category_id": -1
    })

display(incremental_df_clean.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:06:37.000Z,view,1005099,2053013555631882655,electronics.smartphone,samsung,149.64,532572658,b1df0fce-6e03-47cc-b777-782b77b1b100
2019-11-01T00:18:49.000Z,view,53300006,2141355068383822734,unknown,perilla,35.71,515474976,222c370b-0fac-4287-982b-e340f5eaf3a1
2019-11-01T00:25:18.000Z,view,3601406,2053013563810775923,appliances.kitchen.washer,beko,225.75,549757937,3c486d91-a01c-480b-a3c8-9dd75ab2e138
2019-11-01T00:28:16.000Z,view,13400614,2053013557066334713,unknown,unknown,141.53,538802610,7c2e7628-6ac8-4cb6-8eee-934b989d02a7
2019-11-01T00:39:06.000Z,view,14700105,2053013557133443581,furniture.living_room.cabinet,unknown,395.85,527859779,c695882a-1a47-48fa-9e73-b35bc5d9ee5a


In [0]:
# Step 5: MERGE incremental data
merge_summary = events_delta.alias("t").merge(
    incremental_df_clean.alias("s"),
    "t.user_session = s.user_session AND t.event_time = s.event_time"
).whenMatchedUpdateAll()\
 .whenNotMatchedInsertAll()\
 .execute()
 
 # Shows updated/inserted rows
display(merge_summary)  

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
100,100,0,0


In [0]:

# Step 6: Verify
events_df = events_delta.toDF()
print("Total rows after merge:", events_df.count())
display(events_df.orderBy("event_time", ascending=False).limit(5))

Total rows after merge: 67351679


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-30T23:59:59.000Z,view,1004233,2053013555631882655,electronics.smartphone,apple,1312.52,579969851,90aca71c-ed8a-4670-866a-761ebacb732d
2019-11-30T23:59:59.000Z,view,2701706,2053013563911439225,appliances.kitchen.refrigerators,samsung,566.27,531607492,368ddc8b-5db9-40fb-b7ff-b6582a1192c0
2019-11-30T23:59:59.000Z,view,1004833,2053013555631882655,electronics.smartphone,samsung,167.03,557794415,6fecf566-ebb0-4e70-a243-cdc13ce044cb
2019-11-30T23:59:58.000Z,view,15700137,2053013559733912211,,,277.74,532714000,02b4131c-0112-4231-aafa-ceaa08e77c1b
2019-11-30T23:59:58.000Z,view,28719425,2053013565639492569,apparel.shoes,baden,62.81,545223467,734c5eef-0742-4f8b-9d22-48f75b0bc359


In [0]:

from delta.tables import DeltaTable

# Attach Delta Table
events_delta = DeltaTable.forPath(spark, "/Volumes/workspace/ecommerce/ecommerce_data/events_delta")

In [0]:

# Check history
spark.sql("DESCRIBE HISTORY events_table").show(truncate=False)

+-------+-------------------+--------------+-------------------------+---------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+----+----------------+------------------------+-----------+-----------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+--------------------------------------------------+
|version|timestamp          |userId        |userName                 |operation                        |operationParameters                                                                                                                                    |job |notebook        |clusterId               |readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                 

In [0]:

# Query by timestamp
# Corrected: timestamp after first write/merge
yesterday = spark.read.format("delta")\
    .option("timestampAsOf", "2026-01-13 18:07:35")\
    .load("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")

display(yesterday.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:00:00.000Z,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
2019-11-01T00:00:00.000Z,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2019-11-01T00:00:01.000Z,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
2019-11-01T00:00:01.000Z,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
2019-11-01T00:00:01.000Z,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [0]:

latest_version = spark.sql("DESCRIBE HISTORY events_table").select("version").first()[0]
print(f"Latest Delta version: {latest_version}")

Latest Delta version: 1


In [0]:
# Oldest version
df_v0 = spark.read.format("delta")\
    .option("versionAsOf", 0)\
    .load("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")

# Latest version
df_latest = spark.read.format("delta")\
    .option("versionAsOf", latest_version)\
    .load("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")


In [0]:
# Attach table (if using path or table name)
events_delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/events_delta"

In [0]:

# Optimize table for faster queries
spark.sql("OPTIMIZE events_table ZORDER BY (event_type, user_id)")

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:

spark.sql("VACUUM events_table RETAIN 168 HOURS")

DataFrame[path: string]

In [0]:
# Read table normally
df_latest = spark.read.format("delta").table("events_table")
display(df_latest.limit(5))

# Count rows to make sure nothing is lost
print("Total rows after VACUUM:", df_latest.count())

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T06:41:44.000Z,view,1005239,2053013555631882655,electronics.smartphone,xiaomi,280.57,516393997,d9b6c3b0-38e8-487d-9609-7d8700ac9427
2019-11-01T07:59:14.000Z,view,1306558,2053013558920217191,computers.notebook,acer,1801.82,516598705,e04131af-9bc8-44e0-b0a2-6172638c33bd
2019-11-01T08:32:41.000Z,view,1306569,2053013558920217191,computers.notebook,acer,1029.6,516160779,58c20669-03f0-4c71-b86b-e40c6c06762f
2019-11-01T08:39:15.000Z,view,34800270,2062461754293617058,,cantra,101.67,516394055,3b371427-7c3c-4358-90e4-ca03b720c656
2019-11-01T09:12:18.000Z,view,16700024,2053013559901684381,furniture.kitchen.chair,,12.84,516402113,f486aa34-ec8f-43ac-a11c-8df1ee3340b7


Total rows after VACUUM: 67351679
