#### OPTIMIZE Intro

In [0]:
# Load the raw invoices Parquet file from the lab storage container and
# preview a handful of rows to confirm the schema.
df = (
    spark.read
    .parquet("abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_201_99457.parquet")
)
display(df.limit(5))

customer_id,invoice_no,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall,_rescued_data
201,I885979,Female,26,Clothing,3,900.24,Debit Card,2021-07-04,Metrocity,
202,I810217,Female,51,Clothing,3,900.24,Cash,2022-01-14,Metrocity,
203,I499170,Female,38,Toys,1,35.84,Credit Card,2022-02-20,Kanyon,
204,I792963,Female,59,Clothing,5,1500.4,Debit Card,2022-06-18,Emaar Square Mall,
205,I311151,Female,39,Souvenir,3,35.19,Credit Card,2022-04-27,Mall of Istanbul,


In [0]:
# Total row count, then the number of distinct categories (this is the
# number of partitions the tables below will be split into).
print(
    df.count(),
    df.select("category").distinct().count(),
    sep="\n",
)

99257
8


In [0]:
# Deliberately fragment the data: 5 in-memory partitions combined with
# partitionBy("category") can leave up to 5 small files in every category folder.
(
    df.repartition(5)
    .write
    .mode("overwrite")
    .partitionBy("category")
    .saveAsTable("deltacatalog.deltadb.optimize_ex1")
)

In [0]:
%%time
# Baseline timing: read a single category from the fragmented table and
# pull the rows to the driver so the whole scan is actually executed.
df_ex1 = spark.read.table("deltacatalog.deltadb.optimize_ex1")
df_out = df_ex1.filter(df_ex1["category"] == "Clothing").collect()

CPU times: user 514 ms, sys: 22.1 ms, total: 536 ms
Wall time: 2.3 s


In [0]:
from delta.tables import DeltaTable

# Handle to the Delta table; reused by later cells in this section.
table = DeltaTable.forName(spark, "deltacatalog.deltadb.optimize_ex1")

# Run OPTIMIZE (compaction of small files) and show the returned metrics DataFrame.
compaction_metrics = table.optimize().executeCompaction()
compaction_metrics

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
# VACUUM with a retention shorter than the default threshold is rejected unless
# Delta's retention-duration safety check is disabled first. Set it here so this
# cell works on a fresh session instead of relying on a setting made elsewhere.
# WARNING: retention 0 removes every file not referenced by the latest version;
# use it for demos only, never on tables with concurrent readers/writers.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
table.vacuum(0)

DataFrame[]

In [0]:
%sql 
-- Turn off Delta's safety check that rejects VACUUM retention periods below the default.
-- This must be in effect before any zero-retention VACUUM is issued in this session.
SET spark.databricks.delta.retentionDurationCheck.enabled = false;

key,value
spark.databricks.delta.retentionDurationCheck.enabled,False


In [0]:
%%time
# Same query as the baseline, now against the compacted table, so the two
# wall-clock timings can be compared directly.
df_ex1 = spark.read.table("deltacatalog.deltadb.optimize_ex1")
df_out = df_ex1.filter(df_ex1["category"] == "Clothing").collect()

CPU times: user 157 ms, sys: 16.7 ms, total: 173 ms
Wall time: 1.4 s


In [0]:
import pyspark.sql.functions as F

# Synthesize a new category: take the Clothing rows and relabel them as "Fruits".
df_fruits = (
    df
    .where(df["category"] == "Clothing")
    .withColumn("category", F.lit("Fruits"))
)

In [0]:
# Sanity check: the relabelled frame must contain exactly one category value.
(
    df_fruits
    .select("category")
    .distinct()
    .count()
)

1

In [0]:
# Write the "Fruits" rows as up to 5 small files into optimize_ex1.
# NOTE(review): mode("overwrite") replaces the existing table contents unless dynamic
# partition overwrite is configured; if the goal is to add a "Fruits" partition next
# to the already-compacted ones, "append" may have been intended. Confirm.
(
    df_fruits.repartition(5)
    .write
    .mode("overwrite")
    .partitionBy("category")
    .saveAsTable("deltacatalog.deltadb.optimize_ex1")
)

In [0]:
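# Python API equivalent of the SQL `OPTIMIZE ... WHERE` in the next cell (kept for reference, not executed):
# table.optimize().where("category = 'Fruits'").executeCompaction()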

In [0]:
%sql
-- Compact only the files that belong to the 'Fruits' partition.
OPTIMIZE deltacatalog.deltadb.optimize_ex1 WHERE category = 'Fruits';

In [0]:
%sql
-- Remove all files no longer referenced by the latest table version.
-- Zero retention requires spark.databricks.delta.retentionDurationCheck.enabled = false
-- in this session. Demo only: this discards the files needed for time travel.
VACUUM deltacatalog.deltadb.optimize_ex1 RETAIN 0 HOURS;

path
abfss://metastore@dbdeltalabstorageacct.dfs.core.windows.net/9f512693-87be-436d-8e5b-ff08b7b4f77f/tables/27dc5d5f-ae60-4375-ad5a-658a6ced2696


#### Optimized Write

In [0]:
# Worst case for file counts: 288 in-memory partitions written without optimized
# writes, so each category folder can receive a very large number of tiny files.
(
    df.repartition(288)
    .write
    .mode("overwrite")
    .partitionBy("category")
    .saveAsTable("deltacatalog.deltadb.optimize_ex2")
)

In [0]:
%%time
# Time a single-category read against the heavily fragmented table.
df_ex2 = spark.read.table("deltacatalog.deltadb.optimize_ex2")
df_out = df_ex2.filter(df_ex2["category"] == "Clothing").collect()

CPU times: user 199 ms, sys: 23.6 ms, total: 223 ms
Wall time: 6.97 s


In [0]:
# Same 288-way write as optimize_ex2, but with the optimizeWrite writer option
# enabled so the comparison isolates the effect of optimized writes.
(
    df.repartition(288)
    .write
    .mode("overwrite")
    .partitionBy("category")
    .option("optimizeWrite", "True")
    .saveAsTable("deltacatalog.deltadb.optimize_ex3")
)

In [0]:
%%time
# Time the same single-category read against the table written with optimizeWrite.
df_ex3 = spark.read.table("deltacatalog.deltadb.optimize_ex3")
df_out = df_ex3.filter(df_ex3["category"] == "Clothing").collect()

CPU times: user 162 ms, sys: 22.9 ms, total: 185 ms
Wall time: 1.82 s


#### Auto Compaction

In [0]:
%sql 
-- Switch optimized writes off at table level so the next write is not pre-coalesced ...
ALTER TABLE deltacatalog.deltadb.optimize_ex3 SET TBLPROPERTIES ('delta.autoOptimize.optimizeWrite' = 'false');
-- ... and switch auto compaction on, so small files are compacted after the write instead.
ALTER TABLE deltacatalog.deltadb.optimize_ex3 SET TBLPROPERTIES ('delta.autoOptimize.autoCompact' = 'true');

In [0]:
# Auto compaction only kicks in once enough small files exist. The demo below writes
# just 5 files, so the threshold is lowered to 3 explicitly here; previously the value
# came from a now commented-out `spark.conf.set`, i.e. from leftover session state
# (the documented default is higher; verify for your runtime).
spark.conf.set("spark.databricks.delta.autoCompact.minNumFiles", "3")

# Echo the effective value so the threshold used by the demo is visible in the output.
spark.conf.get("spark.databricks.delta.autoCompact.minNumFiles")

'3'

In [0]:
# Synthesize another category: Clothing rows relabelled as "Detergents".
# Relies on `F` (pyspark.sql.functions) imported in an earlier cell.
df_detergents = (
    df
    .where(df["category"] == "Clothing")
    .withColumn("category", F.lit("Detergents"))
)

In [0]:
# Write 5 small files into optimize_ex3 to give auto compaction something to merge.
(
    df_detergents.repartition(5)
    .write
    .mode("overwrite")
    .partitionBy("category")
    .saveAsTable("deltacatalog.deltadb.optimize_ex3")
)

#### VACUUM 
(How it limits the ability to time travel)

In [0]:
# Both slices come from the same source file; only the customer_id range differs.
invoices_101_200_path = "abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_101_200.parquet"

# Customers 101-150 (bounds inclusive): used to create the table.
df_101_150 = spark.read.parquet(invoices_101_200_path).where(
    F.col("customer_id").between(101, 150)
)

# Customers 151-200 (bounds inclusive): appended afterwards as a second version.
df_151_200 = spark.read.parquet(invoices_101_200_path).where(
    F.col("customer_id").between(151, 200)
)

In [0]:
# Version 0 of the demo table: customers 101-150 only.
(
    df_101_150.write
    .mode("overwrite")
    .saveAsTable("deltacatalog.deltadb.vacuum_ex1")
)

In [0]:
%sql
-- Inspect the commit log right after creation; a single version 0 is expected here.
DESCRIBE HISTORY deltacatalog.deltadb.vacuum_ex1;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2025-05-03T14:21:24Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,List(663055675511912),0503-103113-z182bj2u,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 4854)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
# Add customers 151-200 as a separate commit (a new table version).
(
    df_151_200.write
    .mode("append")
    .saveAsTable("deltacatalog.deltadb.vacuum_ex1")
)

In [0]:
%sql
-- The append should now appear as a WRITE commit on top of version 0.
DESCRIBE HISTORY deltacatalog.deltadb.vacuum_ex1;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2025-05-03T14:22:06Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(663055675511912),0503-103113-z182bj2u,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 4902)",,Databricks-Runtime/15.4.x-scala2.12
0,2025-05-03T14:21:24Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,List(663055675511912),0503-103113-z182bj2u,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 4854)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
%sql
-- Remove the rows that were just appended; their data file becomes unreferenced
-- by the latest version but stays on storage until VACUUM runs.
DELETE FROM deltacatalog.deltadb.vacuum_ex1 WHERE customer_id BETWEEN 151 AND 200; 

num_affected_rows
50


In [0]:
%sql
-- The DELETE is recorded as its own version; check numRemovedFiles / numDeletedRows in operationMetrics.
DESCRIBE HISTORY deltacatalog.deltadb.vacuum_ex1;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2025-05-03T14:23:47Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,DELETE,"Map(predicate -> [""((customer_id#8510 >= 151) AND (customer_id#8510 <= 200))""])",,List(663055675511912),0503-103113-z182bj2u,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 4902, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1389, numDeletionVectorsUpdated -> 0, numDeletedRows -> 50, scanTimeMs -> 1195, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 191)",,Databricks-Runtime/15.4.x-scala2.12
1,2025-05-03T14:22:06Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(663055675511912),0503-103113-z182bj2u,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 4902)",,Databricks-Runtime/15.4.x-scala2.12
0,2025-05-03T14:21:24Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,List(663055675511912),0503-103113-z182bj2u,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 4854)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
%sql 
-- Current version: only customers 101-150 should remain after the DELETE.
SELECT
  MIN(customer_id),
  MAX(customer_id)
FROM deltacatalog.deltadb.vacuum_ex1;

min(customer_id),max(customer_id)
101,150


In [0]:
%sql 
-- Time travel to version 1 (after the append, before the DELETE): the full 101-200 range is visible.
SELECT
  MIN(customer_id),
  MAX(customer_id)
FROM deltacatalog.deltadb.vacuum_ex1 VERSION AS OF 1;

min(customer_id),max(customer_id)
101,200


In [0]:
%sql
-- Disable the retention-duration safety check so the zero-retention VACUUM below is accepted.
SET spark.databricks.delta.retentionDurationCheck.enabled = false;

key,value
spark.databricks.delta.retentionDurationCheck.enabled,False


In [0]:
%sql
-- Physically delete every file not referenced by the latest version, including the
-- data file that only versions before the DELETE still point to.
VACUUM deltacatalog.deltadb.vacuum_ex1 RETAIN 0 HOURS;

path
abfss://metastore@dbdeltalabstorageacct.dfs.core.windows.net/9f512693-87be-436d-8e5b-ff08b7b4f77f/tables/af4a7ed2-6344-453a-98fb-ee3a6c220b14


In [0]:
%sql
-- VACUUM is logged too (VACUUM START / VACUUM END); the earlier versions are still
-- listed in the history even though some of their data files are gone.
DESCRIBE HISTORY deltacatalog.deltadb.vacuum_ex1;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2025-05-03T14:27:10Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,VACUUM END,Map(status -> COMPLETED),,List(663055675511912),0503-103113-z182bj2u,3.0,SnapshotIsolation,True,"Map(numDeletedFiles -> 2, numVacuumedDirectories -> 1)",,Databricks-Runtime/15.4.x-scala2.12
3,2025-05-03T14:27:02Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,VACUUM START,"Map(retentionCheckEnabled -> false, defaultRetentionMillis -> 604800000, specifiedRetentionMillis -> 0)",,List(663055675511912),0503-103113-z182bj2u,2.0,SnapshotIsolation,True,"Map(numFilesToDelete -> 2, sizeOfDataToDelete -> 4942)",,Databricks-Runtime/15.4.x-scala2.12
2,2025-05-03T14:23:47Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,DELETE,"Map(predicate -> [""((customer_id#8510 >= 151) AND (customer_id#8510 <= 200))""])",,List(663055675511912),0503-103113-z182bj2u,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 4902, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1389, numDeletionVectorsUpdated -> 0, numDeletedRows -> 50, scanTimeMs -> 1195, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 191)",,Databricks-Runtime/15.4.x-scala2.12
1,2025-05-03T14:22:06Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(663055675511912),0503-103113-z182bj2u,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 4902)",,Databricks-Runtime/15.4.x-scala2.12
0,2025-05-03T14:21:24Z,2095506903161383,linkedafaque_gmail.com#ext#@linkedafaquegmail.onmicrosoft.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,List(663055675511912),0503-103113-z182bj2u,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 4854)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
%sql
-- Version 0 is still readable after the VACUUM: the history reports that the DELETE
-- removed one file and added none, so version 0's data file is still part of the latest version.
-- NOTE(review): VERSION AS OF 1 is expected to fail at this point because the appended
-- file was vacuumed; confirm if that is the behaviour this section should demonstrate.
SELECT * 
FROM deltacatalog.deltadb.vacuum_ex1 VERSION AS OF 0;

customer_id,invoice_no,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall,_rescued_data
101,I302068,Female,58,Shoes,2,1200.34,Debit Card,2022-07-15,Metropol AVM,
102,I193229,Male,40,Clothing,5,1500.4,Cash,2021-03-05,Cevahir AVM,
103,I313092,Female,23,Cosmetics,2,81.32,Credit Card,2022-04-23,Zorlu Center,
104,I258750,Female,40,Food & Beverage,3,15.69,Cash,2022-04-04,Istinye Park,
105,I126182,Female,57,Clothing,5,1500.4,Cash,2022-02-06,Istinye Park,
106,I155060,Female,51,Souvenir,1,11.73,Cash,2023-02-28,Forum Istanbul,
107,I296886,Female,50,Cosmetics,4,162.64,Cash,2022-12-30,Kanyon,
108,I997903,Female,57,Clothing,5,1500.4,Cash,2021-08-19,Istinye Park,
109,I962731,Female,38,Books,3,45.45,Credit Card,2021-01-07,Metrocity,
110,I157442,Female,20,Shoes,2,1200.34,Cash,2021-12-05,Metrocity,
