# Delta Version Control & Storage Optimization

## Multiple sets of data taken from the dataset

In [0]:
def _top_rows_desc(table_name, sort_column, n_rows=100):
    """Return the first `n_rows` rows of `table_name` sorted descending by `sort_column`."""
    return spark.read.table(table_name).orderBy(sort_column, ascending=False).limit(n_rows)


# Four 100-row samples: each monthly table sorted descending by event_time
# (df_01, df_02) and by brand (df_03, df_04).
df_01 = _top_rows_desc("workspace.ecommerce.df_november", "event_time")
df_02 = _top_rows_desc("workspace.ecommerce.df_october", "event_time")
df_03 = _top_rows_desc("workspace.ecommerce.df_november", "brand")
df_04 = _top_rows_desc("workspace.ecommerce.df_october", "brand")

In [0]:
from datetime import datetime, date
from pyspark.sql import Row

# Synthetic event that does not exist in the source tables; it is appended to
# dup_events so that a later MERGE has at least one row to insert.
add_row = Row(
    event_time=datetime.now(),       # Python datetime (current wall-clock time)
    event_type="view",
    product_id=123,
    category_id=456,
    category_code="electronics",
    brand="BrandX",
    price=99,                        # inferred as long here; cast to the table's type below
    user_id=789,
    user_session="sess1"
)

# Ten rows taken from the November table (no ordering, so which ten is arbitrary).
dup_events = spark.read.table("workspace.ecommerce.df_november").limit(10)

# createDataFrame() without a schema infers column types from the Python values,
# and union() matches columns purely by position. Project the new row onto the
# table's own schema (by column name, with an explicit cast) and combine with
# unionByName() so that neither column order nor inferred types can silently
# change the shape of dup_events.
new_event = spark.createDataFrame([add_row])
new_event = new_event.select(
    [
        new_event[field.name].cast(field.dataType).alias(field.name)
        for field in dup_events.schema.fields
    ]
)

# Add a row to the dup_events table for insert action
dup_events = dup_events.unionByName(new_event)

## Creating new dataset & upserts

In [0]:
# Seed the Delta table at the volume path with the df_01 sample,
# replacing whatever is already stored at that location.
(
    df_01.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/ecommerce_data/Newdataset/")
)

PySpark -

In [0]:
from delta.tables import DeltaTable

# Upsert df_02 into the Delta table: rows that agree on all four key columns
# are updated with the source values, every other source row is inserted.
deltatable_tgt = DeltaTable.forPath(spark,"/Volumes/workspace/ecommerce/ecommerce_data/Newdataset")

# Adjacent literals are concatenated by Python into a single condition string.
merge_condition = (
    "tgt.event_time = src.event_time AND "
    "    tgt.event_type = src.event_type AND "
    "    tgt.user_id = src.user_id AND "
    "    tgt.user_session = src.user_session"
)

(
    deltatable_tgt.alias("tgt")
    .merge(df_02.alias("src"), merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)


In [0]:
from delta.tables import DeltaTable

# Load the table's commit history and show only the operationMetrics column.
deltatable = DeltaTable.forPath(spark, "/Volumes/workspace/ecommerce/ecommerce_data/Newdataset")
history_df = deltatable.history()
operation_metrics = history_df.select("operationMetrics")
display(operation_metrics)


SQL -

In [0]:
%sql
-- Expose the path-based Delta table under the name new_dataset so that
-- later SQL cells can refer to it without repeating the path.
CREATE OR REPLACE TEMP VIEW new_dataset
AS
  SELECT *
  FROM delta.`/Volumes/workspace/ecommerce/ecommerce_data/Newdataset`

In [0]:
# Register the brand-sorted samples as temp views so SQL cells can query them by name.
for view_name, sample_df in (("df_03", df_03), ("df_04", df_04)):
    sample_df.createOrReplaceTempView(view_name)

In [0]:
%sql
-- Upsert df_03 into new_dataset: source rows that agree with a target row on
-- all four key columns overwrite it; all other source rows are inserted.
MERGE INTO new_dataset AS T
USING df_03 AS S
  ON  T.user_id      = S.user_id
  AND T.user_session = S.user_session
  AND T.event_type   = S.event_type
  AND T.event_time   = S.event_time
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *

List of operations from history table with their metrics

In [0]:
from pyspark.sql.functions import col, map_keys, explode

# Flatten the Delta history: keep every history column and add one numeric
# column per key found in the operationMetrics map.
deltatable = DeltaTable.forPath(spark, "/Volumes/workspace/ecommerce/ecommerce_data/Newdataset")
history_df = deltatable.history()  # columns include: version, timestamp, operation, operationMetrics, etc.

# 1) Collect all unique metric keys present in operationMetrics.
#    collect() avoids a pandas round trip for what is a short list of strings,
#    and sorting makes the order of the generated columns stable between runs.
metric_keys = sorted(
    row["k"]
    for row in (
        history_df
          .select(explode(map_keys(col("operationMetrics"))).alias("k"))
          .distinct()
          .collect()
    )
)

# 2) Build a single select with one column per metric key, already cast to long.
#    Doing the cast here (instead of one withColumn call per key) adds all
#    columns in one projection rather than growing the plan key by key.
#    Metric values that are not integers become null after the cast.
select_cols = list(history_df.columns) + [
    col("operationMetrics").getItem(k).cast("long").alias(k) for k in metric_keys
]

metrics_df = history_df.select(*select_cols)

display(metrics_df)


## Time travel

In [0]:
# Time travel: read the table as it was at version 1 and show it.
df_version_1 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 1)
    .load("/Volumes/workspace/ecommerce/ecommerce_data/Newdataset")
)
display(df_version_1)

Restore using version / point in time

In [0]:
# Roll the table back to version 1 with Delta's native restore instead of
# re-writing the old snapshot through a DataFrame overwrite. The restore is
# recorded in the table history as its own operation and brings back the
# table state of that version rather than producing a fresh copy of the rows.
restore_metrics = DeltaTable.forPath(
    spark, "/Volumes/workspace/ecommerce/ecommerce_data/Newdataset"
).restoreToVersion(1)

# restoreToVersion returns a DataFrame describing what the restore changed.
display(restore_metrics)

In [0]:
from delta import DeltaTable
# Show the full commit history of the Newdataset table.
deltatable = DeltaTable.forPath(spark, "/Volumes/workspace/ecommerce/ecommerce_data/Newdataset")
newdataset_history = deltatable.history()
display(newdataset_history)

Even after a restore, we can still read the older versions

In [0]:
# Row count of the table as of version 4.
df_version4 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 4)
    .load("/Volumes/workspace/ecommerce/ecommerce_data/Newdataset")
)
df_version4.count()

In [0]:
# Row count of the current (latest) version of the table.
df_latest = (
    spark.read
    .format("delta")
    .load("/Volumes/workspace/ecommerce/ecommerce_data/Newdataset")
)
df_latest.count()

In [0]:
%sql
-- Row count of the table as it existed at the given timestamp.
SELECT count(*)
FROM delta.`/Volumes/workspace/ecommerce/ecommerce_data/Newdataset`
  TIMESTAMP AS OF '2026-01-13T12:58:02.000+00:00'

In [0]:
%sql
-- Row count of the table as it existed at the given (earlier) timestamp.
SELECT count(*)
FROM delta.`/Volumes/workspace/ecommerce/ecommerce_data/Newdataset`
  TIMESTAMP AS OF '2026-01-13T12:54:02.000+00:00'

## Optimize & Zorder

In [0]:

%sql

-- Build the Netdata Delta table from the November and October tables.
-- UNION (as opposed to UNION ALL) removes duplicate rows from the combined result.
CREATE OR REPLACE TABLE delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata` AS
SELECT *
FROM workspace.ecommerce.df_november
UNION
SELECT
  event_time,
  event_type,
  product_id,
  category_id,
  category_code,
  brand,
  price,
  user_id,
  user_session
FROM workspace.ecommerce.df_october

A total of 128 files were created

In [0]:

from pyspark.dbutils import DBUtils
# Count the .parquet files physically present in the Netdata table directory.
dbutils = DBUtils(spark)

path = "/Volumes/workspace/ecommerce/ecommerce_data/Netdata"

# NOTE(review): "_delta_log/../" presumably resolves back to the table root — confirm.
files = dbutils.fs.ls(f"{path}/_delta_log/../")
data_files = list(filter(lambda entry: entry.name.endswith('.parquet'), files))
print("Number of data files:", len(data_files))


In [0]:
# Make the merge source (dup_events) and the df_04 sample queryable from SQL cells.
for view_name, frame in (("dup_events", dup_events), ("df_04", df_04)):
    frame.createOrReplaceTempView(view_name)

In [0]:
%sql

-- Name the path-based Netdata table so the MERGE below can target it.
CREATE OR REPLACE TEMP VIEW Netdata
AS
  SELECT *
  FROM delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata`;

-- Upsert dup_events: rows matching on all four key columns are updated,
-- the remaining source rows are inserted.
MERGE INTO Netdata AS T
USING dup_events AS S
  ON  T.user_id      = S.user_id
  AND T.user_session = S.user_session
  AND T.event_type   = S.event_type
  AND T.event_time   = S.event_time
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *

files loaded again

In [0]:

from pyspark.dbutils import DBUtils
# Re-count the .parquet files in the Netdata directory after the merge.
dbutils = DBUtils(spark)

path = "/Volumes/workspace/ecommerce/ecommerce_data/Netdata"

files = dbutils.fs.ls(f"{path}/_delta_log/../")
data_files = [entry for entry in files if entry.name.endswith('.parquet')]
parquet_file_count = len(data_files)
print("Number of data files:", parquet_file_count)


In [0]:
# Compact the Netdata table's data files with Delta's OPTIMIZE command.
optimize_stmt = """
OPTIMIZE delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata`
"""
spark.sql(optimize_stmt)

156 files in total, but only 26 files are used by the optimized version

In [0]:
from pyspark.dbutils import DBUtils
# Count the .parquet files in the Netdata directory once more, after OPTIMIZE.
dbutils = DBUtils(spark)

path = "/Volumes/workspace/ecommerce/ecommerce_data/Netdata"

table_dir = f"{path}/_delta_log/../"
files = dbutils.fs.ls(table_dir)
data_files = []
for entry in files:
    if entry.name.endswith('.parquet'):
        data_files.append(entry)
print("Number of data files:", len(data_files))

In [0]:
from delta import DeltaTable
# Show the commit history of the Netdata table.
deltatable = DeltaTable.forPath(spark, "/Volumes/workspace/ecommerce/ecommerce_data/Netdata")
netdata_history = deltatable.history()
display(netdata_history)

Optimize operation metrics in table format

In [0]:
# Render a static HTML table (via displayHTML) that explains selected OPTIMIZE
# operation metrics: metric name, value, meaning, and a plain-language takeaway.
# NOTE(review): the values are hard-coded in the markup — presumably copied from
# one run's history output; verify them against deltatable.history() before reuse.

displayHTML("""
<table>
  <tr>
    <th>Metric</th>
    <th>Value</th>
    <th>Meaning</th>
    <th>for audience understanding</th>
  </tr>
  <tr>
    <td>numRemovedFiles</td>
    <td>130</td>
    <td>Removed small files</td>
    <td>Table was fragmented</td>
  </tr>
  <tr>
    <td>numAddedFiles</td>
    <td>26</td>
    <td>Added optimized files</td>
    <td>Table became compact & efficient</td>
  </tr>
  <tr>
    <td>numRemovedBytes</td>
    <td>6.17 GB</td>
    <td>Old data rewritten</td>
    <td>Large rewrite occurred</td>
  </tr>
  <tr>
    <td>numAddedBytes</td>
    <td>6.14 GB</td>
    <td>New optimized layout</td>
    <td>Highly optimized data layout</td>
  </tr>
  <tr>
    <td>p50FileSize</td>
    <td>240 MB</td>
    <td>Median file size</td>
    <td>Perfect large files for Spark</td>
  </tr>
  <tr>
    <td>numDeletionVectorsRemoved</td>
    <td>9</td>
    <td>Cleanup of DV</td>
    <td>Faster reads after DV removal</td>
  </tr>
</table>
""")

In [0]:
%sql
-- Show table-level details for the Netdata Delta table.
DESCRIBE DETAIL delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata`

Z-ordering the same table down to 24 files

In [0]:
%sql
-- Re-run OPTIMIZE, this time clustering the data by brand and user_id —
-- the two columns used in the WHERE clause of the comparison queries.
OPTIMIZE delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata` ZORDER BY (brand, user_id)


In [0]:
from delta import DeltaTable
# Show the Netdata commit history again, now including the ZORDER run.
deltatable = DeltaTable.forPath(spark, "/Volumes/workspace/ecommerce/ecommerce_data/Netdata")
netdata_history = deltatable.history()
display(netdata_history)

Total files: 203

In [0]:
from pyspark.dbutils import DBUtils
# Final count of the .parquet files in the Netdata directory, after Z-ordering.
dbutils = DBUtils(spark)

path = "/Volumes/workspace/ecommerce/ecommerce_data/Netdata"

files = dbutils.fs.ls(f"{path}/_delta_log/../")
# Keep only entries whose name ends in .parquet (i.e. skip the log directory etc.).
data_files = [
    listed
    for listed in files
    if listed.name.endswith('.parquet')
]
print("Number of data files:", len(data_files))

Reading all 129 files for the WHERE clause at version 1

In [0]:
%sql

-- Baseline query against version 1 of the table.
SELECT *
FROM delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata` VERSION AS OF 1
WHERE brand = 'apple'
  AND user_id > 514766963

Reading 26 files at the optimized version for the same WHERE clause

In [0]:
%sql

-- Same filter, run against version 2 of the table.
SELECT *
FROM delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata` VERSION AS OF 2
WHERE brand = 'apple'
  AND user_id > 514766963

Reading only 4 files at the Z-ordered version for the same WHERE clause

In [0]:
%sql

-- Same filter, run against version 5 of the table.
SELECT *
FROM delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata` VERSION AS OF 5
WHERE brand = 'apple'
  AND user_id > 514766963

## Vacuum (default retention is 7 days)

Disabling the retention check (so we can run `RETAIN 0 HOURS` to delete the files of the optimize version & of version 1 from before Z-ordering) * only works on a cluster, not on serverless

In [0]:
%sql
-- Turn off Delta's retention-duration check for this session so that a
-- VACUUM with a retention shorter than the default is accepted.
SET spark.databricks.delta.retentionDurationCheck.enabled = false;


In [0]:
%sql
-- Remove data files that the current table version no longer references,
-- keeping zero hours of history (older versions can no longer be read afterwards).
VACUUM delta.`/Volumes/workspace/ecommerce/ecommerce_data/Netdata` RETAIN 0 HOURS