1. Ingest sample order data into a Spark DataFrame.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
from pyspark.sql.functions import to_date, col
import uuid, random, datetime

spark = SparkSession.builder.getOrCreate()

# Parameters
N_ROWS = 10000   
START = datetime.datetime(2025, 10, 1, 0, 0)
END   = datetime.datetime(2025, 11, 15, 23, 59)
COUNTRIES = ["US","IN","GB","DE","FR","BR","CA"]
STATUSES = ["CREATED","PAID","CANCELLED"]
CURRENCIES = {"US":"USD","IN":"INR","GB":"GBP","DE":"EUR","FR":"EUR","BR":"BRL","CA":"CAD"}

def rand_ts(s,e):
    return datetime.datetime.utcfromtimestamp(random.randint(int(s.timestamp()), int(e.timestamp())))

rows = []
for _ in range(N_ROWS):
    oid = str(uuid.uuid4())
    ts = rand_ts(START, END)
    cid = f"{random.randint(1,4000)}"
    country = random.choice(COUNTRIES)
    amount = round(random.uniform(1.0,500.0),2)
    currency = CURRENCIES[country]
    status = random.choices(STATUSES, weights=[0.1,0.85,0.05])[0]
    rows.append((oid, ts, cid, country, float(amount), currency, status))

schema = StructType([
    StructField("order_id", StringType(), False),
    StructField("order_timestamp", TimestampType(), False),
    StructField("customer_id", StringType(), False),
    StructField("country", StringType(), False),
    StructField("amount", DoubleType(), False),
    StructField("currency", StringType(), False),
    StructField("status", StringType(), False)
])

df_orders = spark.createDataFrame(rows, schema)
display(df_orders.limit(5))


  return datetime.datetime.utcfromtimestamp(random.randint(int(s.timestamp()), int(e.timestamp())))


order_id,order_timestamp,customer_id,country,amount,currency,status
4d9beb1d-446a-46a1-9f4b-b62646ef8140,2025-11-02T11:20:48.000Z,2627,IN,256.7,INR,CREATED
4db19074-53f3-406a-950f-76da12c1676d,2025-10-19T09:08:27.000Z,3045,BR,93.68,BRL,PAID
ab945c4e-31a1-4c2c-bfce-adb4e8ac2585,2025-10-21T13:48:22.000Z,1132,CA,442.17,CAD,CREATED
f45e8d2f-072a-46c6-b0e6-e76b7b9b4efe,2025-10-06T20:13:45.000Z,2033,BR,412.47,BRL,PAID
15d19618-4cbd-4fb0-95ba-effd2f847f2e,2025-10-10T19:59:15.000Z,2341,DE,188.59,EUR,PAID


2. Add a derived column order_date (date only from order_timestamp).


In [0]:
from pyspark.sql.functions import to_date

df_orders = df_orders.withColumn("order_date", to_date(col("order_timestamp")))
display(df_orders.select("order_id","order_timestamp","order_date").limit(5))


order_id,order_timestamp,order_date
4d9beb1d-446a-46a1-9f4b-b62646ef8140,2025-11-02T11:20:48.000Z,2025-11-02
4db19074-53f3-406a-950f-76da12c1676d,2025-10-19T09:08:27.000Z,2025-10-19
ab945c4e-31a1-4c2c-bfce-adb4e8ac2585,2025-10-21T13:48:22.000Z,2025-10-21
f45e8d2f-072a-46c6-b0e6-e76b7b9b4efe,2025-10-06T20:13:45.000Z,2025-10-06
15d19618-4cbd-4fb0-95ba-effd2f847f2e,2025-10-10T19:59:15.000Z,2025-10-10


3. Write the DataFrame as a Delta table partitioned by country and order_date.


In [0]:
# setting up the volume path
volume_path = "/Volumes/main/default/orders_vol/orders_delta"

(df_orders
 .write
 .format("delta")
 .mode("overwrite")
 .partitionBy("country", "order_date")
 .save(volume_path))


4. Verify the partition structure in the storage path.



In [0]:
#displays the structure in orders_delta
display(dbutils.fs.ls("/Volumes/main/default/orders_vol/orders_delta"))


path,name,size,modificationTime
dbfs:/Volumes/main/default/orders_vol/orders_delta/_delta_log/,_delta_log/,0,1764489519644
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=BR/,country=BR/,0,1764489519644
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=CA/,country=CA/,0,1764489519644
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=DE/,country=DE/,0,1764489519644
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=FR/,country=FR/,0,1764489519644
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=GB/,country=GB/,0,1764489519644
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/,country=IN/,0,1764489519644
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=US/,country=US/,0,1764489519644


In [0]:
#displays the structure in orders_delta with tha country partitioning
display(dbutils.fs.ls("/Volumes/main/default/orders_vol/orders_delta/country=IN"))


path,name,size,modificationTime
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-01/,order_date=2025-10-01/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-02/,order_date=2025-10-02/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-03/,order_date=2025-10-03/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-04/,order_date=2025-10-04/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-05/,order_date=2025-10-05/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-06/,order_date=2025-10-06/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-07/,order_date=2025-10-07/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-08/,order_date=2025-10-08/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-09/,order_date=2025-10-09/,0,1764489520854
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-10/,order_date=2025-10-10/,0,1764489520854


In [0]:
#displays partitioning structure with a specified condition, a country along with the order date
display(dbutils.fs.ls("/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-10"))


path,name,size,modificationTime
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-10/part-00291-2c127300-fb3e-4302-9449-cbb5bd4d01fa.c000.snappy.parquet,part-00291-2c127300-fb3e-4302-9449-cbb5bd4d01fa.c000.snappy.parquet,3521,1764487418000
dbfs:/Volumes/main/default/orders_vol/orders_delta/country=IN/order_date=2025-10-10/part-00291-745c28f0-d9a1-4208-bb3b-ff33ac6ecbdb.c000.snappy.parquet,part-00291-745c28f0-d9a1-4208-bb3b-ff33ac6ecbdb.c000.snappy.parquet,3316,1764489517000


5. Run queries that demonstrate partition pruning (e.g., filter on a single country and/or date).


In [0]:
df = spark.read.format("delta").load("/Volumes/main/default/orders_vol/orders_delta")
#the next commands will return the first record present in the table
sample = df.select("country", "order_date").limit(1).collect()[0]
sample_country = sample["country"]
sample_date = sample["order_date"]

print("Country:", sample_country)
print("Order Date:", sample_date)


Country: FR
Order Date: 2025-10-13


In [0]:
#applying a filter condition and display all records who;s count
df_pruned = df.filter(
    (df.country == sample_country) &
    (df.order_date == sample_date)
)
display(df_pruned)
df_pruned.count()

order_id,order_timestamp,customer_id,country,amount,currency,status,order_date
971d0eeb-f600-4bf4-a4e8-2b8819ac5f6f,2025-10-13T14:06:00.000Z,1905,FR,77.0,EUR,PAID,2025-10-13
7ecdd316-3cec-4b87-b3b9-c4d937c3d3b9,2025-10-13T01:03:55.000Z,1196,FR,456.68,EUR,PAID,2025-10-13
ccddf0f1-334b-4a29-b868-1a51003812e5,2025-10-13T00:15:07.000Z,3129,FR,213.44,EUR,PAID,2025-10-13
2dc4e69a-a8a4-49ca-8042-6cbcbe7bec49,2025-10-13T11:33:00.000Z,2365,FR,66.6,EUR,PAID,2025-10-13
8c6a420e-a349-46a1-932e-4bdd501cd74c,2025-10-13T06:45:30.000Z,3189,FR,433.9,EUR,CREATED,2025-10-13
3b48b70d-e952-48c3-8233-c722ab191d8b,2025-10-13T23:33:39.000Z,3814,FR,221.9,EUR,PAID,2025-10-13
18c86bb7-ece2-4727-8cf1-d40163b9aca0,2025-10-13T11:08:37.000Z,2487,FR,3.98,EUR,PAID,2025-10-13
aa4c38b1-4774-4763-8851-c59e5581df34,2025-10-13T11:27:34.000Z,3130,FR,265.29,EUR,PAID,2025-10-13
7c461be4-3727-4c57-838a-f28ad8e2cb9f,2025-10-13T00:00:42.000Z,2751,FR,196.19,EUR,PAID,2025-10-13
42a086cc-45c9-479f-b4d4-bd0ac99abc4d,2025-10-13T11:14:26.000Z,3219,FR,255.41,EUR,PAID,2025-10-13


39

In [0]:
#without pruned data
df_nonpruned = df.filter(df.customer_id == "cust_50")
#displays the plan
df_nonpruned.explain(True)


== Parsed Logical Plan ==
'Filter '`==`('customer_id, cust_50)
+- Relation [order_id#11358,order_timestamp#11359,customer_id#11360,country#11361,amount#11362,currency#11363,status#11364,order_date#11365] parquet

== Analyzed Logical Plan ==
order_id: string, order_timestamp: timestamp, customer_id: string, country: string, amount: double, currency: string, status: string, order_date: date
Filter (customer_id#11360 = cust_50)
+- Relation [order_id#11358,order_timestamp#11359,customer_id#11360,country#11361,amount#11362,currency#11363,status#11364,order_date#11365] parquet

== Optimized Logical Plan ==
LocalRelation <empty>, [order_id#11358, order_timestamp#11359, customer_id#11360, country#11361, amount#11362, currency#11363, status#11364, order_date#11365]

== Physical Plan ==
LocalTableScan <empty>, [order_id#11358, order_timestamp#11359, customer_id#11360, country#11361, amount#11362, currency#11363, status#11364, order_date#11365]

== Photon Explanation ==
Photon does not fully supp

6. Demonstrate Delta Lake Time Travel:
Write data, update some rows, then query older versions.


In [0]:
df.writeTo("main.default.orders_data").createOrReplace()


In [0]:
from delta.tables import DeltaTable
deltaTable = DeltaTable.forName(spark, "main.default.orders_data")
display(deltaTable.history())



version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2025-11-30T08:06:46.000Z,70588280163463,aspranavi1110@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(157788261568517),1130-075739-gilj182r-v2n,,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 10000, numOutputBytes -> 521326)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


In [0]:
#we make some new changes into the table
from pyspark.sql.functions import expr

sample_df = (
    spark.table("main.default.orders_data")
         .orderBy(expr("rand()"))
         .limit(5)
         .select("order_id")
)

sample_df.show()


+--------------------+
|            order_id|
+--------------------+
|cdb00b2e-bc90-430...|
|298df47e-6811-4d3...|
|8fa7642b-ab48-405...|
|be873aec-70d7-421...|
|1cf6c088-e310-4fc...|
+--------------------+



In [0]:
sample_ids = [row.order_id for row in sample_df.collect()]
sample_ids
#updateing the rows
id_list_sql = ", ".join([f"'{x}'" for x in sample_ids])




In [0]:
from pyspark.sql.functions import lit

deltaTable.update(
    condition=f"order_id IN ({id_list_sql})",
    set={"status": lit("PAID")}
)


DataFrame[num_affected_rows: bigint]

In [0]:
deltaHistory=deltaTable.history()
display(deltaHistory)


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2025-11-30T08:12:11.000Z,70588280163463,aspranavi1110@gmail.com,UPDATE,"Map(predicate -> [""order_id#12541 IN (c984cf53-b0b8-4093-801b-0a9ef4b54e23,1440a8eb-6c21-4f2a-bcdc-2da4958bf7cb,eadf958f-89ef-4a1d-b23f-d377c2d7ffc6,902ae20a-2850-49ed-b389-e4a12e703741,2b5ae8e1-1bd9-4780-bde4-72c4e7a2dd23)""])",,List(157788261568517),1130-075739-gilj182r-v2n,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 3025, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1366, numAddedFiles -> 1, numUpdatedRows -> 5, numAddedBytes -> 2612, rewriteTimeMs -> 1628)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
0,2025-11-30T08:06:46.000Z,70588280163463,aspranavi1110@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(157788261568517),1130-075739-gilj182r-v2n,,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 10000, numOutputBytes -> 521326)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


In [0]:
display(deltaHistory.where("version=='1'"))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2025-11-30T08:12:11.000Z,70588280163463,aspranavi1110@gmail.com,UPDATE,"Map(predicate -> [""order_id#12541 IN (c984cf53-b0b8-4093-801b-0a9ef4b54e23,1440a8eb-6c21-4f2a-bcdc-2da4958bf7cb,eadf958f-89ef-4a1d-b23f-d377c2d7ffc6,902ae20a-2850-49ed-b389-e4a12e703741,2b5ae8e1-1bd9-4780-bde4-72c4e7a2dd23)""])",,List(157788261568517),1130-075739-gilj182r-v2n,0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 3025, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1366, numAddedFiles -> 1, numUpdatedRows -> 5, numAddedBytes -> 2612, rewriteTimeMs -> 1628)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


7. Demonstrate Schema Evolution:
Add payment_method & coupon_code to new data.

Write to the same Delta table, allowing schema evolution.


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
from pyspark.sql.functions import to_date, col
import uuid, random, datetime

# New batch size
N_NEW = 500

payment_methods = ["CARD", "UPI", "COD", "WALLET"]

new_rows = []
for _ in range(N_NEW):
    oid = str(uuid.uuid4())
    ts = rand_ts(START, END + datetime.timedelta(days=10))
    cid = f"{random.randint(1,4000)}"
    country = random.choice(COUNTRIES)
    amount = round(random.uniform(1.0,500.0),2)
    currency = CURRENCIES[country]
    status = random.choice(STATUSES)
    payment_method = random.choice(payment_methods)
    coupon_code = random.choice([None, f"CPN{random.randint(100,999)}", None])

    new_rows.append((oid, ts, cid, country, amount, currency, status, payment_method, coupon_code))

# Schema with new fields
schema_new = StructType([
    StructField("order_id", StringType(), False),
    StructField("order_timestamp", TimestampType(), False),
    StructField("customer_id", StringType(), False),
    StructField("country", StringType(), False),
    StructField("amount", DoubleType(), False),
    StructField("currency", StringType(), False),
    StructField("status", StringType(), False),
    StructField("payment_method", StringType(), True),
    StructField("coupon_code", StringType(), True)
])

df_new = spark.createDataFrame(new_rows, schema_new)
df_new = df_new.withColumn("order_date", to_date(col("order_timestamp")))

display(df_new.limit(5))


  return datetime.datetime.utcfromtimestamp(random.randint(int(s.timestamp()), int(e.timestamp())))


order_id,order_timestamp,customer_id,country,amount,currency,status,payment_method,coupon_code,order_date
59827ed8-8757-4e85-bc3d-29c3e48ffb5b,2025-10-08T13:12:15.000Z,948,IN,406.01,INR,CREATED,CARD,,2025-10-08
df5261eb-f127-4ed0-9b4d-281480773876,2025-10-13T03:15:40.000Z,1392,US,433.47,USD,PAID,UPI,CPN697,2025-10-13
726a6aa1-384f-4e10-bf95-85907ced7018,2025-10-16T18:36:41.000Z,489,GB,489.79,GBP,CREATED,COD,,2025-10-16
5e450b45-58c5-482c-a46b-864d902ac6b0,2025-10-25T07:26:14.000Z,1759,US,478.55,USD,CREATED,CARD,,2025-10-25
17af65c7-5a5a-4d49-946f-f5138100f933,2025-11-17T15:17:35.000Z,315,BR,321.13,BRL,CANCELLED,COD,CPN566,2025-11-17


In [0]:
(df_new.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")    # allow new columns
    .saveAsTable("main.default.orders_data")
)


In [0]:
#validating schema evolution
spark.sql("DESCRIBE TABLE main.default.orders_data").show(truncate=False)


+---------------+---------+-------+
|col_name       |data_type|comment|
+---------------+---------+-------+
|order_id       |string   |NULL   |
|order_timestamp|timestamp|NULL   |
|customer_id    |string   |NULL   |
|country        |string   |NULL   |
|amount         |double   |NULL   |
|currency       |string   |NULL   |
|status         |string   |NULL   |
|order_date     |date     |NULL   |
|payment_method |string   |NULL   |
|coupon_code    |string   |NULL   |
+---------------+---------+-------+



8. Demonstrate Updates & Deletes using Delta:

Mark some orders as CANCELLED.

Delete orders below a certain amount (e.g., test data cleanup).


In [0]:
deltaTable = DeltaTable.forName(spark, "main.default.orders_data")
deltaTable.update(
    condition="amount < 300 AND status != 'CANCELLED'",
    set={"status": lit("CANCELLED")}
)


DataFrame[num_affected_rows: bigint]

In [0]:
#verify if update happend or no
spark.table("main.default.orders_data") \
     .filter("amount < 300") \
     .select("order_id", "amount", "status") \
     .show(20, truncate=False)


+------------------------------------+------+---------+
|order_id                            |amount|status   |
+------------------------------------+------+---------+
|2b5ae8e1-1bd9-4780-bde4-72c4e7a2dd23|3.18  |CANCELLED|
|902ae20a-2850-49ed-b389-e4a12e703741|29.32 |CANCELLED|
|3c6425dc-3b31-4085-9447-c667f4d25761|22.8  |CANCELLED|
|01423cb6-5d4b-4aca-bc8f-3bf752f36c33|83.97 |CANCELLED|
|92006128-bba6-42d2-b827-8c316304d6a0|194.81|CANCELLED|
|761e2f7f-9bc5-4433-ac3d-4fa2d9fc6578|258.68|CANCELLED|
|8c013cfe-02a0-4c66-849c-a4c7224bf9d6|55.79 |CANCELLED|
|58fb276d-f0db-4920-adae-4c06b993054d|110.19|CANCELLED|
|d83c065e-1d7a-4638-8c40-d9ac1df59c84|107.07|CANCELLED|
|35d284e0-f219-4613-b4f5-400d4e9ee3f8|248.35|CANCELLED|
|820c96aa-ca01-4208-8c4f-431ddce325a5|276.22|CANCELLED|
|e4e7b7e8-5ab9-4700-9e94-66213fdedba6|126.35|CANCELLED|
|9b2502f5-4137-4838-ab63-5339804331a3|85.23 |CANCELLED|
|5f9a3825-2f6d-4bc4-8f3b-757c2f086262|215.7 |CANCELLED|
|d2176dd2-b2dd-4d88-94ef-e34d548a2de5|282.65|CAN

In [0]:
#deleting some rows
deltaTable.delete("amount < 100")


DataFrame[num_affected_rows: bigint]

In [0]:
#verify delete
print(
    "Rows remaining with amount < 100:",
    spark.table("main.default.orders_data")
         .filter("amount < 100")
         .count()
)


Rows remaining with amount < 100: 0


In [0]:
display(deltaTable.history())


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
6,2025-11-30T08:26:09.000Z,70588280163463,aspranavi1110@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(157788261568517),1130-075739-gilj182r-v2n,5.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 548356, p25FileSize -> 443455, numDeletionVectorsRemoved -> 1, minFileSize -> 443455, numAddedFiles -> 1, maxFileSize -> 443455, p75FileSize -> 443455, p50FileSize -> 443455, numAddedBytes -> 443455)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
5,2025-11-30T08:26:08.000Z,70588280163463,aspranavi1110@gmail.com,DELETE,"Map(predicate -> [""(amount#14765 < 100.0)""])",,List(157788261568517),1130-075739-gilj182r-v2n,4.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1049, numDeletionVectorsUpdated -> 0, numDeletedRows -> 2070, scanTimeMs -> 675, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 373)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
4,2025-11-30T08:25:18.000Z,70588280163463,aspranavi1110@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(157788261568517),1130-075739-gilj182r-v2n,3.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 4, numRemovedBytes -> 871324, p25FileSize -> 548356, numDeletionVectorsRemoved -> 3, minFileSize -> 548356, numAddedFiles -> 1, maxFileSize -> 548356, p75FileSize -> 548356, p50FileSize -> 548356, numAddedBytes -> 548356)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
3,2025-11-30T08:25:15.000Z,70588280163463,aspranavi1110@gmail.com,UPDATE,"Map(predicate -> [""((amount#13770 < 300.0) AND NOT (status#13772 = CANCELLED))""])",,List(157788261568517),1130-075739-gilj182r-v2n,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 3, numDeletionVectorsRemoved -> 1, numAddedChangeFiles -> 0, executionTimeMs -> 4167, numDeletionVectorsUpdated -> 1, scanTimeMs -> 901, numAddedFiles -> 1, numUpdatedRows -> 5928, numAddedBytes -> 314996, rewriteTimeMs -> 3265)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
2,2025-11-30T08:22:19.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 500, numOutputBytes -> 32390)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
1,2025-11-30T08:12:11.000Z,70588280163463,aspranavi1110@gmail.com,UPDATE,"Map(predicate -> [""order_id#12541 IN (c984cf53-b0b8-4093-801b-0a9ef4b54e23,1440a8eb-6c21-4f2a-bcdc-2da4958bf7cb,eadf958f-89ef-4a1d-b23f-d377c2d7ffc6,902ae20a-2850-49ed-b389-e4a12e703741,2b5ae8e1-1bd9-4780-bde4-72c4e7a2dd23)""])",,List(157788261568517),1130-075739-gilj182r-v2n,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 3025, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1366, numAddedFiles -> 1, numUpdatedRows -> 5, numAddedBytes -> 2612, rewriteTimeMs -> 1628)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
0,2025-11-30T08:06:46.000Z,70588280163463,aspranavi1110@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(157788261568517),1130-075739-gilj182r-v2n,,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 10000, numOutputBytes -> 521326)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


9. (Bonus) Optimize the table:
Use OPTIMIZE and optionally ZORDER on customer_id or order_date.
****

In [0]:
%sql
OPTIMIZE main.default.orders_data;


path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, null, null, 0, 0, 1, 1, true, 0, 0, 1764491269436, 1764491269915, 8, 0, null, List(0, 0), null, 10, 10, 0, 0, null)"


In [0]:
%sql
OPTIMIZE main.default.orders_data
ZORDER BY (customer_id);


path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 443455), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1764491354804, 1764491355277, 8, 0, null, List(0, 0), null, 10, 10, 0, 0, null)"


In [0]:
%sql
OPTIMIZE main.default.orders_data
ZORDER BY (order_date);


path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 443455), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1764491376922, 1764491377376, 8, 0, null, List(0, 0), null, 10, 10, 0, 0, null)"


10. (Bonus) Show how small file problems can occur with too many partitions and how OPTIMIZE helps.


In [0]:
# location for small-file demo
volume_small_path = "/Volumes/main/default/orders_vol/orders_smallfiles"

# Generate many tiny files
for i in range(100):    
    tiny_df = df_orders.limit(50)
    (tiny_df.write
           .format("delta")
           .mode("append")
           .option("mergeSchema", "true")
           .save(volume_small_path))


In [0]:
from delta.tables import DeltaTable

delta_small = DeltaTable.forPath(spark, "/Volumes/main/default/orders_vol/orders_smallfiles")


In [0]:
def count_files(path):
    total = 0
    for f in dbutils.fs.ls(path):
        if f.isDir():
            total += count_files(f.path)
        else:
            total += 1
    return total

before = count_files("/Volumes/main/default/orders_vol/orders_smallfiles")
print("Files BEFORE OPTIMIZE:", before)


Files BEFORE OPTIMIZE: 301


In [0]:
%sql
OPTIMIZE delta.`/Volumes/main/default/orders_vol/orders_smallfiles`;


path,metrics
dbfs:/Volumes/main/default/orders_vol/orders_smallfiles,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, null, null, 0, 0, 1, 1, true, 0, 0, 1764492038073, 1764492039545, 8, 0, null, List(0, 0), null, 8, 8, 0, 0, null)"


In [0]:
%sql
DESCRIBE HISTORY delta.`/Volumes/main/default/orders_vol/orders_smallfiles`;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
100,2025-11-30T08:39:25.000Z,70588280163463,aspranavi1110@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> false, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(157788261568517),1130-075739-gilj182r-v2n,99.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 100, numRemovedBytes -> 554200, p25FileSize -> 7746, numDeletionVectorsRemoved -> 0, minFileSize -> 7746, numAddedFiles -> 1, maxFileSize -> 7746, p75FileSize -> 7746, p50FileSize -> 7746, numAddedBytes -> 7746)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
99,2025-11-30T08:32:54.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,98.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
98,2025-11-30T08:32:53.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,97.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
97,2025-11-30T08:32:52.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,96.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
96,2025-11-30T08:32:51.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,95.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
95,2025-11-30T08:32:50.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,94.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
94,2025-11-30T08:32:49.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,93.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
93,2025-11-30T08:32:47.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,92.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
92,2025-11-30T08:32:46.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,91.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
91,2025-11-30T08:32:45.000Z,70588280163463,aspranavi1110@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(157788261568517),1130-075739-gilj182r-v2n,90.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 50, numOutputBytes -> 5542)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
