In [0]:
from pyspark.sql import functions as F

events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/",
    header=True,
    inferSchema=True
)

events.printSchema()
events.show(5)


root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|53

In [0]:
events.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("events_table")


In [0]:
%sql
CREATE TABLE IF NOT EXISTS events_delta
USING DELTA
AS
SELECT * FROM events_table;


num_affected_rows,num_inserted_rows


In [0]:
%sql
DESCRIBE DETAIL events_delta;


format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics,clusterByAuto
delta,dc326723-f8b8-4369-82f9-588e553ab7ad,workspace.default.events_delta,,,2026-01-12T14:21:08.094Z,2026-01-12T14:21:25.000Z,List(),List(),16,1947232774,"Map(delta.parquet.compression.codec -> zstd, delta.enableDeletionVectors -> true)",3,7,"List(appendOnly, deletionVectors, invariants)","Map(numRowsDeletedByDeletionVectors -> 0, numDeletionVectors -> 0)",False


In [0]:
try:
    wrong_schema = spark.createDataFrame(
        [("a", "b", "c")],
        ["x", "y", "z"]
    )

    wrong_schema.write.format("delta") \
        .mode("append") \
        .save("/delta/events")

except Exception as e:
    print("Schema enforcement error:")
    print(e)


Schema enforcement error:
Public DBFS root is disabled. Access is denied on path: /delta/events/_delta_log

JVM stacktrace:
java.lang.UnsupportedOperationException
	at com.databricks.backend.daemon.data.client.DisabledDatabricksFileSystem.rejectOperation(DisabledDatabricksFileSystem.scala:31)
	at com.databricks.backend.daemon.data.client.DisabledDatabricksFileSystem.getFileStatus(DisabledDatabricksFileSystem.scala:108)
	at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.$anonfun$getFileStatus$2(DatabricksFileSystemV2.scala:1227)
	at com.databricks.s3a.S3AExceptionUtils$.convertAWSExceptionToJavaIOException(DatabricksStreamUtils.scala:64)
	at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.$anonfun$getFileStatus$1(DatabricksFileSystemV2.scala:1224)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:510)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:616)
	at com.databricks

In [0]:

deduped_events = events.dropDuplicates([
    "user_id",
    "product_id",
    "event_time",
    "event_type"
])

deduped_events.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("events_table")

