In [0]:
# Day 4
# Reloading October data:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, LongType, DoubleType
from pyspark.sql.utils import AnalysisException

# Columns of the events CSV as (name, Spark type) pairs, in the order they
# are handed to the schema.
event_columns = [
    ("event_time", TimestampType()),
    ("event_type", StringType()),
    ("product_id", LongType()),
    ("category_id", LongType()),
    ("category_code", StringType()),
    ("brand", StringType()),
    ("price", DoubleType()),
    ("user_id", LongType()),
    ("user_session", StringType()),
]

# Every column is declared nullable (third StructField argument is True).
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in event_columns]
)

# Read the October CSV with the explicit schema instead of inferring types;
# the first row of the file is treated as a header.
events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    schema=schema,
    header=True,
)

In [0]:
# Directory inside the volume where the Delta files for October are stored.
delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct"

# Write the events in Delta format, using overwrite mode for this location.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)

Verifying Delta Location

In [0]:
# List the contents of the Delta location. Reusing delta_path (rather than
# repeating the literal path) keeps this check pointed at the directory
# that was actually written.
display(dbutils.fs.ls(delta_path))

In [0]:
# Register the same events as a Delta table named events_table
# (saved by name rather than by path), using overwrite mode.
delta_writer = events.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("events_table")

In [0]:
# Sanity check: query the row count of events_table and print it.
row_count_df = spark.sql("SELECT COUNT(*) FROM events_table")
row_count_df.show()

Using a SQL-based approach

In [0]:
%sql
-- Show the catalog and schema that are current for this session.
SELECT current_catalog(), current_schema();

In [0]:
%sql
-- Create (or replace) events_delta as a Delta copy of events_table.
-- The source is referenced unqualified so that it resolves in the current
-- catalog/schema, the same place the unqualified target is created,
-- rather than assuming that location is workspace.default.
CREATE OR REPLACE TABLE events_delta
USING DELTA
AS
SELECT *
FROM events_table;

In [0]:
%sql
-- List the tables visible in the current schema.
SHOW TABLES;

In [0]:
# Demonstrate Delta schema enforcement: try to append a DataFrame whose
# columns (x, y, z) do not match the table stored at delta_path.
# The DataFrame is built outside the try block so that a failure to create
# it is not misreported as schema enforcement.
wrong_schema = spark.createDataFrame(
    [("a", "b", "c")],
    ["x", "y", "z"]
)

try:
    wrong_schema.write.format("delta") \
        .mode("append") \
        .save(delta_path)

except AnalysisException as e:
    # Only analysis errors (which is how a schema mismatch is reported) are
    # treated as the expected outcome; any other failure propagates.
    print("Schema enforcement triggered:")
    print(e)

else:
    # The append was expected to be rejected; make an unexpected success visible.
    print("WARNING: mismatched-schema append succeeded; schema enforcement was NOT triggered.")

In [0]:
# checking duplicate entry: append the same events DataFrame twice and
# print the table's row count after each append.
for label in ("After first append:", "After second append:"):
    (
        events.write
        .format("delta")
        .mode("append")
        .save(delta_path)
    )

    # Re-read the Delta location so the count reflects the append just made.
    print(label, spark.read.format("delta").load(delta_path).count())