In [0]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a Spark session named "DeltaLakeDay4"
# with the Delta Lake SQL extension and the Delta catalog configured.
spark = (
    SparkSession.builder
    .appName("DeltaLakeDay4")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)


In [0]:
# Load the raw October 2019 CSV: the first row is treated as the header and
# column types are inferred by Spark rather than declared up front.
events = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

# Preview the first rows, then print the inferred schema.
events.show()
events.printSchema()



In [0]:
# Convert the CSV-backed DataFrame to Delta format.
# Location of the path-based Delta table; later cells read, append to and
# merge into this same path.
delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/delta/events"

# "overwrite" replaces whatever is already stored at delta_path.
events.write.save(delta_path, format="delta", mode="overwrite")


In [0]:
# Read the Delta table back from its path and preview it to confirm the write.
spark.read.load(delta_path, format="delta").show()


In [0]:
# Persist the same DataFrame as a catalog table in Delta format, replacing
# any existing contents of the table.
events.write.saveAsTable(
    "workspace.ecommerce.events_table",
    format="delta",
    mode="overwrite",
)


In [0]:
%sql
-- Create a Delta copy of events_table via CTAS.
-- OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails with
-- "table already exists" on the second execution (e.g. Restart & Run All).
CREATE OR REPLACE TABLE workspace.ecommerce.events_delta
USING DELTA
AS SELECT * FROM workspace.ecommerce.events_table;


In [0]:
# Schema-enforcement demo: try to append a one-row DataFrame whose columns
# (x, y, z) are not the columns of the DataFrame written to delta_path.
wrong_schema = spark.createDataFrame(
    [("a", "b", "c")],
    ["x", "y", "z"]
)

try:
    wrong_schema.write \
        .format("delta") \
        .mode("append") \
        .save(delta_path)
except Exception as e:
    # NOTE(review): this catch is broad, so any failure of the write (not only
    # a schema mismatch) is reported with this message. Read the printed error
    # to confirm it really is the schema check that rejected the append.
    print("Schema enforcement working:", e)
else:
    # The append going through means the mismatched row is now in the table.
    # Fail loudly instead of silently continuing with polluted data.
    raise AssertionError(
        "Schema enforcement did NOT reject the mismatched append; "
        "the table at delta_path now contains the test row."
    )


In [0]:
from delta.tables import DeltaTable

# Handle to the path-based Delta table written earlier in the notebook.
delta_table = DeltaTable.forPath(spark, delta_path)

# Insert-only merge: target is aliased "t", the events DataFrame "s".
# Source rows with no target row satisfying the product_id condition are
# inserted with all their columns; there is no whenMatched clause, so matched
# rows are left unchanged.
(
    delta_table.alias("t")
    .merge(events.alias("s"), "t.product_id = s.product_id")
    .whenNotMatchedInsertAll()
    .execute()
)


In [0]:
%sql
-- Show the Delta transaction log (one row per commit) for the catalog table.
-- NOTE(review): the MERGE earlier in this notebook ran against the path-based
-- table at delta_path, not this catalog table, so it presumably will not
-- appear here. Confirm which table's history was intended.
DESCRIBE HISTORY
    workspace.ecommerce.events_table;
