In [0]:
#Converting CSV to Delta format
events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    header=True,
    inferSchema=True
)

events.write.format("delta").mode("overwrite").save(
    "/Volumes/workspace/ecommerce/ecommerce_data/delta/events"
)


In [0]:
events_delta = spark.read.format("delta").load(
    "/Volumes/workspace/ecommerce/ecommerce_data/delta/events"
)

events_delta.show(5)


+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:01|      view|   1004775|2053013555631882655|electronics.smart...|xiaomi

In [0]:
#PySpark delta table
events.write.format("delta").mode("overwrite").saveAsTable("events_delta")


In [0]:
events_delta.show(5)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:01|      view|   1004775|2053013555631882655|electronics.smart...|xiaomi

In [0]:
%sql
CREATE TABLE IF NOT EXISTS events_delta1
USING DELTA
AS
SELECT *
FROM csv.`/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv`;


num_affected_rows,num_inserted_rows


In [0]:
events_delta2 = spark.read.table("events_delta1")

events_delta2.show(5)

+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|                 _c0|       _c1|       _c2|                _c3|                 _c4|   _c5|   _c6|      _c7|                 _c8|
+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche..

In [0]:
#error occurs because this data does not match table structure.
wrong_df = spark.createDataFrame(
    [("abc", "wrong_value")],
    ["id", "amount"]
)
wrong_df.write.format("delta").mode("append").save("/Volumes/workspace/ecommerce/ecommerce_data/delta/events")

Schema mismatch error caught: [_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: 02f8b7b1-efdf-4fb6-9fc0-45e1864cb70f).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- event_time: timestamp (nullable = true)
-- event_type: string (nullable = true)
-- product_id: integer (nullable = true)
-- category_id: long (nullable = true)
-- category_code: string (nullable = true)
-- brand: string (nullable = true)
-- price: double (nullable = true)
-- user_id: integer (nullable = true)
-- user_session: string (nullable = true)


Data schema:
root
-- id: string (nullable = true)
-- amount: string (nullable = true)

         
Table ACLs are enabled in this cluster, so automatic schema migrat

In [0]:
# handle duplicates insert
new_events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    header=True,
    inferSchema=True
)

new_events_clean = new_events.dropDuplicates(["event_time", "user_id"])

new_events_clean.write \
    .format("delta") \
    .mode("append") \
    .save("/Volumes/workspace/ecommerce/ecommerce_data/delta/events")
