##### 1. Setup & Load Data

In [0]:
from pyspark.sql import functions as F
from delta.tables import *

# Source data: the October 2019 events produced on Day 3.
# When starting from a fresh session, this re-reads them from the processed parquet folder.
base_path = "/Volumes/workspace/ecommerce/ecommerce_data/processed_data"
events = spark.read.format("parquet").load(f"{base_path}/oct_2019")

##### Task 1: Convert to Delta Format

In [0]:
# Convert the events DataFrame to Delta format.
# Step 1: target location inside the Volume (not /tmp).
delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/delta/events_silver"

# Step 2: write as Delta, replacing whatever is already stored at that path.
delta_writer = events.write.format("delta").mode("overwrite")
delta_writer.save(delta_path)

print(f"Data successfully written to Delta format at: {delta_path}")

Data successfully written to Delta format at: /Volumes/workspace/ecommerce/ecommerce_data/delta/events_silver


##### Task 2: Create Delta Tables (SQL & PySpark)

In [0]:
# Managed table via the DataFrame API: (re)creates events_table from `events`.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("events_table")
)

# Managed table via SQL: a CTAS copy of events_table.
# IF NOT EXISTS means an already-existing events_delta is left untouched.
create_events_delta_sql = """
    CREATE TABLE IF NOT EXISTS events_delta
    USING DELTA
    AS SELECT * FROM events_table
"""
spark.sql(create_events_delta_sql)

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

##### Task 3: Test schema enforcement

In [0]:

# Schema-enforcement check: appending a DataFrame whose columns do not match the
# Delta table must be rejected by Delta.
#
# Path of the Delta table written in Task 1 (must be the SAME location).
delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/delta/events_silver"

print("Attempting to write BAD data...")

try:
    # Create dummy data with WRONG columns (x, y, z)
    wrong_schema_df = spark.createDataFrame([("a", "b", "c")], ["x", "y", "z"])

    # Try to append to your REAL table
    wrong_schema_df.write \
        .format("delta") \
        .mode("append") \
        .save(delta_path)

except Exception as e:
    # Only a schema-mismatch error proves that enforcement worked. Other failures
    # (missing path, permissions, ...) must not be reported as success, so the
    # check looks at the message text rather than at the exception type.
    error_msg = str(e)
    if "schema mismatch" in error_msg.lower():
        print("\n SUCCESS: Schema Enforcement worked! Delta blocked the bad columns.")
        # Drop the JVM stack trace (if present) to keep the output readable.
        print(f"Error Details: {error_msg.split('stack trace')[0]}")
    else:
        print("\n UNEXPECTED ERROR: Something else went wrong (Permissions/Path issue).")
        print(f"Error: {error_msg}")
else:
    # No exception means the bad rows were actually appended: the check failed
    # and the table now needs attention.
    print("\n FAILURE: The write succeeded - schema enforcement did NOT block the bad columns.")

Attempting to write BAD data...

 SUCCESS: Schema Enforcement worked! Delta blocked the bad columns.
Error Details: [_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: d3bd0b66-4718-4ce8-86e2-328ac4bd0838).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- event_time: timestamp (nullable = true)
-- event_type: string (nullable = true)
-- product_id: integer (nullable = true)
-- category_id: long (nullable = true)
-- category_code: string (nullable = true)
-- brand: string (nullable = true)
-- price: double (nullable = true)
-- user_id: integer (nullable = true)
-- user_session: string (nullable = true)


Data schema:
root
-- x: string (nullable = true)
-- y: string (nullable = tru

##### Task 4: Handle duplicate inserts

In [0]:
# Upsert a small "updates" DataFrame into events_table with a Delta MERGE.
# Scenario:
# - User 518958788 (existing) bought the item again, but the price changed to 999.99
# - User 999999999 (new) is a brand new user
# Note: only a subset of columns is supplied to keep the example simple.
columns = ["user_id", "product_id", "event_time", "event_type", "price", "category_code", "brand"]

new_data = [
    # Existing User (Update Price)
    (518958788, 1002544, "2019-10-01 00:00:00", "purchase", 999.99, "electronics.smartphone", "apple"),
    # New User (Insert)
    (999999999, 1234567, "2019-10-01 00:00:00", "purchase", 499.00, "electronics.console", "sony")
]

# The source schema must match the target table for a clean merge.
# createDataFrame infers the literal types (event_time arrives as a string), so each
# supplied column is cast explicitly to the type of the same-named column in
# events_table instead of relying on implicit casts during the merge.
target_table = spark.read.table("events_table")  # the managed table created earlier
target_schema = target_table.schema
updates_df = spark.createDataFrame(new_data, columns).select(
    *[F.col(name).cast(target_schema[name].dataType).alias(name) for name in columns]
)

# Perform the merge through the DeltaTable API
deltaTable = DeltaTable.forName(spark, "events_table")

print("Merging data...")

deltaTable.alias("target").merge(
    updates_df.alias("source"),
    # The condition: match on user ID and product ID.
    # NOTE: this key is not unique per event, so EVERY existing row for a matching
    # (user_id, product_id) pair gets its price updated, not just one purchase row.
    "target.user_id = source.user_id AND target.product_id = source.product_id"
).whenMatchedUpdate(set = {
    # If a match is found, update the price
    "price": "source.price"
}).whenNotMatchedInsert(values = {
    # If there is no match, insert the supplied columns (the others are filled with null)
    "user_id": "source.user_id",
    "product_id": "source.product_id",
    "event_time": "source.event_time",
    "event_type": "source.event_type",
    "price": "source.price",
    "category_code": "source.category_code",
    "brand": "source.brand"
}).execute()

print("Merge Complete!")

# Verify the changes
# Check the updated user
spark.sql("SELECT user_id, product_id, price FROM events_table WHERE user_id = 518958788 AND product_id = 1002544").show()
# Check the new user
spark.sql("SELECT user_id, product_id, price FROM events_table WHERE user_id = 999999999").show()

Merging data...
Merge Complete!
+---------+----------+------+
|  user_id|product_id| price|
+---------+----------+------+
|518958788|   1002544|999.99|
|518958788|   1002544|999.99|
|518958788|   1002544|999.99|
+---------+----------+------+

+---------+----------+-----+
|  user_id|product_id|price|
+---------+----------+-----+
|999999999|   1234567|499.0|
+---------+----------+-----+

