# Schema Governance & De-duplication

In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, LongType, DoubleType

# Explicit schema for the raw e-commerce event CSVs.
# Every column is declared nullable, in the same order as the file's columns.
schema = (
    StructType()
    .add("event_time", TimestampType(), True)
    .add("event_type", StringType(), True)
    .add("product_id", LongType(), True)
    .add("category_id", LongType(), True)
    .add("category_code", StringType(), True)
    .add("brand", StringType(), True)
    .add("price", DoubleType(), True)
    .add("user_id", LongType(), True)
    .add("user_session", StringType(), True)
)

In [0]:
# Read both monthly extracts with the explicit schema (no type inference);
# the first line of each file is a header row.
df_october = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

df_november = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

Loading data using SQL & PySpark

In [0]:
# Persist October as a managed Delta table, replacing any earlier copy.
# overwriteSchema resets the table schema as well as the data: a plain
# overwrite keeps the existing schema, so a column added by a later
# mergeSchema append in this notebook would survive a re-run and the
# schema-enforcement demo would no longer start from the original schema.
(
    df_october.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.ecommerce.df_october")
)

# Expose November to SQL cells as a temp view named df_november.
df_november.createOrReplaceTempView("df_november")


In [0]:
%sql
-- Materialise the df_november temp view as a table (replaced on every run).
CREATE OR REPLACE TABLE workspace.ecommerce.df_november
AS
  SELECT *
  FROM df_november;

Schema Enforcement

In [0]:
%sql
-- Show the current column names and types of the October table.
DESCRIBE TABLE workspace.ecommerce.df_october

In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, LongType, DoubleType
from datetime import datetime, date

# `T` must be imported in this cell: the schema below is written with the
# `T.` prefix, and on a fresh kernel no earlier cell defines that alias.
from pyspark.sql import Row, types as T

# Schema of the incoming batch: the October table's columns plus one extra
# column (event_date) that the table does not have.
incoming_schema = T.StructType([
    T.StructField("event_time", T.TimestampType(), True),
    T.StructField("event_date", T.DateType(), True),    # NEW column
    T.StructField("event_type", T.StringType(), True),
    T.StructField("product_id", T.LongType(), True),
    T.StructField("category_id", T.LongType(), True),
    T.StructField("category_code", T.StringType(), True),
    T.StructField("brand", T.StringType(), True),
    T.StructField("price", T.DoubleType(), True),       # same type as the table column
    T.StructField("user_id", T.LongType(), True),
    T.StructField("user_session", T.StringType(), True),
])

# Append WITHOUT mergeSchema. The interesting failure is Delta rejecting the
# extra column; note that the handler below reports *any* exception raised in
# the try body under the same "schema enforcement" label.
try:
    bad_row = Row(
        event_time=datetime.now(),       # Python datetime
        event_date=date.today(),         # Python date
        event_type="view",
        product_id=123,
        category_id=456,
        category_code="electronics",
        brand="BrandX",
        # Float literal: PySpark's schema verification does not accept a
        # Python int for a DoubleType field, which would fail here before
        # Delta ever sees the write.
        price=99.0,
        user_id=789,
        user_session="sess1"
    )
    print(f"{bad_row}")
    df_bad = spark.createDataFrame([bad_row], incoming_schema)
    print("DF created OK (unexpected). Schema:")
    df_bad.printSchema()

    # Force evaluation of the DataFrame before attempting the write.
    _ = df_bad.count()
    df_bad.write.format("delta").mode("append").saveAsTable(
        "workspace.ecommerce.df_october")
    print("Write succeeded")
except Exception as e:
    print(f"schema enforcement : {e}")
    


Schema Evolution

In [0]:

from datetime import datetime, date
from pyspark.sql import Row, functions as F, types as T

# Schema of the incoming batch: the October table's columns plus one extra
# column (event_date) that the table does not have yet.
incoming_schema = T.StructType([
    T.StructField("event_time", T.TimestampType(), True),
    T.StructField("event_date", T.DateType(), True),    # NEW column
    T.StructField("event_type", T.StringType(), True),
    T.StructField("product_id", T.LongType(), True),
    T.StructField("category_id", T.LongType(), True),
    T.StructField("category_code", T.StringType(), True),
    T.StructField("brand", T.StringType(), True),
    T.StructField("price", T.DoubleType(), True),       # same type as the table column
    T.StructField("user_id", T.LongType(), True),
    T.StructField("user_session", T.StringType(), True),
])

# Field order matches incoming_schema.
bad_row = Row(
    event_time=datetime.now(),       # Python datetime
    event_date=date.today(),         # Python date
    event_type="view",
    product_id=123,
    category_id=456,
    category_code="electronics",
    brand="BrandX",
    # Float literal: PySpark's schema verification does not accept a Python
    # int for a DoubleType field, so `99` could fail before the write.
    price=99.0,
    user_id=789,
    user_session="sess1"
)

df_bad = spark.createDataFrame([bad_row], incoming_schema)

# Same append as the enforcement demo, but with mergeSchema enabled so the
# extra event_date column is added to the table instead of being rejected.
# Each run of this cell appends one more row.
(df_bad.write
  .format("delta")
  .mode("append")
  .option("mergeSchema", "true")
  .saveAsTable("workspace.ecommerce.df_october"))



In [0]:
# Preview ten rows of the October table, sorted by event_date descending.
display(
    spark.table("workspace.ecommerce.df_october")
    .sort("event_date", ascending=False)
    .limit(10)
)

Handling duplicate inserts

In [0]:
# Take ten rows that already exist in the November table; merging them back
# later exercises the "matched" path. No ordering is applied before limit().
dup_events = spark.table("workspace.ecommerce.df_november").limit(10)

In [0]:
from datetime import datetime, date
from pyspark.sql import Row

# One brand-new event (timestamped "now"), intended to exercise the insert
# path of the merge. Field order matches the columns of dup_events.
add_row = Row(
    event_time=datetime.now(),       # Python datetime
    event_type="view",
    product_id=123,
    category_id=456,
    category_code="electronics",
    brand="BrandX",
    price=99.0,                      # float, to match the double price column
    user_id=789,
    user_session="sess1"
)

# Build the extra row with the exact schema of dup_events instead of letting
# Spark infer one (inference would type an integer price as bigint), then
# combine by column name rather than by position.
new_event = spark.createDataFrame([add_row], schema=dup_events.schema)

# Add a row to the dup_events table for insert action.
# Note: re-running this cell without re-running the cell that defines
# dup_events appends one more row each time.
dup_events = dup_events.unionByName(new_event)


In [0]:
# Register the batch as a temp view for the SQL MERGE cell.
# Temp view names cannot carry a database prefix, and the MERGE statement
# refers to the view as plain `dup_events`, so register it under that name.
dup_events.createOrReplaceTempView("dup_events")

Merge / Upsert action - 10 updates + 1 insert

In [0]:
%sql
-- Upsert the dup_events batch into the November table.
-- Rows are matched on (event_time, event_type, user_id, user_session).
-- All four key columns are nullable, so the null-safe operator <=> is used:
-- with plain '=' a row whose key contains NULL never matches its existing
-- copy and would be inserted again as a duplicate.
MERGE INTO workspace.ecommerce.df_november AS T
USING dup_events AS S
ON  T.event_time   <=> S.event_time
AND T.event_type   <=> S.event_type
AND T.user_id      <=> S.user_id
AND T.user_session <=> S.user_session
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *