In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [0]:
# Read the high-water mark for the 'product' feed.
# take(1) yields an empty list when the row is absent, so a missing watermark
# no longer raises IndexError the way collect()[0][0] did.
watermark_rows = (
    spark.table("otc.silver.watermark")
    .filter(F.col("table_name") == "product")
    .select("watermark_ts")
    .take(1)
)
watermark_ts = watermark_rows[0][0] if watermark_rows else None

# Incremental slice of bronze: only rows ingested after the watermark.
# The comparison uses a typed literal instead of pasting the value into SQL text,
# which avoids the invalid `TIMESTAMP 'None'` literal on a NULL watermark and
# avoids re-parsing a Python-formatted timestamp string.
# With no usable watermark (missing row or NULL value) every bronze row is eligible.
bronze_products = spark.table("otc.bronze.src_product")
if watermark_ts is not None:
    bronze_products = bronze_products.filter(F.col("ingest_ts") > F.lit(watermark_ts))

# Dedup in silver: keep one row per product_id, the most recently ingested one.
# updated_at is a secondary sort key so rows sharing an ingest_ts are ranked
# consistently rather than arbitrarily.
latest_first = Window.partitionBy("product_id").orderBy(
    F.desc("ingest_ts"), F.desc("updated_at")
)
eligible_product_records = (
    bronze_products
    .withColumn("SequenceOfRecord", F.row_number().over(latest_first))
    .filter(F.col("SequenceOfRecord") == 1)
    .drop("SequenceOfRecord")
)

eligible_product_records.display()
# Expose the result to the %sql cells under the same name.
eligible_product_records.createOrReplaceTempView("eligible_product_records")

In [0]:
%sql
-- Show the column names and types of the silver product table.
DESCRIBE TABLE otc.silver.product

In [0]:
%sql
-- Upsert the deduplicated bronze slice (temp view eligible_product_records)
-- into the silver product table, matching rows on product_id.
MERGE INTO otc.silver.product AS tgt
USING eligible_product_records AS src
  ON tgt.product_id = src.product_id
-- Known product: overwrite its attributes with the incoming row.
-- product_sk is assigned to itself, i.e. the existing surrogate key is retained.
WHEN MATCHED THEN UPDATE SET
  product_sk   = tgt.product_sk,
  product_id   = src.product_id,
  product_name = src.product_name,
  category     = src.category,
  list_price   = src.list_price,
  is_active    = src.is_active,
  updated_at   = src.updated_at,
  ingest_ts    = src.ingest_ts
-- New product: build the surrogate key from a hash of (product_id, ingest_ts).
WHEN NOT MATCHED THEN INSERT (
  product_sk,
  product_id,
  product_name,
  category,
  list_price,
  is_active,
  updated_at,
  ingest_ts
) VALUES (
  abs(xxhash64(src.product_id, src.ingest_ts)),
  src.product_id,
  src.product_name,
  src.category,
  src.list_price,
  src.is_active,
  src.updated_at,
  src.ingest_ts
)

In [0]:
%sql
-- Advance the 'product' watermark to the newest ingest_ts present in silver.
-- max() over an empty table is NULL; coalesce then keeps the current watermark
-- instead of overwriting it with NULL, which would break the next incremental read.
UPDATE otc.silver.watermark
  SET watermark_ts = coalesce(
        (SELECT max(ingest_ts) FROM otc.silver.product),
        watermark_ts
      )
  WHERE table_name = 'product'

In [0]:
%sql
-- Inspect the silver product table after the merge.
SELECT *
FROM otc.silver.product

In [0]:
%sql
-- Inspect the watermark table after the update.
SELECT *
FROM otc.silver.watermark