Set Catalog

In [0]:
# Make `postnord` the session's current catalog; the tables read further down
# are referenced without a catalog prefix.
# NOTE(review): no schema is selected here, so those reads rely on whatever the
# current schema is — confirm that is intended.
spark.sql("USE CATALOG postnord")
# Echo the active catalog as a quick sanity check.
print(spark.catalog.currentCatalog())

Load tables

In [0]:
%python
# Source tables, referenced by unqualified name (resolved against the current
# catalog/schema of the session).
df_basic = spark.table("silver_item_basic")
df_scan = spark.table("silver_item_scan")
# NOTE(review): df_rules is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
df_rules = spark.table("end_time_rules")

For each (packageid, systemdato) in df_basic, keep only the row with the latest created_dt, breaking ties by the highest transaction_ref.

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# Rank the rows of each (packageid, systemdato) group, newest created_dt first;
# rows sharing a created_dt are ranked by transaction_ref, highest first.
window_basic = (
    Window
    .partitionBy("packageid", "systemdato")
    .orderBy(col("created_dt").desc(), col("transaction_ref").desc())
)

# Keep only the top-ranked row per group; "rn" is a scratch column and is
# removed again before the result is exposed.
df_basic_dedup = (
    df_basic
    .withColumn("rn", row_number().over(window_basic))
    .filter(col("rn") == 1)
    .drop("rn")
)

display(df_basic_dedup.limit(5))

For df_scan, deduplicate on (packageid, systemdato, scan_datetime, scan_type, reason_code, location, transaction_ref), keeping the row with the latest scan_created_dt in each group.

In [0]:
# Disabled earlier attempt, kept for reference. It ordered each partition by
# scan_datetime, but scan_datetime is also one of the partition keys, so every
# row in a partition has the same value and the ordering cannot pick a winner.
# The active cell below uses the same partition keys and orders by
# scan_created_dt instead.
#window_scan = Window.partitionBy(
 #   "packageid", "systemdato", "scan_datetime", "scan_type", "reason_code", "location", "transaction_ref"
#).orderBy(col("scan_datetime").desc())

#df_scan_dedup = df_scan.withColumn("rn", row_number().over(window_scan)).filter(col("rn") == 1).drop("rn")

#display(df_scan_dedup.limit(5))

In [0]:
# Rows that agree on all seven columns below are treated as the same scan
# event; within each such group the most recently created row ranks first.
window_scan = (
    Window
    .partitionBy(
        "packageid",
        "systemdato",
        "scan_datetime",
        "scan_type",
        "reason_code",
        "location",
        "transaction_ref",
    )
    .orderBy(col("scan_created_dt").desc())
)

# Keep only the top-ranked row per group; "rn" is a scratch column and is
# removed again before the result is exposed.
df_scan_dedup = (
    df_scan
    .withColumn("rn", row_number().over(window_scan))
    .filter(col("rn") == 1)
    .drop("rn")
)

display(df_scan_dedup.limit(5))

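Use a timestamp column as a watermark so that only records that are new or updated since the last run are processed: created_dt for df_basic and scan_created_dt for df_scan.
For batch jobs, keep only the records whose timestamp is later than the last processed watermark; on the first run (no watermark table yet) all records are processed.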


In [0]:
from pyspark.sql.functions import col
from pyspark.sql.utils import AnalysisException

# Resolve the high-water mark left by the previous run, then restrict both
# deduplicated frames to rows created after it.
#
# NOTE(review): the same watermark value is compared against created_dt (basic)
# and scan_created_dt (scan) — confirm one shared watermark is intended.
# NOTE(review): nothing in the visible cells writes control_watermark_table;
# presumably a downstream notebook advances it — confirm.
try:
    # Try to read the watermark table
    watermark_df = spark.read.table("control_watermark_table")
    # Aggregate rather than indexing collect()[0]: max() always yields exactly
    # one row, so an empty table produces None instead of an IndexError, and a
    # table holding several rows yields the newest watermark instead of
    # whichever row happens to come back first.
    # The action stays inside the try block because table resolution may be
    # deferred until an action runs (e.g. on Spark Connect).
    last_processed_dt = watermark_df.agg({"last_processed_dt": "max"}).first()[0]
except AnalysisException:
    # Table does not exist (first run)
    last_processed_dt = None
    print("Watermark table not found. Processing all records (first run).")
else:
    if last_processed_dt is None:
        print("Watermark table contains no watermark value. Processing all records.")
    else:
        print(f"Last processed watermark: {last_processed_dt}")

if last_processed_dt is None:
    # No usable watermark - process all records
    df_basic_incremental = df_basic_dedup
    df_scan_incremental = df_scan_dedup
else:
    # Subsequent runs - keep only rows created strictly after the watermark
    df_basic_incremental = df_basic_dedup.filter(col("created_dt") > last_processed_dt)
    df_scan_incremental = df_scan_dedup.filter(col("scan_created_dt") > last_processed_dt)

In [0]:
# Preview a few rows of the basic records selected for this run.
display(df_basic_incremental.limit(5))


In [0]:
# Preview a few rows of the scan records selected for this run.
display(df_scan_incremental.limit(5))

Join df_basic_incremental and df_scan_incremental on (packageid, systemdato, transaction_ref).

In [0]:
# Attach the basic-item attributes to every scan row. Joining on a list of
# column names keeps a single copy of each key column in the result.
#
# NOTE(review): this is an inner join between two frames that were each
# filtered by the watermark, so a new scan whose basic record was created
# before the watermark finds no match and is dropped — confirm this is intended.
# NOTE(review): df_basic_dedup keeps one row per (packageid, systemdato), so
# only scans carrying that row's transaction_ref can match — confirm.
join_keys = ["packageid", "systemdato", "transaction_ref"]

df_joined = df_scan_incremental.join(df_basic_incremental, on=join_keys, how="inner")

display(df_joined.limit(5))

In [0]:
# printSchema() writes the schema tree to stdout itself and returns None, so it
# is called directly; wrapping it in display() would only try to render None.
df_joined.printSchema()

In [0]:
#df_joined = df_joined.drop("created_dt")


In [0]:
#display(df_joined.printSchema())

Create the Delta tables consumed by the next notebook.

In [0]:
# Publish the joined result as the Delta table `items_joined` for the next
# notebook.
#
# The table is replaced with a single overwrite instead of DROP TABLE followed
# by a fresh create: if the write fails part-way, the previous version of the
# table is still there rather than having already been dropped.
# overwriteSchema lets the replacement carry a different schema than the
# existing table, which the drop-and-recreate approach also permitted.
(
    df_joined.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("items_joined")
)

In [0]:
# Publish this run's basic records as the Delta table
# `items_basic_incremental`.
#
# The table is replaced with a single overwrite instead of DROP TABLE followed
# by a fresh create: if the write fails part-way, the previous version of the
# table is still there rather than having already been dropped.
# overwriteSchema lets the replacement carry a different schema than the
# existing table, which the drop-and-recreate approach also permitted.
(
    df_basic_incremental.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("items_basic_incremental")
)
#df_scan_incremental.write.format("delta").saveAsTable("items_scan_incremental")