Ingest data from the Silver tables (CSV files) and the EndTimeRules workbook

In [0]:
# Read the silver "item basic" CSV extract: the first line supplies the column
# names and the column types are inferred from the data.
df_basic = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/postnord/default/data/Silver_item_basic_20251222.csv")
)

# Preview a few rows; limit() returns a Spark DataFrame, which display() renders.
display(df_basic.limit(5))

In [0]:
# Print the column names and inferred types of df_basic.
df_basic.printSchema()

In [0]:
# Read the silver "item scans" CSV extract: the first line supplies the column
# names and the column types are inferred from the data.
df_scan = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/postnord/default/data/Silver_item_scans_20251222.csv")
)

# Preview a few rows; limit() returns a Spark DataFrame, which display() renders.
display(df_scan.limit(5))

In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly instead of passing its (empty) return value to display().
df_scan.printSchema()

In [0]:
from functools import reduce

from pyspark.sql.functions import lit, monotonically_increasing_id

# Load the rules workbook without a header so that the first row (which holds
# the real column names) arrives as ordinary data.
df_rules_raw = (
    spark.read.format("excel")
    .option("header", "false")
    .option("inferSchema", "true")
    .load("/Volumes/postnord/default/data/EndTimeRules.xlsx")
)

# The first row carries the actual column names.
header = df_rules_raw.first()
if header is None:
    raise ValueError("EndTimeRules.xlsx contains no rows; cannot derive column names")

# Identify the header row by its content rather than by a generated row id:
# monotonically_increasing_id() encodes the partition number, so "id > 0" only
# removes the header when it happens to be the first record of partition 0.
# eqNullSafe keeps the comparison well-defined for empty header cells.
# Any later row that repeats the header values exactly is removed as well.
is_header_row = reduce(
    lambda matched_so_far, cell_matches: matched_so_far & cell_matches,
    [
        df_rules_raw[raw_name].eqNullSafe(lit(header_value))
        for raw_name, header_value in zip(df_rules_raw.columns, header)
    ],
)

# Rename positionally with toDF() so the result does not depend on how the
# reader names header-less columns (e.g. "_c0", "_c1", ...).
df_rules = df_rules_raw.filter(~is_header_row).toDF(*[str(col_name) for col_name in header])

# limit() returns a Spark DataFrame, which display() renders.
display(df_rules.limit(5))



In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly instead of passing its (empty) return value to display().
df_rules.printSchema()

For each (packageid, systemdato) in df_basic, keep only the row with the latest created_dt; ties on created_dt are broken by the highest transaction_ref.

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# Rank the rows inside every (packageid, systemdato) group: newest created_dt
# first, and the highest transaction_ref first among equal created_dt values.
window_basic = (
    Window.partitionBy("packageid", "systemdato")
    .orderBy(col("created_dt").desc(), col("transaction_ref").desc())
)

# Keep the top-ranked row of each group and discard the helper rank column.
df_basic_dedup = (
    df_basic
    .withColumn("rn", row_number().over(window_basic))
    .where(col("rn") == 1)
    .drop("rn")
)

display(df_basic_dedup.limit(5))

For df_scan, deduplicate on (packageid, systemdato, scan_datetime, scan_type, reason_code, location, transaction_ref), keeping the row with the latest created_dt in each group.

In [0]:
# Disabled earlier attempt, kept for reference only.
# It ordered each partition by scan_datetime, but scan_datetime is itself one of
# the partition columns, so every row in a partition shares the same value and
# the ordering could not decide which duplicate to keep.
#window_scan = Window.partitionBy(
 #   "packageid", "systemdato", "scan_datetime", "scan_type", "reason_code", "location", "transaction_ref"
#).orderBy(col("scan_datetime").desc())

#df_scan_dedup = df_scan.withColumn("rn", row_number().over(window_scan)).filter(col("rn") == 1).drop("rn")

#display(df_scan_dedup.limit(5))

In [0]:
# Rows that agree on all of these columns are treated as the same scan event;
# within each such group the newest created_dt is ranked first.
window_scan = (
    Window.partitionBy(
        "packageid",
        "systemdato",
        "scan_datetime",
        "scan_type",
        "reason_code",
        "location",
        "transaction_ref",
    )
    .orderBy(col("created_dt").desc())
)

# Keep the top-ranked row of each group and discard the helper rank column.
df_scan_dedup = (
    df_scan
    .withColumn("rn", row_number().over(window_scan))
    .where(col("rn") == 1)
    .drop("rn")
)

display(df_scan_dedup.limit(5))

Use a column (e.g., created_dt) to only process new or updated records from df_basic since the last run.
For batch jobs, filter records where created_dt > last processed date.


In [0]:
# Incremental-load switch.
# Set this flag to True for the first run, when there is no previously
# processed data to compare against and every deduplicated record is processed.
is_first_run = True

# High-water mark: the newest created_dt already present in the TARGET table.
# It stays None on the first run and when the target table holds no rows
# (max() over an empty table is NULL).
last_processed_dt = None
if not is_first_run:
    last_processed_dt = (
        spark.table("postnord.default.basic_processed")
        .agg({"created_dt": "max"})
        .first()[0]
    )
    print(f"Last processed:  {last_processed_dt}")

if last_processed_dt is None:
    # No watermark available - process all records. Without this fallback an
    # empty target table would yield "created_dt > NULL", which matches no row
    # and would silently produce empty incremental frames.
    df_basic_incremental = df_basic_dedup
    df_scan_incremental = df_scan_dedup
else:
    # Subsequent runs - keep only records created after the watermark.
    df_basic_incremental = df_basic_dedup.filter(col("created_dt") > last_processed_dt)
    df_scan_incremental = df_scan_dedup.filter(col("created_dt") > last_processed_dt)

In [0]:
# Preview the first rows of the basic records selected for this run.
display(df_basic_incremental.limit(5))


In [0]:
# Preview the first rows of the scan records selected for this run.
display(df_scan_incremental.limit(5))

Join df_basic_incremental and df_scan_incremental on (packageid, systemdato, transaction_ref).

In [0]:
# Pair every selected scan with its basic record. The inner join keeps only
# scans whose (packageid, systemdato, transaction_ref) also exists in the
# selected basic records.
# NOTE(review): created_dt exists on both inputs and is not a join key, so the
# result presumably carries two created_dt columns - confirm before referencing
# that column downstream.
df_joined = df_scan_incremental.join(
    df_basic_incremental, on=["packageid", "systemdato", "transaction_ref"], how="inner"
)

display(df_joined.limit(5))

Join df_joined with df_rules on (scan_type, reason_code, location, product).


In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly instead of passing its (empty) return value to display().
df_rules.printSchema()

In [0]:
# Rename the column "location " (note the trailing space) to "location" so that
# it can be used as a join key by name.
df_rules = df_rules.withColumnRenamed("location ", "location")

In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly instead of passing its (empty) return value to display().
df_rules.printSchema()

In [0]:
# Look up the matching rule for every joined record. The left join keeps
# records that have no matching rule; their rule columns are null.
df_with_rules = df_joined.join(
    df_rules, on=["scan_type", "reason_code", "location", "product"], how="left"
)

In [0]:
# Preview the first rows of the records enriched with their rules.
display(df_with_rules.limit(5))