Set Catalog

In [0]:
# Switch the session to the project catalog before any table is referenced.
catalog_name = "postnord"
spark.sql(f"USE CATALOG {catalog_name}")

# Echo the active catalog as a quick confirmation that the switch took effect.
print(spark.catalog.currentCatalog())

Load tables

In [0]:
%python
df_joined = spark.table("items_joined")

df_rules = spark.table("end_time_rules")

Join df_joined with df_rules on (scan_type, reason_code, location, product).


In [0]:
# Show a five-row sample of the rules table.
rules_sample = df_rules.limit(5)
display(rules_sample)

In [0]:
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in display() (that would only render "None").
df_rules.printSchema()

In [0]:
# Disabled: would rename a rules column whose name has a trailing space
# ("location ") to "location".
# NOTE(review): presumably no longer needed because the source table was fixed —
# confirm, then delete this cell.
#df_rules = df_rules.withColumnRenamed("location ", "location")

In [0]:
# Print the rules schema again. printSchema() writes to stdout and returns None,
# so calling it directly avoids displaying a meaningless "None".
df_rules.printSchema()

In [0]:
# Print the schema of the item/scan frame. printSchema() returns None, so it is
# called directly rather than passed to display().
df_joined.printSchema()

In [0]:
# Inspect the data types of the four join-key columns on the df_joined side.
joined_key_cols = ["scan_type", "reason_code", "location", "product"]
print("df_joined types:")
df_joined.select(*joined_key_cols).dtypes


In [0]:
# Inspect the data types of the four join-key columns on the df_rules side.
rules_key_cols = ["scan_type", "reason_code", "location", "product"]
print("df_rules types:")
df_rules.select(*rules_key_cols).dtypes

In [0]:
# Disabled diagnostics: join on a growing subset of the key columns to find the
# column at which the inner-join match count drops.
# NOTE(review): kept for reference only; delete once the join is confirmed stable.
#df_test = df_joined.join(df_rules, on=["scan_type"], how="inner")
#print(f"Match on scan_type only: {df_test.count()}")

#df_test = df_joined.join(df_rules, on=["scan_type", "reason_code"], how="inner")
#print(f"Match on scan_type + reason_code: {df_test.count()}")

#df_test = df_joined.join(df_rules, on=["scan_type", "reason_code", "location"], how="inner")
#print(f"Match on scan_type + reason_code + location: {df_test.count()}")




In [0]:
# List every distinct location value on the df_joined side, untruncated so that
# stray whitespace or casing differences stay visible.
print("df_joined locations:")
joined_locations = df_joined.select("location").distinct()
joined_locations.show(truncate=False)

In [0]:
# List every distinct location value on the df_rules side, untruncated so that
# stray whitespace or casing differences stay visible.
print("df_rules locations:")
rules_locations = df_rules.select("location").distinct()
rules_locations.show(truncate=False)


In [0]:
from pyspark.sql.functions import col, count, lit, when, upper, trim

join_columns = ["scan_type", "reason_code", "location", "product"]


def _normalize_join_keys(df, key_columns):
    """Return ``df`` with every column in ``key_columns`` normalised for joining.

    Each key is trimmed and upper-cased; null keys are replaced by the literal
    string "NULL" so that null keys on both sides compare equal in the join.
    The normalised values overwrite the original columns.
    """
    for key in key_columns:
        df = df.withColumn(
            key,
            when(col(key).isNull(), "NULL").otherwise(upper(trim(col(key)))),
        )
    return df


# Step 1 & 2: Clean both dataframes with the same normalisation.
df_joined_clean = _normalize_join_keys(df_joined, join_columns)
df_rules_clean = _normalize_join_keys(df_rules, join_columns)

# Step 3: Join. A left join keeps every item row; rows without a matching rule
# get nulls in the rule columns.
# NOTE(review): if df_rules holds several rows with the same normalised key,
# item rows will be duplicated here — confirm the rule keys are unique.
df_with_rules = df_joined_clean.join(
    df_rules_clean,
    on=join_columns,
    how="left",
)

# Step 4: Verify. Compute both counts in a single aggregation instead of three
# separate count() actions, each of which would re-run the whole join.
# count(<column>) only counts non-null values, i.e. rows that picked up a rule.
match_stats = df_with_rules.agg(
    count(lit(1)).alias("total_rows"),
    count(col("is_end_event")).alias("matched_rows"),
).first()
total_rows = match_stats["total_rows"]
matched_rows = match_stats["matched_rows"]

print(f"Total rows: {total_rows}")
print(f"Matched (has rule): {matched_rows}")
print(f"Unmatched (no rule): {total_rows - matched_rows}")

In [0]:
# Show a five-row sample of the joined result.
with_rules_sample = df_with_rules.limit(5)
display(with_rules_sample)

In [0]:
from pyspark.sql.functions import col, count, lit

# Check schema - is_end_event should now be present
df_with_rules.printSchema()

# Check for nulls in is_end_event (unmatched rows from left join)
df_with_rules.select("is_end_event").distinct().show()

# Count matched vs unmatched in one aggregation rather than three count()
# actions that would each recompute the join. count(<column>) skips nulls, so
# it yields the number of rows that received a rule.
rule_match_stats = df_with_rules.agg(
    count(lit(1)).alias("total_rows"),
    count(col("is_end_event")).alias("matched_rows"),
).first()
rule_total_rows = rule_match_stats["total_rows"]
rule_matched_rows = rule_match_stats["matched_rows"]

print(f"Total rows: {rule_total_rows}")
print(f"Matched rows: {rule_matched_rows}")
print(f"Unmatched rows: {rule_total_rows - rule_matched_rows}")

Filter End Events \
Filter rows where is_end_event == TRUE

In [0]:
from pyspark.sql.functions import col, lower, trim

# Normalise the flag before comparing, so the filter works whether is_end_event
# is stored as a boolean or as text in any casing / with padding ("TRUE",
# "True ", "true"). A bare comparison with 'true' would silently drop every row
# when the stored text is not exactly lowercase. Null flags (unmatched rows)
# stay null and are filtered out, as before.
is_end_event_flag = lower(trim(col("is_end_event").cast("string")))
df_end_events = df_with_rules.filter(is_end_event_flag == "true")

In [0]:
# Show a five-row sample of the rows flagged as end events.
end_events_sample = df_end_events.limit(5)
display(end_events_sample)

For each (packageid, systemdato), select the earliest scan_datetime among end events.


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

# Rank end events within each (packageid, systemdato) from earliest to latest.
# A plain ascending sort places nulls FIRST in Spark, so an end event with a
# missing scan_datetime would be ranked 1 and hide the real earliest timestamp.
# asc_nulls_last() keeps null timestamps at the back of each partition.
window_end = Window.partitionBy("packageid", "systemdato").orderBy(
    col("scan_datetime").asc_nulls_last()
)

# Keep only the top-ranked row per group; the helper rank column is dropped by
# the final select.
df_end_time = (
    df_end_events
    .withColumn("rn", row_number().over(window_end))
    .filter(col("rn") == 1)
    .select("packageid", "systemdato", "scan_datetime")
)

In [0]:
# Show a five-row sample of the per-group end times.
end_time_sample = df_end_time.limit(5)
display(end_time_sample)

Load the `items_basic_incremental` Delta table

In [0]:
# Load the incremental item table that the end times are joined onto below.
basic_incremental_table = "items_basic_incremental"
df_basic_incremental = spark.table(basic_incremental_table)

Classify Delivery Status 

In [0]:
# Attach the end time to each item row. The left join keeps every row of
# df_basic_incremental; rows without an end event get a null timestamp.
status_join_keys = ["packageid", "systemdato"]
df_status_joined = df_basic_incremental.join(
    df_end_time,
    on=status_join_keys,
    how="left",
)

# Expose the joined timestamp under the name used by the status logic.
df_status = df_status_joined.withColumnRenamed("scan_datetime", "end_time")

In [0]:
# Show a five-row sample of the items with their end time attached.
status_joined_sample = df_status.limit(5)
display(status_joined_sample)

Add delivery_status column: \
'On Time': end_time <= ETA \
'Delayed': end_time > ETA \
'En Route': end_time is null


In [0]:
from pyspark.sql.functions import when

# Branch order matters: the null check runs first, so a missing end_time is
# always "En Route" regardless of ETA.
# NOTE(review): when end_time is present but ETA is null, the <= comparison
# evaluates to null and the row falls through to "Delayed" — confirm intended.
has_no_end_time = col("end_time").isNull()
arrived_by_eta = col("end_time") <= col("ETA")

delivery_status_expr = (
    when(has_no_end_time, "En Route")
    .when(arrived_by_eta, "On Time")
    .otherwise("Delayed")
)

df_status = df_status.withColumn("delivery_status", delivery_status_expr)

In [0]:
# Show a five-row sample including the new delivery_status column.
status_classified_sample = df_status.limit(5)
display(status_classified_sample)

Create the `silver_item_status` Delta table for the next notebook

In [0]:
# Persist df_status as a Delta table; "overwrite" mode replaces the contents of
# any existing table with the same name.
status_writer = df_status.write.format("delta").mode("overwrite")
status_writer.saveAsTable("silver_item_status")