Set Catalog

In [0]:
from pyspark.sql. functions import col, trim, upper, regexp_replace, when, split, explode, array, lit, monotonically_increasing_id
from delta.tables import DeltaTable
from datetime import datetime, timedelta

# Volume directory that this notebook reads its input files from.
BASE_PATH = "/Volumes/postnord/default/data/"

# Switch the session to the `postnord` catalog, then echo the active catalog to confirm.
spark.sql("USE CATALOG postnord")
active_catalog = spark.catalog.currentCatalog()
print(active_catalog)

Dynamic file discovery: pick the most recent date for which both the basic and the scans CSV files are present

In [0]:

def _dates_for_prefix(file_infos, prefix, suffix=".csv"):
    """Return the set of tokens left after removing `prefix` and `suffix` from matching file names.

    Only entries whose name starts with `prefix` and ends with `suffix` are considered.
    """
    return {
        info.name.replace(prefix, "").replace(suffix, "")
        for info in file_infos
        if info.name.startswith(prefix) and info.name.endswith(suffix)
    }


files = dbutils.fs.ls(BASE_PATH)
print(f"Found {len(files)} files")

basic_dates = _dates_for_prefix(files, "Silver_item_basic_")
scan_dates = _dates_for_prefix(files, "Silver_item_scans_")

# Keep only the dates that have BOTH a basic and a scans file, newest first.
complete_dates = sorted(basic_dates & scan_dates, reverse=True)

if len(complete_dates) == 0:
    raise ValueError("No complete data sets found!")

# Both loads use the same (newest complete) date.
latest_basic_date = latest_scan_date = complete_dates[0]

# Freshness check: yesterday's file is expected; the string comparison relies on
# the %Y%m%d layout sorting chronologically.
expected_date = (datetime.now() - timedelta(days=1)).strftime("%Y%m%d")
is_delayed = latest_basic_date < expected_date
print(
    f"Data delayed!  Latest:  {latest_basic_date}, Expected: {expected_date}"
    if is_delayed
    else "Data is current"
)

print(f"Latest Silver_item_basic date: {latest_basic_date}")
print(f"Latest Silver_item_scans date: {latest_scan_date}")

In [0]:
# Read the Silver_item_basic CSV for `latest_basic_date`; the first line is the
# header and column types are inferred from the data.
basic_csv_path = f"{BASE_PATH}Silver_item_basic_{latest_basic_date}.csv"
df_basic = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(basic_csv_path)
)

display(df_basic.limit(5))
df_basic.printSchema()

# Null-safe (<=>) match on the merge keys: packageid, transaction_ref, created_dt.
basic_merge_condition = """target. packageid <=> source.packageid 
           AND target.transaction_ref <=> source.transaction_ref 
           AND target. created_dt <=> source.created_dt"""

if not spark.catalog.tableExists("silver_item_basic"):
    # First load: the table does not exist yet, so create it from this file.
    df_basic.write.format("delta").saveAsTable("silver_item_basic")
    print("Created silver_item_basic")
else:
    # Upsert: update rows whose keys match, insert the rest.
    basic_target = DeltaTable.forName(spark, "silver_item_basic")
    (
        basic_target.alias("target")
        .merge(df_basic.alias("source"), basic_merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
    print("Merged into silver_item_basic")

In [0]:
# Read the Silver_item_scans CSV for `latest_scan_date`; the first line is the
# header and column types are inferred from the data.
scan_csv_path = f"{BASE_PATH}Silver_item_scans_{latest_scan_date}.csv"
df_scan = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(scan_csv_path)
)

# Give these columns a scan-specific name
# (presumably to avoid clashing with same-named df_basic columns - confirm).
scan_column_renames = {
    "created_dt": "scan_created_dt",
    "Terminal": "scan_terminal",
    "transaction_ref": "scan_transaction_ref",
}
for old_name, new_name in scan_column_renames.items():
    df_scan = df_scan.withColumnRenamed(old_name, new_name)

display(df_scan.limit(5))
df_scan.printSchema()

# Null-safe (<=>) match on the merge keys:
# packageid, systemdato, scan_datetime, scan_type, location.
scan_merge_condition = """target.packageid <=> source.packageid 
           AND target. systemdato <=> source.systemdato
           AND target.scan_datetime <=> source.scan_datetime
           AND target.scan_type <=> source.scan_type
           AND target.location <=> source.location"""

if not spark.catalog.tableExists("silver_item_scan"):
    # First load: the table does not exist yet, so create it from this file.
    df_scan.write.format("delta").saveAsTable("silver_item_scan")
    print("Created silver_item_scan")
else:
    # Upsert: update rows whose keys match, insert the rest.
    scan_target = DeltaTable.forName(spark, "silver_item_scan")
    (
        scan_target.alias("target")
        .merge(df_scan.alias("source"), scan_merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
    print("Merged into silver_item_scan")

%md
Ingest data from the Silver CSV files and the EndTimeRules workbook

In [0]:
# df_basic is loaded in an earlier cell; this cell only previews it.
# limit() is used instead of head() because it returns a Spark DataFrame,
# which display() can render as a table.
display(df_basic.limit(5))

In [0]:
# Print df_basic's column names and their inferred types.
df_basic.printSchema()

In [0]:
# df_scan is loaded (and its columns renamed) in an earlier cell; this cell only previews it.
# limit() is used instead of head() because it returns a Spark DataFrame,
# which display() can render as a table.
display(df_scan.limit(5))

In [0]:
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in display() (that would only hand None to display()).
df_scan.printSchema()

In [0]:
# df_scan = df_scan.withColumnRenamed("created_dt", "scan_created_dt")

# df_scan = df_scan.withColumnRenamed("Terminal", "scan_terminal")

# df_scan = df_scan.withColumnRenamed("transaction_ref", "scan_transaction_ref")




In [0]:
# display(df_scan.printSchema())

In [0]:
# Read the rules workbook with header=false, so the sheet's first row (which
# carries the real column names) arrives as an ordinary data row.
# The path is built from BASE_PATH so it stays consistent with the CSV loads.
df_rules = (
    spark.read.format("excel")
    .option("header", "false")
    .option("inferSchema", "true")
    .load(f"{BASE_PATH}EndTimeRules.xlsx")
)

# The first row holds the actual column names.
header = df_rules.first()
if header is None:
    # Without this guard an empty sheet fails later with an opaque TypeError.
    raise ValueError("EndTimeRules.xlsx contains no rows; cannot derive column names")

# Drop the header row by tagging rows with an id and keeping ids > 0.
# NOTE(review): this assumes the header row is the first row of the first
# partition (the only row that gets id 0) - confirm this holds for the reader.
df_rules = df_rules.withColumn("row_id", monotonically_increasing_id())
df_rules = df_rules.filter(col("row_id") > 0).drop("row_id")

# Rename the positional columns (_c0, _c1, ...) to the header values.
# Surrounding whitespace is stripped so a header cell such as "location "
# does not produce a column name with a trailing space.
for i, col_name in enumerate(header):
    df_rules = df_rules.withColumnRenamed(f"_c{i}", str(col_name).strip())

# limit() returns a Spark DataFrame, which display() can render.
display(df_rules.limit(5))

In [0]:
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in display() (that would only hand None to display()).
df_rules.printSchema()

In [0]:
# Normalize the column name: rename "location " (trailing space) to "location".
df_rules = df_rules.withColumnRenamed("location ", "location")

In [0]:
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in display() (that would only hand None to display()).
df_rules.printSchema()

In [0]:
from pyspark.sql. functions import col, trim, upper, regexp_replace, when, split, explode, array, lit

# ============================================
# Clean and normalize df_rules
# ============================================
# Expands the raw rules sheet into one row per (scan_type, product) combination
# and finally rebinds `df_rules` to the expanded result.

# Step 1: Clean scan_type - collapse whitespace runs to one space, trim, uppercase
df_rules = df_rules.withColumn(
    "scan_type_cleaned",
    upper(trim(regexp_replace(col("scan_type"), r"\s+", " ")))
)

# Step 2: Handle "OR" logic - split scan_type into array
# e.g., "I OR L" becomes ["I", "L"]
# The separator must be surrounded by whitespace (\s+OR\s+). With \s*OR\s* the
# letters "OR" inside a single token would also be treated as a separator
# (e.g. "FORWARD" would be split into "F" and "WARD").
df_rules = df_rules.withColumn(
    "scan_type_list",
    split(regexp_replace(col("scan_type_cleaned"), r"\s+OR\s+", ","), ",")
)

# Step 3: Explode to create one row per scan_type, trimming each element
df_rules_exploded = df_rules.withColumn(
    "scan_type_normalized",
    explode(col("scan_type_list"))
).withColumn(
    "scan_type_normalized",
    trim(col("scan_type_normalized"))
)

# Step 4: Clean reason_code and location
# upper(trim(...)) already yields NULL for a NULL input, so no explicit
# NULL branch is needed for location.
df_rules_exploded = df_rules_exploded.withColumn(
    "reason_code",
    trim(col("reason_code"))
).withColumn(
    "location",
    upper(trim(col("location")))
)

# ============================================
# Step 5: Get all unique products from df_basic dynamically
# ============================================
product_rows = df_basic.select("product").distinct().collect()
all_product_codes = [row.product for row in product_rows]
print(f"Product codes found in data: {all_product_codes}")

# ============================================
# Step 6: Create explicit product code arrays
# ============================================
# Build arrays with actual product codes
all_products_array = array(*[lit(code) for code in all_product_codes])
products_except_Z_array = array(*[lit(code) for code in all_product_codes if code != "Z"])

# The rule's free-text `product` cell decides which codes the rule applies to.
# "also Z" and the default both map to every code; the explicit branch is kept
# to mirror the wording used in the rules sheet.
df_rules_exploded = df_rules_exploded.withColumn(
    "product_codes",
    when(
        col("product").contains("also Z"),
        all_products_array  # Creates array like ["W", "P", "Z", "G"]
    ).when(
        col("product").contains("except Z"),
        products_except_Z_array  # Creates array like ["W", "P", "G"]
    ).otherwise(
        all_products_array  # Default:  all product codes
    )
)

# Step 7: Explode to create one row per product code
# (overwrites the free-text `product` column with a single product code)
df_rules_final = df_rules_exploded.withColumn(
    "product",
    explode(col("product_codes"))
)

# Step 8: Select final columns for the rules table and drop duplicate rows
df_rules_final = df_rules_final.select(
    col("reason_code"),
    col("scan_type_normalized").alias("scan_type"),
    col("location"),
    col("product"),
    col("is_end_event")
).distinct()

# ============================================
# Display results to verify
# ============================================
print("=== Cleaned and Expanded Rules ===")
display(df_rules_final.orderBy("reason_code", "scan_type", "product"))

print(f"\nExpanded rule count: {df_rules_final.count()}")

# Show sample of expanded rules
print("\n=== Sample:  Rules with product 'Z' ===")
display(df_rules_final.filter(col("product") == "Z").limit(5))

print("\n=== Sample:  Rules with product 'W' ===")
display(df_rules_final.filter(col("product") == "W").limit(5))

# Replace df_rules with the cleaned version.
# NOTE: this rebinding makes the cell non-idempotent - re-running it without
# first reloading df_rules would expand the already-expanded rules.
df_rules = df_rules_final

In [0]:
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in display() (that would only hand None to display()).
df_rules.printSchema()

Persist the results as Delta tables so the next notebook can pick up where this one leaves off

In [0]:
%python
# silver_item_basic and silver_item_scan are already created or merged by the
# load cells earlier in this notebook. Overwriting them here with df_basic /
# df_scan (which hold only the latest file) would replace the merged tables and
# discard every previously loaded day, so they are intentionally not rewritten.

# Save df_rules as a Delta table (fully rebuilt from the workbook on every run)
df_rules.write.format("delta").mode("overwrite").saveAsTable("end_time_rules")

In [0]:
#display(spark.sql("DESCRIBE HISTORY silver_item_scan"))

In [0]:
# Query data as it was 7 days ago
#df_old = spark.read.format("delta").option("timestampAsOf", "2025-12-28").table("silver_item_scan")

# Or by version number
#df_v2 = spark.read.format("delta").option("versionAsOf", 2).table("silver_item_scan")
