Set Catalog

In [0]:
# Switch the session to the `postnord` catalog, then echo the active catalog
# so the switch is visible in the cell output.
spark.sql("USE CATALOG postnord")
active_catalog = spark.catalog.currentCatalog()
print(active_catalog)

Dynamic file handling: find the latest dated file of each type

In [0]:
from pyspark.sql. functions import col

base_path = "/Volumes/postnord/default/data/"

# List the volume once; both file families are picked out of this single listing.
files = dbutils.fs.ls(base_path)

print(f"Found {len(files)} files")


def _latest_date_token(file_names, prefix, suffix=".csv"):
    """Return the largest date token among names shaped like ``<prefix><token><suffix>``.

    Args:
        file_names: iterable of bare file names (no directory part).
        prefix: leading part of the file name that precedes the date token.
        suffix: trailing part of the file name that follows the date token.

    Returns:
        The maximum token, compared as a string.

    Raises:
        ValueError: if no name matches ``prefix``/``suffix``. A bare ``max()``
            on an empty list raises the same type, but with a message that
            does not say which file family is missing.
    """
    tokens = [
        # Slice off exactly one prefix and one suffix so the token itself is
        # left untouched even if it happens to contain either string.
        name[len(prefix):len(name) - len(suffix)]
        for name in file_names
        if name.startswith(prefix) and name.endswith(suffix)
    ]
    if not tokens:
        raise ValueError(f"No files named {prefix}*{suffix} found in {base_path}")
    # NOTE(review): tokens are compared as strings, so this only selects the most
    # recent file if the date format sorts chronologically (e.g. YYYYMMDD or
    # YYYY-MM-DD) — confirm against the actual file naming.
    return max(tokens)


file_names = [f.name for f in files]

# Get the latest date for each file family
latest_basic_date = _latest_date_token(file_names, "Silver_item_basic_")
latest_scan_date = _latest_date_token(file_names, "Silver_item_scans_")

print(f"Latest Silver_item_basic date: {latest_basic_date}")
print(f"Latest Silver_item_scans date: {latest_scan_date}")

%md
Ingest data from the Silver tables (CSVs) and EndTimeRules

In [0]:
# Read the most recent Silver_item_basic CSV, taking column names from the
# header row and letting Spark infer the column types.
basic_csv_path = f"{base_path}Silver_item_basic_{latest_basic_date}.csv"
basic_reader = spark.read.format("csv")
basic_reader = basic_reader.option("header", "true")
basic_reader = basic_reader.option("inferSchema", "true")
df_basic = basic_reader.load(basic_csv_path)

# Preview the first five rows; limit() is used here in place of head().
basic_preview = df_basic.limit(5)
display(basic_preview)

In [0]:
# Print the column names and types of df_basic to the cell output.
df_basic.printSchema()

In [0]:
# Read the most recent Silver_item_scans CSV, taking column names from the
# header row and letting Spark infer the column types.
# The file name is built from latest_scan_date (the newest scans file found in
# the listing), not latest_basic_date: the two file families are dated
# independently, so the basic date may point at a scans file that is stale or
# does not exist.
df_scan = (spark.read
           .format("csv")
           .option("header", "true")
           .option("inferSchema", "true")
           .load(f"{base_path}Silver_item_scans_{latest_scan_date}.csv"))

# Preview the first five rows; limit() is used here in place of head().
display(df_scan.limit(5))

In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in display() would only hand display() a None.
df_scan.printSchema()

In [0]:
# Give the scan columns a "scan_" prefix.
# NOTE(review): presumably this keeps them distinct from same-named columns in
# other tables when joined later — confirm against the downstream notebook.
df_scan = (
    df_scan
    .withColumnRenamed("created_dt", "scan_created_dt")
    .withColumnRenamed("Terminal", "scan_terminal")
)



In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in display() would only hand display() a None.
df_scan.printSchema()

In [0]:
from pyspark.sql.functions import monotonically_increasing_id

# Load the workbook with header handling switched off, so the sheet's header
# row arrives as the first data row.
# NOTE(review): because the header row is read as data, inferSchema presumably
# sees text in every column — confirm the resulting column types.
rules_path = f"{base_path}EndTimeRules.xlsx"
df_rules = (
    spark.read.format("excel")
    .option("header", "false")
    .option("inferSchema", "true")
    .load(rules_path)
)

# The first row holds the actual column names.
header = df_rules.first()
if header is None:
    # first() returns None for an empty DataFrame; stop here with a clear
    # message instead of a TypeError when the header is iterated below.
    raise ValueError(f"{rules_path} contains no rows")

# Tag every row with a generated id and keep only ids above 0 to drop the
# header row, then discard the helper column.
# NOTE(review): this relies on the header row being the one that receives
# id 0 — confirm this holds if the file is ever read into several partitions.
df_rules = df_rules.withColumn("row_id", monotonically_increasing_id())
df_rules = df_rules.filter(df_rules["row_id"] > 0).drop("row_id")

# Rename all columns positionally from the header values. Unlike renaming
# "_c0", "_c1", ... one by one, this does not depend on how the reader named
# the columns.
df_rules = df_rules.toDF(*[str(col_name) for col_name in header])

# limit() returns a Spark DataFrame, which display() can render.
display(df_rules.limit(5))

In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in display() would only hand display() a None.
df_rules.printSchema()

In [0]:
# The source column name "location " carries a trailing space; rename it to
# the clean "location".
df_rules = df_rules.withColumnRenamed("location ", "location")

In [0]:
# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in display() would only hand display() a None.
df_rules.printSchema()

Create Delta tables so the next notebook can pick up where this one left off

In [0]:
%python
def _overwrite_delta_table(frame, table_name, **writer_options):
    """Overwrite the Delta table ``table_name`` with the contents of ``frame``.

    Any keyword arguments are applied as writer options before saving.
    """
    writer = frame.write.format("delta").mode("overwrite")
    for option_key, option_value in writer_options.items():
        writer = writer.option(option_key, option_value)
    writer.saveAsTable(table_name)


# Persist the three DataFrames as Delta tables, in the same order as before.
_overwrite_delta_table(df_basic, "silver_item_basic")
# Only the scan table additionally sets overwriteSchema.
_overwrite_delta_table(df_scan, "silver_item_scan", overwriteSchema="true")
_overwrite_delta_table(df_rules, "end_time_rules")