Set Catalog

In [0]:
# Switch the session to the `postnord` catalog, then echo the active catalog
# back as a quick confirmation that the switch took effect.
spark.sql("USE CATALOG postnord")
active_catalog = spark.catalog.currentCatalog()
print(active_catalog)

Load Delta table

In [0]:
# Source DataFrame from which every dimension in the cells below is derived.
df_status = spark.read.table("silver_item_status")

Dimension tables \
Terminal \
Product \
Delivery Time \
Delivery Status \
Transaction Ref


Extract unique values

In [0]:
# printSchema() writes the schema tree to stdout and returns None, so it is
# called directly instead of being wrapped in display().
df_status.printSchema()

In [0]:
# Each dimension is the set of distinct values of one column of df_status.
# Where the dimension key has its own name, the column is renamed up front.

# Terminal dimension: distinct `terminal` values, exposed as `terminal_id`.
df_terminal = (
    df_status
    .select("terminal")
    .withColumnRenamed("terminal", "terminal_id")
    .distinct()
)

# Product dimension: distinct `product` values, exposed as `product_id`.
df_product = (
    df_status
    .select("product")
    .withColumnRenamed("product", "product_id")
    .distinct()
)

# Delivery status dimension: distinct `delivery_status` values (name kept).
df_delivery_status = (
    df_status
    .select("delivery_status")
    .distinct()
)

# Transaction ref dimension: distinct `transaction_ref` values, exposed as
# `transaction_ref_id`.
df_transaction_ref = (
    df_status
    .select("transaction_ref")
    .withColumnRenamed("transaction_ref", "transaction_ref_id")
    .distinct()
)

# Delivery time dimension: distinct `eta` values (name kept).
df_delivery_time = (
    df_status
    .select("eta")
    .distinct()
)



In [0]:
#display(df_terminal.limit(5))
#display(df_product.limit(5))
#display(df_delivery_status.limit(5))
#display(df_transaction_ref.limit(5))
#display(df_delivery_time.limit(5))

Create Delta Tables

In [0]:
# Create every dimension table once, empty, with the schema of its DataFrame.
# One (DataFrame, temp view name, Delta table name) entry per dimension keeps
# the five tables in step instead of repeating the same statement five times.
dimension_sources = [
    (df_terminal, "dim_terminal_temp", "dim_terminal"),
    (df_product, "dim_product_temp", "dim_product"),
    (df_delivery_status, "dim_delivery_status_temp", "dim_delivery_status"),
    (df_transaction_ref, "dim_transaction_ref_temp", "dim_transaction_ref"),
    (df_delivery_time, "dim_delivery_time_temp", "dim_delivery_time"),
]

for dim_df, temp_view, table_name in dimension_sources:
    # The temp view makes the DataFrame addressable from SQL; the MERGE
    # statements further down read these same views as their source.
    dim_df.createOrReplaceTempView(temp_view)

    # `WHERE 1=0` selects no rows, so the CTAS only copies the schema.
    # `IF NOT EXISTS` turns the statement into a no-op on later runs.
    spark.sql(f"""
CREATE TABLE IF NOT EXISTS {table_name}
USING DELTA
AS SELECT * FROM {temp_view} WHERE 1=0
""")


Upsert (MERGE) Dimension Data \
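Use MERGE to ensure idempotency and incremental updates: only `WHEN NOT MATCHED THEN INSERT` is used, so values not yet present in a dimension are appended and existing rows are never updated.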

In [0]:
# Insert-only MERGE for every dimension: rows whose key is not yet present in
# the target table are appended; rows that already exist are left untouched.
# One (target table, source temp view, key column) entry per dimension.
merge_specs = [
    ("dim_terminal", "dim_terminal_temp", "terminal_id"),
    ("dim_product", "dim_product_temp", "product_id"),
    ("dim_delivery_status", "dim_delivery_status_temp", "delivery_status"),
    ("dim_transaction_ref", "dim_transaction_ref_temp", "transaction_ref_id"),
    ("dim_delivery_time", "dim_delivery_time_temp", "eta"),
]

for table_name, temp_view, key_column in merge_specs:
    # `<=>` is null-safe equality. With plain `=`, a NULL key never matches
    # (NULL = NULL evaluates to NULL), so if the source column contains NULLs
    # the NULL row would be inserted again on every run and the load would
    # no longer be idempotent.
    spark.sql(f"""
MERGE INTO {table_name} AS target
USING {temp_view} AS source
ON target.{key_column} <=> source.{key_column}
WHEN NOT MATCHED THEN INSERT *
""")

In [0]:
%sql
-- Preview up to five rows of the terminal dimension.
SELECT *
FROM dim_terminal
LIMIT 5;

In [0]:
%sql
-- Preview up to five rows of the product dimension.
SELECT *
FROM dim_product
LIMIT 5;

In [0]:
%sql
-- Preview up to five rows of the delivery status dimension.
SELECT *
FROM dim_delivery_status
LIMIT 5;

In [0]:
%sql
-- Preview up to five rows of the delivery time dimension.
SELECT *
FROM dim_delivery_time
LIMIT 5;

In [0]:
%sql
-- Preview up to five rows of the transaction ref dimension.
SELECT *
FROM dim_transaction_ref
LIMIT 5;

In [0]:
# result = spark.sql("""
# SELECT column_name, data_type
# FROM information_schema.columns
# WHERE table_name = 'dim_terminal'
# """)
# display(result)

In [0]:
# result_product = spark.sql("""
# SELECT column_name, data_type
# FROM information_schema.columns
# WHERE table_name = 'dim_product'
# """)
# display(result_product)

# result_delivery_status = spark.sql("""
# SELECT column_name, data_type
# FROM information_schema.columns
# WHERE table_name = 'dim_delivery_status'
# """)
# display(result_delivery_status)

# result_transaction_ref = spark.sql("""
# SELECT column_name, data_type
# FROM information_schema.columns
# WHERE table_name = 'dim_transaction_ref'
# """)
# display(result_transaction_ref)

# result_delivery_time = spark.sql("""
# SELECT column_name, data_type
# FROM information_schema.columns
# WHERE table_name = 'dim_delivery_time'
# """)
# display(result_delivery_time)