In [1]:
# Script to load Products in dimension table using SCD type 2 table

# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, date_format,udf,lit,col
from pyspark.sql.types import StringType
from datetime import datetime
from delta import DeltaTable
import uuid

In [2]:
# Job Parameters
rundate = get_rundate()
schema_name = "edw"
table_name = "dim_product"
table_full_name = f"{schema_name}.{table_name}"
staging_table_full_name = "edw_stg.dim_product_stg"
print("SPARK_APP: JOB triggered for rundate - " + rundate)

SPARK_APP: JOB triggered for rundate - 20220101


In [3]:
spark: SparkSession = get_spark_session(f"Dimension load - {table_full_name}")
print("SPARK_APP: Spark UI - " + spark.sparkContext.uiWebUrl)


SPARK_APP: Spark UI - http://03205cdd01e3:4040


In [4]:
#spark Configs
spark.conf.set("spark.sql.shuffle.partitions", 8)
#spark.conf.set("spark.sql.parquet.mergeSchema", False)

In [5]:
# Reading Stage Layer Data

df_stg = spark.read.table(staging_table_full_name)

print("SPARK_APP: Staging Data Count - " + str(df_stg.count()))
print("SPARK_APP: Printing Staging Schema --")
df_stg.printSchema()

SPARK_APP: Staging Data Count - 14
SPARK_APP: Printing Staging Schema --
root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- type: string (nullable = true)
 |-- flavor: string (nullable = true)
 |-- size: string (nullable = true)
 |-- price: double (nullable = true)
 |-- image_url: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)
 |-- expiration_dt: date (nullable = true)
 |-- effective_start_dt: timestamp (nullable = true)
 |-- effective_end_dt: timestamp (nullable = true)
 |-- active_flg: integer (nullable = true)
 |-- update_dt: timestamp (nullable = true)



In [6]:
df_stg.show(5,False)

+----------+-----------------------+------------+----+-------+-------+-----+----------------------------------------------------------+--------------------------+--------+-------------+--------------------------+-------------------+----------+--------------------------+
|product_id|product_name           |brand       |type|flavor |size   |price|image_url                                                 |insert_dt                 |rundate |expiration_dt|effective_start_dt        |effective_end_dt   |active_flg|update_dt                 |
+----------+-----------------------+------------+----+-------+-------+-----+----------------------------------------------------------+--------------------------+--------+-------------+--------------------------+-------------------+----------+--------------------------+
|P001      |Purina Pro Plan        |Purina      |Dry |Chicken|5 kgs  |20.0 |https://www.example.com/purina-pro-plan-chicken.jpg       |2024-06-01 08:16:58.750267|20220101|2024-12-31   |20

In [7]:
uuid_udf = udf(lambda : uuid.uuid4,StringType())

In [9]:
# Generate Surrogate Keys

df_dim_temp = df_stg.withColumn("row_wid",uuid_udf())\
.withColumn("history_update_dt",current_timestamp())\
.withColumn("history_active_flg",lit(0))\
.withColumn("history_effective_end_dt",expr("CAST(effective_start_dt AS TIMESTAMP) - INTERVAL 1 SECONDS"))

print("SPARK_APP: Dim Temp Data Count - " + str(df_dim_temp.count()))
print("SPARK_APP: Printing Dim Temp Schema --")
df_dim_temp.printSchema()

SPARK_APP: Dim Temp Data Count - 14
SPARK_APP: Printing Dim Temp Schema --
root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- type: string (nullable = true)
 |-- flavor: string (nullable = true)
 |-- size: string (nullable = true)
 |-- price: double (nullable = true)
 |-- image_url: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)
 |-- expiration_dt: date (nullable = true)
 |-- effective_start_dt: timestamp (nullable = true)
 |-- effective_end_dt: timestamp (nullable = true)
 |-- active_flg: integer (nullable = true)
 |-- update_dt: timestamp (nullable = true)
 |-- row_wid: string (nullable = true)
 |-- history_update_dt: timestamp (nullable = false)
 |-- history_active_flg: integer (nullable = false)
 |-- history_effective_end_dt: timestamp (nullable = true)



In [10]:
#Get the delta table for Upserts (SCD2)
dt_dim = DeltaTable.forName(spark,table_full_name)

# check if table is set for full load

if get_max_timestamp(spark,schema_name,table_name) == "1900-01-01 00:00:00.000000":
    print("SPARK_APP: Table is set for full load") 
    # Truncate the Dimension table
    spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled",False)
    dt_dim.delete(f"1=1")
    dt_dim.vacuum(0)

dt_dim.alias("dim_product").merge\
      (df_dim_temp.alias("dim_temp"), "dim_product.product_id = dim_temp.product_id and dim_product.active_flg = 1")\
.whenMatchedUpdate( set = {
    "update_dt" : "history_update_dt",
    "active_flg" : "history_active_flg",
    "effective_end_dt" : "history_effective_end_dt"}).execute()

print("SPARK_APP: Updated History Records")



SPARK_APP: Table is set for full load
SPARK_APP: Updated History Records


In [11]:
# see metrics for Delta Table
dt_dim.history().limit(1).select("version","operationMetrics.executionTimeMs", 
                                 "operationMetrics.numTargetRowsInserted",
                                "operationMetrics.numTargetRowsUpdated",
                                "operationMetrics.numOutputRows").show(1, False)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|1      |9419           |0                    |0                   |0            |
+-------+---------------+---------------------+--------------------+-------------+



In [12]:
# Insert all records in Delta Table in APPEND mode

df_dim_temp\
.drop("history_effective_end_dt", "history_active_flg", "history_update_dt")\
.write.format("delta").mode("append").saveAsTable(table_full_name)

print("SPARK_APP: Active Records inserted into Dimesion Table")

SPARK_APP: Active Records inserted into Dimesion Table


In [13]:
# see metrics for Delta Table
dt_dim.history().limit(1).select("version","operationMetrics.executionTimeMs", 
                                 "operationMetrics.numTargetRowsInserted",
                                "operationMetrics.numTargetRowsUpdated",
                                "operationMetrics.numOutputRows").show(1, False)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|2      |null           |null                 |null                |14           |
+-------+---------------+---------------------+--------------------+-------------+



In [17]:
spark.sql("select * from edw.dim_product limit 5").show(truncate = False)

+----------------------------------------------------------+----------+-----------------------+------------+----+-------+-------+-----+-------------+----------------------------------------------------------+--------------------------+-------------------+----------+--------+--------------------------+--------------------------+
|row_wid                                                   |product_id|product_name           |brand       |type|flavor |size   |price|expiration_dt|image_url                                                 |effective_start_dt        |effective_end_dt   |active_flg|rundate |insert_dt                 |update_dt                 |
+----------------------------------------------------------+----------+-----------------------+------------+----+-------+-------+-----+-------------+----------------------------------------------------------+--------------------------+-------------------+----------+--------+--------------------------+--------------------------+
|net.razor

In [18]:
spark.stop()