In [1]:
# Script to load customers into the dimension table as an SCD Type 2 table

# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, date_format,udf,lit,col
from pyspark.sql.types import StringType
from datetime import datetime
from delta import DeltaTable
import uuid



In [2]:
# Job parameters: run date plus the target and staging table identifiers
rundate = get_rundate()
schema_name, table_name = "edw", "dim_customer"
table_full_name = ".".join((schema_name, table_name))
staging_table_full_name = "edw_stg.dim_customer_stg"
print("SPARK_APP: JOB triggered for rundate - " + rundate)

SPARK_APP: JOB triggered for rundate - 20220101


In [3]:
# Generate Spark Session
# The application name embeds the target table name (table_full_name).
spark: SparkSession = get_spark_session(f"Dimension load - {table_full_name}")
# uiWebUrl can be None (for example when the Spark UI is disabled). Formatting the
# value instead of concatenating it keeps a missing URL from raising TypeError
# and aborting the job on a purely informational log line.
print(f"SPARK_APP: Spark UI - {spark.sparkContext.uiWebUrl}")

SPARK_APP: Spark UI - http://03205cdd01e3:4040


In [4]:
# Spark configs
# Shuffle partition count applied to this session
shuffle_partitions = 8
spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions)
#spark.conf.set("spark.sql.parquet.mergeSchema", False)

In [5]:
# Load the staging-layer table, then report its row count and schema
df_stg = spark.read.table(staging_table_full_name)

stg_row_count = df_stg.count()
print(f"SPARK_APP: Staging Data Count - {stg_row_count}")
print("SPARK_APP: Printing Staging Schema --")
df_stg.printSchema()

SPARK_APP: Staging Data Count - 18
SPARK_APP: Printing Staging Schema --
root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- email: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- plan_type: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- effective_start_date: timestamp (nullable = true)
 |-- effective_end_date: timestamp (nullable = true)
 |-- active_flag: integer (nullable = true)
 |-- update_dt: timestamp (nullable = true)



In [8]:
# Preview the first five staging rows without truncating column values
df_stg.show(n=5, truncate=False)

+-----------+-----------------+--------------+-----------+-----+--------+--------------+--------------------------+-------------+---------+--------------------------+--------+----------+---------+--------------------------+-------------------+-----------+--------------------------+
|customer_id|name             |address       |city       |state|zip_code|phone_number  |email                     |date_of_birth|plan_type|insert_dt                 |rundate |first_name|last_name|effective_start_date      |effective_end_date |active_flag|update_dt                 |
+-----------+-----------------+--------------+-----------+-----+--------+--------------+--------------------------+-------------+---------+--------------------------+--------+----------+---------+--------------------------+-------------------+-----------+--------------------------+
|C001       |Ramesh Kumar     |123 Main St   |Anytown    |WB   |12345   |91-00000-00000|ramesh@email.com          |1980-01-01   |P        |2024-05-31 0

In [6]:
# Generate UUID for Surrogate Key
# uuid4() yields a new value on every call, so the UDF is flagged as
# non-deterministic. Spark otherwise assumes a UDF is deterministic, which lets
# the optimizer duplicate or eliminate invocations.
uuid_udf = udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()

In [8]:
# Build the dimension working set: a surrogate key (row_wid) plus the helper
# columns used to close out history records (flag 0, end timestamp one second
# before the new effective_start_date, and the update timestamp)

history_end_expr = "cast(effective_start_date as timestamp) - INTERVAL 1 SECONDS"

df_dim_temp = (
    df_stg
    .withColumn("row_wid", uuid_udf())
    .withColumn("hist_active_flag", lit(0))
    .withColumn("history_record_end_timestamp", expr(history_end_expr))
    .withColumn("hist_record_update_dt", current_timestamp())
)

dim_temp_row_count = df_dim_temp.count()
print(f"SPARK_APP: Dim Temp Data Count - {dim_temp_row_count}")
print("SPARK_APP: Printing Dim Temp Schema --")
df_dim_temp.printSchema()

SPARK_APP: Dim Temp Data Count - 18
SPARK_APP: Printing Dim Temp Schema --
root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- email: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- plan_type: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- effective_start_date: timestamp (nullable = true)
 |-- effective_end_date: timestamp (nullable = true)
 |-- active_flag: integer (nullable = true)
 |-- update_dt: timestamp (nullable = true)
 |-- row_wid: string (nullable = true)
 |-- hist_active_flag: integer (nullable = false)
 |-- history_record_end_timestamp: timestamp (nullable = true)
 |-- hist_

In [12]:
# Get the Delta table for upserts (SCD2)
dt_dim = DeltaTable.forName(spark, table_full_name)

# Value returned by get_max_timestamp that this job treats as "full load"
full_load_marker = "1900-01-01 00:00:00.000000"
retention_check_conf = "spark.databricks.delta.retentionDurationCheck.enabled"

# Check if the table is set for full load
if get_max_timestamp(spark, schema_name, table_name) == full_load_marker:
    print("SPARK_APP: Table is set for full load")
    # Truncate the dimension table. vacuum(0) requires Delta's retention-duration
    # safety check to be off, so disable it only for this step and restore the
    # previous value afterwards rather than leaving the check off for the rest
    # of the session.
    previous_retention_check = spark.conf.get(retention_check_conf, "true")
    spark.conf.set(retention_check_conf, "false")
    try:
        dt_dim.delete("1=1")
        dt_dim.vacuum(0)
    finally:
        spark.conf.set(retention_check_conf, previous_retention_check)

# Close out the currently active row (active_flag = 1) of every customer present
# in the working set: clear the flag, stamp the end date and the update time.
# Source columns are qualified with the dim_temp alias so they can never be
# resolved against the target table.
(
    dt_dim.alias("dim_customer")
    .merge(
        df_dim_temp.alias("dim_temp"),
        "dim_customer.customer_id = dim_temp.customer_id and dim_customer.active_flag = 1",
    )
    .whenMatchedUpdate(
        set={
            "update_dt": "dim_temp.hist_record_update_dt",
            "active_flag": "dim_temp.hist_active_flag",
            "effective_end_date": "dim_temp.history_record_end_timestamp",
        }
    )
    .execute()
)

print("SPARK_APP: Updated History Records")


SPARK_APP: Table is set for full load
SPARK_APP: Updated History Records


In [10]:
# Read the dimension table back and display its schema
df = spark.read.table(table_full_name)
df.schema

StructType([StructField('row_wid', StringType(), True), StructField('customer_id', StringType(), True), StructField('first_name', StringType(), True), StructField('last_name', StringType(), True), StructField('address', StringType(), True), StructField('city', StringType(), True), StructField('state', StringType(), True), StructField('zip_code', StringType(), True), StructField('phone_number', StringType(), True), StructField('email', StringType(), True), StructField('date_of_birth', DateType(), True), StructField('plan_type', StringType(), True), StructField('effective_start_date', TimestampType(), True), StructField('effective_end_date', TimestampType(), True), StructField('active_flag', IntegerType(), True), StructField('rundate', StringType(), True), StructField('insert_dt', TimestampType(), True), StructField('update_dt', TimestampType(), True)])

In [23]:
# Show the version and selected operation metrics from the first row returned
# by the Delta table history (presumably the latest commit; confirm ordering)
metric_columns = [
    "version",
    "operationMetrics.executionTimeMs",
    "operationMetrics.numTargetRowsInserted",
    "operationMetrics.numTargetRowsUpdated",
    "operationMetrics.numOutputRows",
]
first_history_row = dt_dim.history().limit(1)
first_history_row.select(*metric_columns).show(1, False)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|1      |8488           |0                    |0                   |0            |
+-------+---------------+---------------------+--------------------+-------------+



In [13]:
# Append the working set to the dimension table after removing the helper
# columns used by the merge and the staging-only "name" column

columns_to_drop = [
    "history_record_end_timestamp",
    "hist_active_flag",
    "hist_record_update_dt",
    "name",
]
df_dim_insert = df_dim_temp.drop(*columns_to_drop)
df_dim_insert.write.format("delta").mode("append").saveAsTable(table_full_name)

print("SPARK_APP: Active Records inserted into Dimesion Table")

SPARK_APP: Active Records inserted into Dimesion Table


In [16]:
# Stop the Spark session.
# NOTE(review): the cell that follows this one in the file still calls
# spark.sql(...), so a top-to-bottom run would reach it with a stopped session.
# Consider moving this call to the very end of the notebook.
spark.stop()

In [15]:
# Display the rows of the dimension table (show() prints its default row limit)
dim_customer_rows = spark.sql("select * from edw.dim_customer")
dim_customer_rows.show()

+--------------------+-----------+----------+---------+--------------+-----------+-----+--------+--------------+--------------------+-------------+---------+--------------------+-------------------+-----------+--------+--------------------+--------------------+
|             row_wid|customer_id|first_name|last_name|       address|       city|state|zip_code|  phone_number|               email|date_of_birth|plan_type|effective_start_date| effective_end_date|active_flag| rundate|           insert_dt|           update_dt|
+--------------------+-----------+----------+---------+--------------+-----------+-----+--------+--------------+--------------------+-------------+---------+--------------------+-------------------+-----------+--------+--------------------+--------------------+
|65d16f34-e371-455...|       C001|    Ramesh|    Kumar|   123 Main St|    Anytown|   WB|   12345|91-00000-00000|    ramesh@email.com|   1980-01-01|        P|2024-05-31 03:34:...|9999-12-31 00:00:00|          1|2022