In [10]:
# Script to Load Store Dimesion
# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, date_format,udf
from pyspark.sql.types import StringType
from datetime import datetime
from delta import DeltaTable
import uuid

In [2]:
#Job Parameters
rundate = get_rundate()
schema_name = "edw"
table_name = "dim_store"
table_full_name = f"{schema_name}.{table_name}"
staging_table_full_name = "edw_stg.dim_store_stg"
print("SPARK_APP: JOB triggered for rundate - " + rundate)

SPARK_APP: JOB triggered for rundate - 20220101


In [3]:
# Generate Spark Session
spark: SparkSession = get_spark_session(f"Dimension load - {table_full_name}")
print("SPARK_APP: Spark UI - " + spark.sparkContext.uiWebUrl)

SPARK_APP: Spark UI - http://03205cdd01e3:4040


In [4]:
#spark Configs
spark.conf.set("spark.sql.shuffle.partitions", 8)
spark.conf.set("spark.sql.parquet.mergeSchema", True)

In [5]:
# Read data from Staging
df_stg = spark \
    .read \
    .table(staging_table_full_name)

print("SPARK_APP: Staging Data Count - " + str(df_stg.count()))
print("SPARK_APP: Printing Staging Schema --")
df_stg.printSchema()

SPARK_APP: Staging Data Count - 7
SPARK_APP: Printing Staging Schema --
root
 |-- store_id: string (nullable = true)
 |-- store_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)
 |-- update_dt: timestamp (nullable = true)



In [11]:
uuidf = udf(lambda : str(uuid.uuid4), StringType())

In [13]:
#creating temp df to add new column
df_dim_temp = df_stg.withColumn("row_wid", uuidf())

print("SPARK_APP: Dim Temp Data Count - " + str(df_dim_temp.count()))
print("SPARK_APP: Printing Dim Temp Schema --")
df_dim_temp.printSchema()

SPARK_APP: Dim Temp Data Count - 7
SPARK_APP: Printing Dim Temp Schema --
root
 |-- store_id: string (nullable = true)
 |-- store_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)
 |-- update_dt: timestamp (nullable = true)
 |-- row_wid: string (nullable = true)



In [18]:
# Get the delta table for Upserts (SCD1)
dt_dim = DeltaTable.forName(spark,table_full_name)

# Validate if its a first run
if get_max_timestamp(spark,schema_name,table_name) != "1900-01-01 00:00:00.000000":
    print("SPARK_APP: Table is set for full load") 
    # Truncate the Dimension table
    spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled",False)
    dt_dim.delete(f"1=1")
    dt_dim.vaccum(0)

dt_dim.alias("dim_store").\
merge(df_dim_temp.alias("dim_temp"),"dim_store.store_id=dim_temp.store_id").\
whenMatchedUpdate(set = {
   "store_name" : "dim_temp.store_name",
            "address" : "dim_temp.address",
            "city" : "dim_temp.city",
            "state" : "dim_temp.state",
            "zip_code" : "dim_temp.zip_code",
            "phone_number" : "dim_temp.phone_number",
            "rundate" : "dim_temp.rundate",
            "update_dt" : "dim_temp.update_dt"
        }  
    ) \
    .whenNotMatchedInsertAll() \
    .execute()
    
    



In [19]:
# Add job details in JOB CONTROL
insert_log(spark, schema_name, table_name, datetime.now(), rundate)
print("SPARK_APP: Update JOB Control Log")

SPARK_APP: Update JOB Control Log


In [20]:
spark.sql(f"select * from edw.job_control where table_name = '{table_name}' order by insert_dt desc limit 1").show(truncate=False)

+-----------+----------+--------------------------+--------+--------------------------+
|schema_name|table_name|max_timestamp             |rundate |insert_dt                 |
+-----------+----------+--------------------------+--------+--------------------------+
|edw        |dim_store |2024-05-26 14:29:01.560863|20220101|2024-05-26 14:29:02.180285|
+-----------+----------+--------------------------+--------+--------------------------+



In [21]:
# Get the logs from delta table version
dt_dim.history().limit(1).select("version","operationMetrics.executionTimeMs", 
                                 "operationMetrics.numTargetRowsInserted",
                                "operationMetrics.numTargetRowsUpdated",
                                "operationMetrics.numOutputRows").show(1, False)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|1      |8399           |7                    |0                   |7            |
+-------+---------------+---------------------+--------------------+-------------+



In [22]:
# Generate Symlink manifest for Athena Access
dt = DeltaTable.forName(spark, table_full_name)
dt.generate("symlink_format_manifest")
print("SPARK_APP: Symlink Manifest file generated")


SPARK_APP: Symlink Manifest file generated


In [23]:
spark.stop()