In [1]:
# Script to load data for Store Dimension Staging
# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, lit
from datetime import datetime
from delta import DeltaTable

In [2]:
# Job parameters: staging target, landing source and the run date of this execution
schema_name, table_name = "edw_stg", "dim_store_stg"
table_full_name = ".".join([schema_name, table_name])
landing_table_full_name = "edw_ld.dim_store_ld"

# Run date is supplied by the project helper and concatenated as-is into the log line
rundate = get_rundate()
print("SPARK_APP: JOB triggered for rundate - " + rundate)

SPARK_APP: JOB triggered for rundate - 20220101


In [3]:
# Generate Spark Session, named after the staging table this job loads
spark: SparkSession = get_spark_session(f"Staging load - {table_full_name}")
# uiWebUrl is None when the Spark UI is disabled; formatting it (instead of
# concatenating str + None) keeps this log line from aborting the job with a TypeError.
print(f"SPARK_APP: Spark UI - {spark.sparkContext.uiWebUrl}")

SPARK_APP: Spark UI - http://03205cdd01e3:4040


In [6]:
# Session-level Spark settings used by this job
spark_settings = {
    "spark.sql.shuffle.partitions": 8,
    "spark.sql.parquet.mergeSchema": True,
}
for conf_key, conf_value in spark_settings.items():
    spark.conf.set(conf_key, conf_value)

# Fetch the max_timestamp for the staging table from job control.
# NOTE: the landing read interpolates max_timestamp into its filter, so this
# cell has to run before that read.
max_timestamp = get_max_timestamp(spark, schema_name, table_name)
print("SPARK_APP: Max timestamp for staging data load - " + str(max_timestamp))

SPARK_APP: Max timestamp for staging data load - 1900-01-01 00:00:00.000000


In [5]:
# Incremental read: keep only landing rows whose insert_dt is after max_timestamp
incremental_filter = f"insert_dt > to_timestamp('{max_timestamp}')"
df_ld = (
    spark.read
    .table(landing_table_full_name)
    .where(incremental_filter)
)

# count() triggers a Spark job; the result is only used for the log line below
landing_count = df_ld.count()
print("SPARK_APP: Landing Data Count - " + str(landing_count))
print("SPARK_APP: Printing Landing Schema --")
df_ld.printSchema()

SPARK_APP: Landing Data Count - 14
SPARK_APP: Printing Landing Schema --
root
 |-- store_id: string (nullable = true)
 |-- store_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)



In [9]:
# Preview the landing rows (show's default of 20 rows) without truncating column values
df_ld.show(n=20, truncate=False)

+--------+------------+--------------+-----------+-----+--------+--------------+--------------------------+--------+
|store_id|store_name  |address       |city       |state|zip_code|phone_number  |insert_dt                 |rundate |
+--------+------------+--------------+-----------+-----+--------+--------------+--------------------------+--------+
|S001    |Pet House KA|123 Main St   |Anytown    |KA   |12345   |91-88929-88888|2024-05-26 12:01:58.482732|20220101|
|S002    |Pet House MH|456 Elm St    |Anothertown|MH   |67890   |91-99999-99999|2024-05-26 12:01:58.482732|20220101|
|S003    |Pet House TN|789 Oak Ave   |Bigcity    |TN   |9876    |91-77777-77777|2024-05-26 12:01:58.482732|20220101|
|S004    |Pet House OR|321 Birch Blvd|Small Town |OR   |76684   |91-88822-00000|2024-05-26 12:01:58.482732|20220101|
|S005    |Pet House WB|654 Pine St   |Busytown   |WB   |11111   |91-00002-22222|2024-05-26 12:01:58.482732|20220101|
|S006    |Pet House JK|987 Cedar Rd  |Hill Town  |JK   |22222   

In [7]:
# De-dupe the landing rows on the natural key (store_id): rank rows per store by
# insert_dt descending and keep only rank 1.
# NOTE(review): the ordering has no tie-breaker, so when several rows of one store
# share the same insert_dt the surviving row is whichever Spark ranks first —
# confirm such duplicates are exact copies.
latest_per_store = "row_number() over (partition by store_id order by insert_dt desc)"
df_dedupe = (
    df_ld
    .withColumn("_rnk", expr(latest_per_store))
    .where("_rnk = 1")
    .drop("_rnk")
)

dedupe_count = df_dedupe.count()
print("SPARK_APP: Landing Data Count after de-dupe - " + str(dedupe_count))

SPARK_APP: Landing Data Count after de-dupe - 7


In [8]:
# Stamp the staging rows: insert_dt is replaced and update_dt is added, both set to
# the current timestamp of the load.
load_ts = current_timestamp()
df_stg = (
    df_dedupe
    .withColumn("insert_dt", load_ts)
    .withColumn("update_dt", load_ts)
)

staging_count = df_stg.count()
print("SPARK_APP: Staging Data Count - " + str(staging_count))
print("SPARK_APP: Printing Staging Schema --")
df_stg.printSchema()

SPARK_APP: Staging Data Count - 7
SPARK_APP: Printing Staging Schema --
root
 |-- store_id: string (nullable = true)
 |-- store_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- insert_dt: timestamp (nullable = false)
 |-- rundate: string (nullable = true)
 |-- update_dt: timestamp (nullable = false)



In [10]:
# Preview the staging rows (show's default of 20 rows) without truncating column values
df_stg.show(n=20, truncate=False)

+--------+------------+--------------+-----------+-----+--------+--------------+--------------------------+--------+--------------------------+
|store_id|store_name  |address       |city       |state|zip_code|phone_number  |insert_dt                 |rundate |update_dt                 |
+--------+------------+--------------+-----------+-----+--------+--------------+--------------------------+--------+--------------------------+
|S001    |Pet House KA|123 Main St   |Anytown    |KA   |12345   |91-88929-88888|2024-05-26 12:32:24.037692|20220101|2024-05-26 12:32:24.037692|
|S002    |Pet House MH|456 Elm St    |Anothertown|MH   |67890   |91-99999-99999|2024-05-26 12:32:24.037692|20220101|2024-05-26 12:32:24.037692|
|S003    |Pet House TN|789 Oak Ave   |Bigcity    |TN   |9876    |91-77777-77777|2024-05-26 12:32:24.037692|20220101|2024-05-26 12:32:24.037692|
|S004    |Pet House OR|321 Birch Blvd|Small Town |OR   |76684   |91-88822-00000|2024-05-26 12:32:24.037692|20220101|2024-05-26 12:32:24.

In [11]:
# Staging is reloaded on every run: overwrite mode replaces the table's existing
# contents with this batch (truncate-and-load).
stg_writer = df_stg.write.format("delta").mode("overwrite")
stg_writer.saveAsTable(table_full_name)

print("SPARK_APP: Data written to staging table")

SPARK_APP: Data written to staging table


In [12]:
# Record this run in JOB CONTROL for the staging table.
# NOTE(review): the timestamp logged here is the driver's wall-clock time, not the
# max insert_dt of the landing rows that were read. If it is used as the next run's
# lower bound, rows landed between the read and this call could be skipped —
# confirm against lib.job_control.
logged_at = datetime.now()
insert_log(spark, schema_name, table_name, logged_at, rundate)
print("SPARK_APP: Update JOB Control Log")

SPARK_APP: Update JOB Control Log


In [13]:
# Show the most recent job control entry for this table (matched on table_name only)
latest_log_query = f"select * from edw.job_control where table_name = '{table_name}' order by insert_dt desc limit 1"
spark.sql(latest_log_query).show(truncate=False)

+-----------+-------------+--------------------------+--------+--------------------------+
|schema_name|table_name   |max_timestamp             |rundate |insert_dt                 |
+-----------+-------------+--------------------------+--------+--------------------------+
|edw_stg    |dim_store_stg|2024-05-26 12:35:18.249333|20220101|2024-05-26 12:35:19.165441|
+-----------+-------------+--------------------------+--------+--------------------------+



In [14]:
# Get the logs from the latest delta table version
dt = DeltaTable.forName(spark, table_full_name)
# history(1) asks Delta for only the most recent commit, instead of building the
# full commit history and trimming it afterwards with limit(1).
latest_commit = dt.history(1)
# operationMetrics keys that the last operation did not record show up as null.
latest_commit.select(
    "version",
    "operationMetrics.executionTimeMs",
    "operationMetrics.numTargetRowsInserted",
    "operationMetrics.numTargetRowsUpdated",
    "operationMetrics.numOutputRows",
).show(1, False)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|0      |null           |null                 |null                |7            |
+-------+---------------+---------------------+--------------------+-------------+



In [15]:
# Generate the symlink manifest for the staging Delta table (for Athena access)
manifest_mode = "symlink_format_manifest"
dt.generate(manifest_mode)
print("SPARK_APP: Symlink Manifest file generated")

SPARK_APP: Symlink Manifest file generated


In [16]:
# Stop the Spark session now that the load is finished; cells that use `spark`
# cannot be re-run after this without creating a new session.
spark.stop()