In [1]:
# Script to load data into the Product Dimension Staging table
# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, lit,to_timestamp,lit,coalesce,split
from datetime import datetime
from delta import DeltaTable

In [2]:
# Job parameters: run date plus the target (staging) and source (landing) tables
rundate = get_rundate()

schema_name, table_name = "edw_stg", "dim_product_stg"
table_full_name = ".".join((schema_name, table_name))
landing_table_full_name = "edw_ld.dim_product_ld"

print("SPARK_APP: JOB triggered for rundate - " + rundate)

SPARK_APP: JOB triggered for rundate - 20220101


In [3]:
# Obtain the Spark session from the project helper, named after the target table
spark: SparkSession = get_spark_session(
    f"staging_load: {table_full_name}"
)
# Print the Spark UI address so the run can be monitored
print("SPARK_APP: Spark UI - " + spark.sparkContext.uiWebUrl)

SPARK_APP: Spark UI - http://03205cdd01e3:4040


In [4]:
# Session-level settings for this job:
#   - use 8 shuffle partitions
#   - enable schema merging for parquet reads
spark.conf.set("spark.sql.shuffle.partitions", 8)
spark.conf.set("spark.sql.parquet.mergeSchema", True)

In [5]:
# Look up the watermark recorded for this staging table in job control;
# it is used below to restrict the landing read to newer rows
max_timestamp = get_max_timestamp(spark, schema_name, table_name)
print(f"SPARK_APP: Max timestamp for staging data load - {max_timestamp}")

SPARK_APP: Max timestamp for staging data load - 1900-01-01 00:00:00.000000


In [6]:
# Incremental read: only landing rows inserted after the recorded watermark
df_ld = (
    spark.read
    .table(landing_table_full_name)
    .where(f"insert_dt > to_timestamp('{max_timestamp}')")
)

print(f"SPARK_APP: Landing Data Count - {df_ld.count()}")
print("SPARK_APP: Printing Landing Schema --")
df_ld.printSchema()

SPARK_APP: Landing Data Count - 14
SPARK_APP: Printing Landing Schema --
root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- type: string (nullable = true)
 |-- flavor: string (nullable = true)
 |-- size: string (nullable = true)
 |-- price: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- expiration_date: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)



In [7]:
# De-duplicate on the natural key: rank rows per product_id by insert_dt
# (newest first) and keep only the top-ranked row
df_dedupe = (
    df_ld
    .withColumn("_rnk", expr("row_number() over (partition by product_id order by insert_dt desc)"))
    .where("_rnk = 1")
    .drop("_rnk")
)

print(f"SPARK_APP: Landing Data Count after de-dupe - {df_dedupe.count()}")

SPARK_APP: Landing Data Count after de-dupe - 14


In [8]:
# Preview the first five de-duplicated rows without truncating column values
df_dedupe.show(n=5, truncate=False)

+----------+-----------------------+------------+----+-------+-------+-----+--------+---------------+----------------------------------------------------------+--------------------------+--------+
|product_id|product_name           |brand       |type|flavor |size   |price|quantity|expiration_date|image_url                                                 |insert_dt                 |rundate |
+----------+-----------------------+------------+----+-------+-------+-----+--------+---------------+----------------------------------------------------------+--------------------------+--------+
|P001      |Purina Pro Plan        |Purina      |Dry |Chicken|5 kgs  |Rs 20|50      |31-12-2024     |https://www.example.com/purina-pro-plan-chicken.jpg       |2024-06-01 07:47:16.371708|20220101|
|P002      |Hill's Science Diet    |Hill's      |Dry |Beef   |15 kgs |Rs 50|25      |31-12-2024     |https://www.example.com/hills-science-diet-beef.jpg       |2024-06-01 07:47:16.371708|20220101|
|P003      |Iam

In [10]:
# Shape the de-duplicated landing rows into the staging layout
df_stg = (
    df_dedupe
    # price arrives as text such as "Rs 20"; keep the token after the space as a double
    .withColumn("price", split("price", " ")[1].cast("double"))
    # expiration_date is a dd-MM-yyyy string; parse it into a date column
    .withColumn("expiration_dt", to_date("expiration_date", "dd-MM-yyyy"))
    # validity window and active flag
    # (the far-future end timestamp presumably marks an open-ended row - confirm)
    .withColumn("effective_start_dt", current_timestamp())
    .withColumn("effective_end_dt", to_timestamp(lit("9999-12-31 00:00:00.000000")))
    .withColumn("active_flg", lit(1))
    # audit columns; insert_dt overwrites the value carried over from landing
    .withColumn("insert_dt", current_timestamp())
    .withColumn("update_dt", current_timestamp())
    # the raw expiration string and quantity are not carried into staging
    .drop("expiration_date", "quantity")
)



In [11]:
# Preview the first five staging rows without truncating column values
df_stg.show(n=5, truncate=False)

+----------+-----------------------+------------+----+-------+-------+-----+----------------------------------------------------------+-------------------------+--------+-------------+-------------------------+-------------------+----------+-------------------------+
|product_id|product_name           |brand       |type|flavor |size   |price|image_url                                                 |insert_dt                |rundate |expiration_dt|effective_start_dt       |effective_end_dt   |active_flg|update_dt                |
+----------+-----------------------+------------+----+-------+-------+-----+----------------------------------------------------------+-------------------------+--------+-------------+-------------------------+-------------------+----------+-------------------------+
|P001      |Purina Pro Plan        |Purina      |Dry |Chicken|5 kgs  |20.0 |https://www.example.com/purina-pro-plan-chicken.jpg       |2024-06-01 08:14:22.67819|20220101|2024-12-31   |2024-06-01 0

In [12]:
# Write the prepared rows to the staging table in Delta format,
# replacing whatever the table currently holds
(
    df_stg.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_full_name)
)
print("SPARK_APP: Data written to staging table")

SPARK_APP: Data written to staging table


In [13]:
# Record this run in job control for the staging table.
# NOTE(review): the timestamp passed is the driver's current time, not the
# max insert_dt of the landing rows read above - confirm that landing rows
# arriving mid-run cannot be skipped by the next incremental load.
insert_log(
    spark,
    schema_name,
    table_name,
    datetime.now(),
    rundate,
)
print("SPARK_APP: Update JOB Control Log")

SPARK_APP: Update JOB Control Log


In [14]:
# Show the most recent job-control entry for this staging table.
# Filter on schema_name as well as table_name: job control is keyed by both
# (see get_max_timestamp / insert_log), so a same-named table in another
# schema must not be picked up here.
spark.sql(
    f"select * from edw.job_control "
    f"where schema_name = '{schema_name}' and table_name = '{table_name}' "
    f"order by insert_dt desc limit 1"
).show(truncate=False)

+-----------+---------------+--------------------------+--------+--------------------------+
|schema_name|table_name     |max_timestamp             |rundate |insert_dt                 |
+-----------+---------------+--------------------------+--------+--------------------------+
|edw_stg    |dim_product_stg|2024-06-01 08:17:19.552546|20220101|2024-06-01 08:17:20.460358|
+-----------+---------------+--------------------------+--------+--------------------------+



In [15]:
# Inspect the latest Delta history entry of the staging table and
# display its version together with selected operation metrics
dt = DeltaTable.forName(spark, table_full_name)
(
    dt.history()
    .limit(1)
    .select(
        "version",
        "operationMetrics.executionTimeMs",
        "operationMetrics.numTargetRowsInserted",
        "operationMetrics.numTargetRowsUpdated",
        "operationMetrics.numOutputRows",
    )
    .show(1, False)
)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|0      |null           |null                 |null                |14           |
+-------+---------------+---------------------+--------------------+-------------+



In [16]:
# Produce the symlink-format manifest for the Delta table so that
# Athena can access it
dt.generate("symlink_format_manifest")
print("SPARK_APP: Symlink Manifest file generated")

SPARK_APP: Symlink Manifest file generated


In [18]:
# Preview up to ten rows of the staging table that was just written.
# Use the table_full_name job parameter rather than a hard-coded name so this
# check always targets the table this job actually loaded.
spark.sql(f"select * from {table_full_name} limit 10").show(truncate=False)

+----------+-------------------------+-----------------+----+-------+-------+-----+-----------------------------------------------------------+--------------------------+--------+-------------+--------------------------+-------------------+----------+--------------------------+
|product_id|product_name             |brand            |type|flavor |size   |price|image_url                                                  |insert_dt                 |rundate |expiration_dt|effective_start_dt        |effective_end_dt   |active_flg|update_dt                 |
+----------+-------------------------+-----------------+----+-------+-------+-----+-----------------------------------------------------------+--------------------------+--------+-------------+--------------------------+-------------------+----------+--------------------------+
|P001      |Purina Pro Plan          |Purina           |Dry |Chicken|5 kgs  |20.0 |https://www.example.com/purina-pro-plan-chicken.jpg        |2024-06-01 08:16:58.

In [19]:
# Stop the Spark session now that the load has finished
spark.stop()