In [2]:
# Script to load Orders Data in Sales table using SCD type 2 table

# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, date_format,udf,lit,col
from pyspark.sql.types import StringType
from datetime import datetime
from delta import DeltaTable
import uuid

In [17]:
# Job Parameters
rundate = get_rundate()
schema_name = 'edw'
table_name = 'fact_sales'
table_full_name = f'{schema_name}.{table_name}'
staging_table_full_name = 'edw_stg.fact_sales_stg'

In [15]:
# generate spark sesion
spark:SparkSession = get_spark_session(f'Fact Table Load - {table_full_name}')
print("SPARK_APP: Spark UI - " + spark.sparkContext.uiWebUrl)


SPARK_APP: Spark UI - http://03205cdd01e3:4040


In [16]:
# config Spark Parameters
spark.conf.set("spark.sql.shuffle.partitions",8)
#spark.conf.set("spark.sql.parquet.mergeSchema",False)

In [18]:
# Read Staging Data
df_stg = spark.read.table(staging_table_full_name)

print("SPARK_APP: Staging Data Count - " + str(df_stg.count()))
print("SPARK_APP: Printing Staging Schema --")
df_stg.printSchema()

SPARK_APP: Staging Data Count - 163
SPARK_APP: Printing Staging Schema --
root
 |-- cust_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- tax: double (nullable = true)
 |-- discount: double (nullable = true)
 |-- line_total: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- invoice_num: string (nullable = true)
 |-- prod_id: string (nullable = true)
 |-- product_wid: string (nullable = true)
 |-- integration_key: string (nullable = true)
 |-- rundate: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- update_dt: timestamp (nullable = true)



In [19]:
# print staging data sample
df_stg.show(2,truncate = False)

+-------+--------+----------+---+----+--------+----------+----------------+----------------+-------+----------------------------------------------------------+------------------------------------------+--------+--------------------------+--------------------------+
|cust_id|store_id|order_date|qty|tax |discount|line_total|order_id        |invoice_num     |prod_id|product_wid                                               |integration_key                           |rundate |insert_dt                 |update_dt                 |
+-------+--------+----------+---+----+--------+----------+----------------+----------------+-------+----------------------------------------------------------+------------------------------------------+--------+--------------------------+--------------------------+
|C018   |S003    |2022-07-19|9  |11.0|5.4     |545.6     |ORD2022071900000|INV2022071900000|P009   |net.razorvine.pickle.objects.ClassDictConstructor@55c03255|ORD2022071900000~P009~S003~C018~2022-07-19|

In [21]:
# Read dim tables to join to with Fact
df_dim_store = spark.read.table("edw.dim_store").selectExpr("store_id", "row_wid as store_wid")
# df_dim_date = spark.read.table("edw.dim_date")
df_dim_customer = spark.read.table("edw.dim_customer").where("active_flag = 1").selectExpr("customer_id", "row_wid as customer_wid")

In [25]:
# Add additional columns
df_fact = df_stg\
                .join(df_dim_store,how='left_outer',on=df_stg.store_id == df_dim_store.store_id)\
                .join(df_dim_customer,how='left_outer',on=df_stg.cust_id == df_dim_customer.customer_id)\
                .withColumn("date_wid",date_format("order_date","yyyyMMdd"))\
                .withColumn("rundate",lit(rundate))\
                .withColumn("update_dt", current_timestamp())\
                .withColumn("insert_dt", current_timestamp())\
.select("date_wid", "product_wid", "store_wid", "customer_wid", "order_id", "invoice_num", 
           "qty", "tax", "discount", "line_total", "integration_key", "rundate", "insert_dt", "update_dt")

print("SPARK_APP: Fact Data Count - " + str(df_fact.count()))
print("SPARK_APP: Printing Fact Schema --")
df_fact.printSchema()

SPARK_APP: Fact Data Count - 163
SPARK_APP: Printing Fact Schema --
root
 |-- date_wid: string (nullable = true)
 |-- product_wid: string (nullable = true)
 |-- store_wid: string (nullable = true)
 |-- customer_wid: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- invoice_num: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- tax: double (nullable = true)
 |-- discount: double (nullable = true)
 |-- line_total: double (nullable = true)
 |-- integration_key: string (nullable = true)
 |-- rundate: string (nullable = false)
 |-- insert_dt: timestamp (nullable = false)
 |-- update_dt: timestamp (nullable = false)



In [26]:
# Validate DataFrame Data
df_fact.show(5, truncate = False)

+--------+----------------------------------------------------------+----------------------------------+------------------------------------+----------------+----------------+---+----+--------+----------+------------------------------------------+--------+--------------------------+--------------------------+
|date_wid|product_wid                                               |store_wid                         |customer_wid                        |order_id        |invoice_num     |qty|tax |discount|line_total|integration_key                           |rundate |insert_dt                 |update_dt                 |
+--------+----------------------------------------------------------+----------------------------------+------------------------------------+----------------+----------------+---+----+--------+----------+------------------------------------------+--------+--------------------------+--------------------------+
|20220719|net.razorvine.pickle.objects.ClassDictConstructor@55c0325

In [28]:
# load data into final table
df_fact.write.format('delta').mode("append").saveAsTable(table_full_name)
print("SPARK APP: Fact Table Loaded")

SPARK APP: Fact Table Loaded


In [29]:
# Update Job Control
insert_log(spark,schema_name,table_name,datetime.now(),rundate)
print("SPARK APP: Update Job Control")

SPARK APP: Update Job Control


In [31]:
# Delta table Metrics
dt_fact = DeltaTable.forName(spark,table_full_name)
dt_fact.history().limit(1).select("version","operationMetrics.executionTimeMs", 
                                 "operationMetrics.numTargetRowsInserted",
                                "operationMetrics.numTargetRowsUpdated",
                                "operationMetrics.numOutputRows").show(1, False)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|1      |null           |null                 |null                |163          |
+-------+---------------+---------------------+--------------------+-------------+



In [33]:
# Validate Data
spark.sql("select * from edw.fact_sales limit 10").show(truncate = False)


+--------+----------------------------------------------------------+----------------------------------+------------------------------------+----------------+----------------+---+------------------+--------+----------+------------------------------------------+--------+------------------------+------------------------+
|date_wid|product_wid                                               |store_wid                         |customer_wid                        |order_id        |invoice_num     |qty|tax               |discount|line_total|integration_key                           |rundate |insert_dt               |update_dt               |
+--------+----------------------------------------------------------+----------------------------------+------------------------------------+----------------+----------------+---+------------------+--------+----------+------------------------------------------+--------+------------------------+------------------------+
|20220719|net.razorvine.pickle.object

In [34]:
# Generate Symlink Manifest
dt_fact.generate("symlink_format_manifest")
print("manifest file genrated for athena access")

manifest file genrated for athena access


In [35]:
spark.stop()