In [1]:
# Script to load the Plan Type dimension table (edw.dim_plan_type)

# Import Libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, date_format,udf,lit,col
from pyspark.sql.types import StringType
from datetime import datetime
from delta import DeltaTable
import uuid

In [2]:
# Job parameters: the run date plus the target schema and table identifiers
rundate = get_rundate()
schema_name, table_name = 'edw', 'dim_plan_type'
# Fully qualified table name in "<schema>.<table>" form
table_full_name = ".".join((schema_name, table_name))
print(f"SPARK_APP: JOB triggered for rundate - {rundate}")

SPARK_APP: JOB triggered for rundate - 20220101


In [3]:
# Obtain the Spark session that the remaining cells use
spark: SparkSession = get_spark_session()
print("SPARK APP: Dimension Table Load - " + table_full_name)

SPARK APP: Dimension Table Load - edw.dim_plan_type


In [4]:
# Static reference data for the Plan Type dimension.
# Each entry maps a plan type code to its plan name; insertion order is kept.
_plan_names_by_code = {
    "G": "GOLD",
    "P": "PLATINUM",
    "D": "DIAMOND",
    "S": "SILVER",
    "NA": "NOT APPLICABLE",
}

# Column names, in the same order as the values of each row
_schema = ["plan_type_code", "plan_name"]
# One [code, name] row per plan type
_data = [[code, name] for code, name in _plan_names_by_code.items()]

# Build the source dataframe from the rows and column names
df = spark.createDataFrame(_data, _schema)

In [5]:
# Audit columns appended to every row: the job run date and the load timestamps.
# Applied in this order so the resulting column order is unchanged.
_audit_columns = [
    ("rundate", lit(rundate)),
    ("insert_dt", current_timestamp()),
    ("update_dt", current_timestamp()),
]

df_dim = df
for _audit_name, _audit_expr in _audit_columns:
    df_dim = df_dim.withColumn(_audit_name, _audit_expr)

df_dim.show()

+--------------+--------------+--------+--------------------+--------------------+
|plan_type_code|     plan_name| rundate|           insert_dt|           update_dt|
+--------------+--------------+--------+--------------------+--------------------+
|             G|          GOLD|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
|             P|      PLATINUM|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
|             D|       DIAMOND|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
|             S|        SILVER|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
|            NA|NOT APPLICABLE|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
+--------------+--------------+--------+--------------------+--------------------+



In [6]:
# Replace the contents of the Delta table with the dimension rows (OVERWRITE mode)
_dim_writer = df_dim.write.format("delta").mode("overwrite")
_dim_writer.saveAsTable(table_full_name)
print("SPARK_APP: Dim data loaded")

SPARK_APP: Dim data loaded


In [7]:
# Record this run in JOB CONTROL.
# The current wall-clock time is passed as the fourth argument
# (presumably stored as max_timestamp - confirm against lib.job_control.insert_log).
_logged_at = datetime.now()
insert_log(spark, schema_name, table_name, _logged_at, rundate)
print("SPARK_APP: Update JOB Control Log")

SPARK_APP: Update JOB Control Log


In [8]:
# Show the most recent JOB CONTROL entry for this table to confirm the log insert.
# Filter on schema_name as well as table_name so that a same-named table in
# another schema cannot be returned as the "latest" entry by mistake.
_latest_job_control_sql = f"""
    select *
    from edw.job_control
    where schema_name = '{schema_name}'
      and table_name = '{table_name}'
    order by insert_dt desc
    limit 1
"""
spark.sql(_latest_job_control_sql).show(truncate=False)

+-----------+-------------+--------------------------+--------+--------------------------+
|schema_name|table_name   |max_timestamp             |rundate |insert_dt                 |
+-----------+-------------+--------------------------+--------+--------------------------+
|edw        |dim_plan_type|2024-06-03 02:50:44.532011|20220101|2024-06-03 02:50:48.781843|
+-----------+-------------+--------------------------+--------+--------------------------+



In [9]:
# Generate the symlink-format manifest for the Delta table (used for Athena access)
DeltaTable.forName(spark, table_full_name).generate("symlink_format_manifest")
print("SPARK_APP: Symlink Manifest file generated")

SPARK_APP: Symlink Manifest file generated


In [10]:
# Read the dimension table back to verify the load.
# Uses the table_full_name job parameter instead of a hard-coded name so this
# check always targets the same table the job wrote to.
spark.sql(f"select * from {table_full_name}").show()

+--------------+--------------+--------+--------------------+--------------------+
|plan_type_code|     plan_name| rundate|           insert_dt|           update_dt|
+--------------+--------------+--------+--------------------+--------------------+
|            NA|NOT APPLICABLE|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
|             P|      PLATINUM|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
|             D|       DIAMOND|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
|             S|        SILVER|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
|             G|          GOLD|20220101|2024-06-03 02:50:...|2024-06-03 02:50:...|
+--------------+--------------+--------+--------------------+--------------------+



In [11]:
# Stop the Spark session now that all job steps have run
spark.stop()