In [1]:
# Script to load the plan type dimension table (edw.dim_plan_type)

In [2]:
# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, date_format, udf, lit
from pyspark.sql.types import StringType
from datetime import datetime
from delta import DeltaTable
import uuid

In [3]:
# JOB Parameters
# rundate is obtained from the get_rundate() helper imported from lib.utils;
# schema_name/table_name identify the target table and are combined into table_full_name.
rundate = get_rundate()
schema_name = "edw"
table_name = "dim_plan_type"
table_full_name = f"{schema_name}.{table_name}"
# f-string formatting keeps this log line from raising TypeError if rundate is not a str
print(f"SPARK_APP: JOB triggered for rundate - {rundate}")

SPARK_APP: JOB triggered for rundate - 20220101


In [4]:
# Generate Spark Session
# The string argument is presumably used as the application name -- confirm in lib.spark_session.
spark: SparkSession = get_spark_session(f"Dim load - {table_full_name}")
# uiWebUrl can be None (e.g. when the Spark UI is disabled); an f-string avoids
# the TypeError that "str" + None would raise and still logs the value.
print(f"SPARK_APP: Spark UI - {spark.sparkContext.uiWebUrl}")

SPARK_APP: Spark UI - http://b47c9213eb15:4040


In [6]:
# Static reference data for the Plan Type dimension: plan code -> plan name
plan_types = {
    "G": "GOLD",
    "P": "PLATINUM",
    "D": "DIAMOND",
    "S": "SILVER",
    "NA": "NOT APPLICABLE",
}

# Column names for the dataframe, in the same order as each data row
_schema = ["plan_type_code", "plan_name"]

# One [code, name] row per plan type (dict insertion order is preserved)
_data = [[code, name] for code, name in plan_types.items()]

# Build the dataframe from the in-memory rows
df = spark.createDataFrame(data=_data, schema=_schema)

In [8]:
# Attach the audit columns: the job rundate plus insert/update timestamps
df_dim = (
    df
    .withColumn("rundate", lit(rundate))
    .withColumn("insert_dt", current_timestamp())
    .withColumn("update_dt", current_timestamp())
)

# Display the rows that are about to be written
df_dim.show()

+--------------+--------------+--------+--------------------+--------------------+
|plan_type_code|     plan_name| rundate|           insert_dt|           update_dt|
+--------------+--------------+--------+--------------------+--------------------+
|             G|          GOLD|20220101|2023-03-19 07:01:...|2023-03-19 07:01:...|
|             P|      PLATINUM|20220101|2023-03-19 07:01:...|2023-03-19 07:01:...|
|             D|       DIAMOND|20220101|2023-03-19 07:01:...|2023-03-19 07:01:...|
|             S|        SILVER|20220101|2023-03-19 07:01:...|2023-03-19 07:01:...|
|            NA|NOT APPLICABLE|20220101|2023-03-19 07:01:...|2023-03-19 07:01:...|
+--------------+--------------+--------+--------------------+--------------------+



In [9]:
# Write every record to the Delta table, replacing existing contents (OVERWRITE mode)
(
    df_dim.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_full_name)
)
print("SPARK_APP: Dim data loaded")

SPARK_APP: Dim data loaded


In [10]:
# Record this run in JOB CONTROL, passing the current wall-clock time as the timestamp argument
job_timestamp = datetime.now()
insert_log(spark, schema_name, table_name, job_timestamp, rundate)
print("SPARK_APP: Update JOB Control Log")

SPARK_APP: Update JOB Control Log


In [11]:
# Show the most recent JOB CONTROL row (by insert_dt) for this table, untruncated
latest_log_query = f"select * from edw.job_control where table_name = '{table_name}' order by insert_dt desc limit 1"
spark.sql(latest_log_query).show(truncate=False)

+-----------+-------------+--------------------------+--------+--------------------------+
|schema_name|table_name   |max_timestamp             |rundate |insert_dt                 |
+-----------+-------------+--------------------------+--------+--------------------------+
|edw        |dim_plan_type|2023-03-19 07:03:41.954863|20220101|2023-03-19 07:03:45.094971|
+-----------+-------------+--------------------------+--------+--------------------------+



In [12]:
# Generate the symlink-format manifest for the Delta table (for Athena access)
manifest_mode = "symlink_format_manifest"
dt = DeltaTable.forName(spark, table_full_name)
dt.generate(manifest_mode)
print("SPARK_APP: Symlink Manifest file generated")

SPARK_APP: Symlink Manifest file generated


In [13]:
# Preview the loaded dimension table; the name comes from the table_full_name job
# parameter instead of a hard-coded copy, so it stays in sync with the table written above.
spark.sql(f"select * from {table_full_name}").show()

+--------------+--------------+--------+--------------------+--------------------+
|plan_type_code|     plan_name| rundate|           insert_dt|           update_dt|
+--------------+--------------+--------+--------------------+--------------------+
|            NA|NOT APPLICABLE|20220101|2023-03-19 07:03:...|2023-03-19 07:03:...|
|             P|      PLATINUM|20220101|2023-03-19 07:03:...|2023-03-19 07:03:...|
|             D|       DIAMOND|20220101|2023-03-19 07:03:...|2023-03-19 07:03:...|
|             S|        SILVER|20220101|2023-03-19 07:03:...|2023-03-19 07:03:...|
|             G|          GOLD|20220101|2023-03-19 07:03:...|2023-03-19 07:03:...|
+--------------+--------------+--------+--------------------+--------------------+



In [14]:
# Stop the Spark session now that the load and verification steps are done
spark.stop()