In [0]:
%run ../../utils/common

In [0]:
# Define widgets (text inputs; every default is the empty string)
dbutils.widgets.text("environment", "", "")
dbutils.widgets.text("table_key", "", "")
dbutils.widgets.text("job_name", "", "")
dbutils.widgets.text("proc_date", "", "")
dbutils.widgets.text("source_origin", "", "")

# Retrieve widget values into direct variables
environment   = dbutils.widgets.get("environment")
table_key     = dbutils.widgets.get("table_key")
job_name      = dbutils.widgets.get("job_name")
proc_date_str = dbutils.widgets.get("proc_date")
source_origin = dbutils.widgets.get("source_origin")

# Echo the parameters before the catalog lookup so they appear in the run
# output even when the lookup below fails.
print(f"Environment:   {environment}")
print(f"Table Key:     {table_key}")
print(f"Job Name:      {job_name}")
print(f"Proc Date:     {proc_date_str}")
print(f"Source Origin: {source_origin}")

# Get catalog from settings config.
# NOTE(review): `settings` is not defined in this notebook; presumably it is
# provided by the %run of ../../utils/common - confirm.
try:
    catalog = settings[environment]['catalog_name']
except KeyError as exc:
    # An empty or unknown environment (the widget default is "") would
    # otherwise surface as a bare KeyError with no context.
    raise ValueError(
        f"No 'catalog_name' configured in settings for environment '{environment}'"
    ) from exc

print(f"Catalog:       {catalog}")


In [0]:
# Turn the proc_date widget string into a datetime. Only the layout
# YYYY-MM-DDTHH:MM:SS.ffffff is accepted; anything else is rejected.
try:
    proc_date = datetime.strptime(proc_date_str, "%Y-%m-%dT%H:%M:%S.%f")
except ValueError:
    raise ValueError(f"Invalid date format: {proc_date_str}")

# Keep only the calendar date and store it, formatted as YYYY-MM-DD,
# as the "proc_date" job task value.
proc_date_val = proc_date.date()
dbutils.jobs.taskValues.set("proc_date", f"{proc_date_val:%Y-%m-%d}")


In [0]:
# Fetch the file, table, and other configs for the given source/table from the lookup table
lookup_df = (
    spark.table(f"{catalog}.default.lookup_table_raw_to_silver")
    .filter(
        (col("source_origin") == source_origin) &
        (col("table_key") == table_key)
    )
)
records = lookup_df.collect()
if not records:
    raise ValueError(f"No entry found in lookup table for source_origin = '{source_origin}' and table_key = '{table_key}'")
if len(records) > 1:
    # Duplicate configuration rows used to be resolved silently by taking
    # whichever row came back first; make the ambiguity visible in the run output.
    print(
        f"WARNING: {len(records)} lookup entries match source_origin = '{source_origin}' "
        f"and table_key = '{table_key}'; using the first one"
    )

record = records[0]
if record["raw_file_path"] is None:
    # A NULL path would otherwise fail below with an AttributeError on .rstrip().
    raise ValueError(
        f"Lookup entry for source_origin = '{source_origin}' and table_key = '{table_key}' has no raw_file_path"
    )
# Drop trailing slashes so that a date folder can be appended to the base path later.
raw_path_base = record["raw_file_path"].rstrip("/")
raw_format = record["raw_file_format"]
bronze_schema = record["bronze_schema"]
bronze_table_name = record["bronze_table_name"]
silver_layer_notebook_path = record["silver_layer_notebook_path"]
# Fully qualified name of the bronze target: <catalog>.<schema>.<table>
tbl_bronze = f"{catalog}.{bronze_schema}.{bronze_table_name}"

print(f"Raw Path Base: {raw_path_base}")
print(f"Raw Format: {raw_format}")
print(f"Bronze Schema: {bronze_schema}")
print(f"Bronze Table Name: {bronze_table_name}")
print(f"Silver Layer Notebook Path: {silver_layer_notebook_path}")
print(f"Target Bronze Table: {tbl_bronze}")


In [0]:
# Date folder for the processing date, in the form /yyyy=<year>/mm=<MM>/dd=<DD>
date_suffix = f"/yyyy={proc_date_val.year}/mm={proc_date_val:%m}/dd={proc_date_val:%d}"

# A base path that already contains a "yyyy=" segment is used unchanged;
# otherwise the date folder is appended to it.
raw_path = raw_path_base if "yyyy=" in raw_path_base else raw_path_base + date_suffix

print(f"Processing Date (date part): {proc_date_val}")
print(f"Reading from path: {raw_path}")


In [0]:
# Transformation and Write Function
def apply_transformations_and_write(df_raw, tbl_bronze, proc_date_val, write_mode="overwrite", cast_timestamp=False):
    """Rename the columns, stamp the processing date and save the frame to the bronze Delta table.

    Args:
        df_raw: Spark DataFrame holding the raw records.
        tbl_bronze: Name of the target table passed to ``saveAsTable``.
        proc_date_val: Processing date; written to the ``proc_date`` column and
            used in the ``replaceWhere`` predicate.
        write_mode: Save mode handed to the DataFrame writer (default ``"overwrite"``).
        cast_timestamp: When True and a ``file_creation_ts`` column exists after
            renaming, that column is cast to ``timestamp``.

    Note:
        Column names are converted with ``to_snake_case(name, source_origin)``.
        ``source_origin`` is not a parameter here; it is read from the
        notebook's global scope.
    """
    # Rename all columns in a single positional projection. Renaming one column
    # at a time with withColumnRenamed adds a projection per column and can
    # rename a later column twice if a new name collides with an original name.
    renamed_columns = [to_snake_case(col_name, source_origin) for col_name in df_raw.columns]
    df_renamed = df_raw.toDF(*renamed_columns)

    if cast_timestamp and 'file_creation_ts' in df_renamed.columns:
        df_renamed = df_renamed.withColumn("file_creation_ts", col("file_creation_ts").cast("timestamp"))

    df_to_write = df_renamed.withColumn("proc_date", to_date(lit(proc_date_val), "yyyy-MM-dd"))
    if "file_creation_ts" not in df_to_write.columns:
        # Guarantee the column exists in the written data, filled with NULL timestamps.
        df_to_write = df_to_write.withColumn("file_creation_ts", lit(None).cast("timestamp"))

    # NOTE(review): replaceWhere is documented by Delta Lake for overwrite mode;
    # presumably it has no effect when write_mode is "append" - confirm.
    (
        df_to_write.write
        .mode(write_mode)
        .option("mergeSchema", "true")
        .option("replaceWhere", f"proc_date = '{proc_date_val}'")
        .format("delta")
        .saveAsTable(tbl_bronze)
    )

# Source Processing Block
def process_all_sources(source_origin, spark, dbutils, raw_path, raw_format, tbl_bronze, proc_date_val, table_key=None):
    """Read the raw data for one source and load it into the bronze table.

    Three paths exist, selected by ``source_origin``:

    * ``"SAP BW WCM"``: rows of ``proc_date_val`` are deleted from the bronze
      table, then every ``.parquet`` file directly under ``raw_path`` is read
      and appended on its own. Files without rows, or without a
      ``file_creation_ts`` column, are skipped and reported.
    * ``"ROP"`` with a non-empty ``table_key``: ``raw_path`` is read as a whole;
      for the table keys listed in ``schema_mapping`` the columns are renamed
      positionally before writing.
    * anything else: ``raw_path`` is read as a whole and written as-is.

    Args:
        source_origin: Source system name that selects the path above.
        spark: Spark session used for reading and for the DELETE statement.
        dbutils: Databricks utilities object; only ``dbutils.fs.ls`` is used.
        raw_path: Location of the raw files.
        raw_format: Format name passed to ``spark.read.format``.
        tbl_bronze: Name of the target bronze table.
        proc_date_val: Processing date of the load.
        table_key: Optional table key; only consulted on the ROP path.
    """
    print(f"Executing logic for {source_origin}")

    if source_origin == "SAP BW WCM":
        # Remove rows already stored for this processing date; the files below are appended.
        # NOTE(review): this DELETE needs the bronze table to exist already -
        # confirm it is created before the first run of this source.
        spark.sql(f"DELETE FROM {tbl_bronze} WHERE proc_date = '{proc_date_val}'")
        parquet_file_list = [file.path for file in dbutils.fs.ls(raw_path) if file.path.endswith('.parquet')]
        print(f"Found {len(parquet_file_list)} Parquet files.")

        for file_path in parquet_file_list:
            df_raw = spark.read.format(raw_format).load(file_path)
            n_records = df_raw.count()
            if n_records == 0:
                print(f"No data found in file {file_path}")
            elif 'file_creation_ts' not in df_raw.columns:
                # Previously reported as "No data found", which hid the fact that
                # a file with rows was being left out of the load.
                print(f"Skipping file {file_path}: {n_records} records but no 'file_creation_ts' column")
            else:
                print(f"Ingesting {n_records} records from {file_path}")
                apply_transformations_and_write(df_raw, tbl_bronze, proc_date_val, write_mode="append", cast_timestamp=True)
                print(f"Success: Loaded data from {file_path}")

    elif source_origin == "ROP" and table_key:
        df_raw = spark.read.format(raw_format).load(raw_path)
        # Column names applied by position for the known ROP tables.
        schema_mapping = {
            "rop_order_report": [
                "serial_number", "order_id", "region", "city", "vendor_id", "vendor_name", "vendor_address",
                "sub_range_id", "sub_range_name", "store_id", "store_name", "store_address", "order_user",
                "product_stt", "product_id", "product_name", "barcode", "baseuom", "price",
                "purchase_order_quantity", "purchase_order_quantity_delivery",
                "purchase_order_quantity_delivered_actual", "order_date", "order_date_request",
                "order_date_expected_delivery", "time_frame_of_delivery", "delivery_confirmation_date",
                "time_frame_for_delivery_confirmation", "delivery_date", "status", "upd_date", "note_1", "note_2"
            ],
            "rop_problem_article": ["index", "domain", "area", "sku", "reason", "user_updated", "updated_at", "active_status"]
        }
        if table_key in schema_mapping:
            df_raw = df_raw.toDF(*schema_mapping[table_key])
        _write_frame_if_not_empty(df_raw, tbl_bronze, proc_date_val)

    else:
        df_raw = spark.read.format(raw_format).load(raw_path)
        _write_frame_if_not_empty(df_raw, tbl_bronze, proc_date_val)


def _write_frame_if_not_empty(df_raw, tbl_bronze, proc_date_val):
    """Write ``df_raw`` to the bronze table when it has rows; otherwise only report that the date has no data."""
    n_records = df_raw.count()
    if n_records > 0:
        print(f"Ingesting {n_records} records")
        apply_transformations_and_write(df_raw, tbl_bronze, proc_date_val)
        print(f"Success: Loaded data into {tbl_bronze}")
    else:
        print(f"No data found for date: {proc_date_val}")

# Silver Layer Notebook Execution Block
def execute_silver_layer_notebook(dbutils, silver_layer_notebook_path, tbl_bronze, proc_date_val, environment, bronze_table_name):
    """Run the silver layer notebook for a bronze table through ``dbutils.notebook.run``.

    Args:
        dbutils: Databricks utilities object providing ``notebook.run``.
        silver_layer_notebook_path: Path of the notebook to run.
        tbl_bronze: Bronze table name; only used in the error message.
        proc_date_val: Processing date; passed on formatted as ``YYYY-MM-DD``.
        environment: Passed to the notebook as the ``environment`` argument.
        bronze_table_name: Passed to the notebook as the ``table`` argument.

    Raises:
        ValueError: If the notebook path is ``None``, empty or only whitespace.
    """
    # None (for example a NULL value in the configuration row) is treated like a
    # blank path; calling .strip() on it directly would raise AttributeError.
    if silver_layer_notebook_path is None or not silver_layer_notebook_path.strip():
        raise ValueError(f"Silver notebook path for bronze table {tbl_bronze} does not exist.")
    print(f"Triggering Silver Layer Notebook: {silver_layer_notebook_path}")
    dbutils.notebook.run(
        silver_layer_notebook_path,
        # 0 is documented by Databricks as "no timeout".
        timeout_seconds=0,
        arguments={
            "proc_date": proc_date_val.strftime('%Y-%m-%d'),
            "environment": environment,
            "table": bronze_table_name,
            "layer": "silver"
        }
    )
    print(f"Completed Silver Layer Notebook: {silver_layer_notebook_path}")


In [0]:
# Source systems accepted by this notebook; any other value stops the run here.
SUPPORTED_SOURCES = [
    "SAP BW WCM",
    "ROP",
    "CAPILLARY",
    "CX LOYALTY",
    "DATA PORTAL",
    "SAP CAR",
    "SAP ERP WCM",
    "SUPRA",
]
if source_origin not in SUPPORTED_SOURCES:
    raise ValueError(f"Processing logic is not defined for source_origin: '{source_origin}'")

# Main execution, step 1: load the raw data into the bronze table.
process_all_sources(
    source_origin, spark, dbutils, raw_path, raw_format, tbl_bronze, proc_date_val,
    table_key=table_key,
)

# Main execution, step 2: run the silver layer notebook configured for this table.
execute_silver_layer_notebook(
    dbutils, silver_layer_notebook_path, tbl_bronze, proc_date_val, environment, bronze_table_name,
)
