In [0]:
%run ../.././start_up 

In [0]:
# Target dimension table; also used to label this notebook's log records so
# they are attributed to dim_visits rather than to landing_to_bronze.
table_name = "dim_visits"

logger = create_logger(notebook_name=table_name, log_level="DEBUG")
logger.info(f"🚀 Initializing {table_name} notebook")

# Extract frequently used config values into variables
catalog = pipeline_config["catalog"]
bronze_schema = pipeline_config["schemas"]["bronze"]
bronze_path = pipeline_config["paths"]["bronze_path"]
bronze_volume_path = pipeline_config["paths"]["bronze_volume_path"]
silver_schema = pipeline_config["schemas"]["silver"]
silver_path = pipeline_config["paths"]["silver_path"]
# gold_schema is required by the setup cell (CREATE SCHEMA and gold_table name).
# NOTE(review): assumes the config exposes it as schemas["gold"], mirroring the
# other layers — confirm against pipeline_config.
gold_schema = pipeline_config["schemas"]["gold"]
landing_schema = pipeline_config["schemas"]["landing"]
landing_path = pipeline_config["paths"]["landing_path"]
logs_schema = pipeline_config["schemas"]["logs"]
logger.info("Extracted frequently used config values into variables")

In [0]:

# --- Setup ---
# Ensure the gold schema exists before any table is created inside it.
spark.sql("CREATE SCHEMA IF NOT EXISTS {}.{}".format(catalog, gold_schema))

# Fully qualified names of the source (silver) and target (gold) tables.
silver_table = "{}.{}.visits".format(catalog, silver_schema)
gold_table = "{}.{}.{}".format(catalog, gold_schema, table_name)


In [0]:
# dim_visits.py (Gold Layer - SCD Type 1 with PK - Managed Table)
from pyspark.sql.functions import col, current_timestamp
from delta.tables import DeltaTable

# Load the silver table, keeping only the rows flagged as current
logger.info(f"📅 Reading silver table: {silver_table}")
silver_df = spark.table(silver_table).where("is_current = true")

# Remove the SCD2 bookkeeping columns (only those actually present)
columns_to_drop = ["is_current", "valid_from", "valid_to"]
silver_df = silver_df.drop(
    *(name for name in silver_df.columns if name in columns_to_drop)
)

# Stamp every row with the time of this load
logger.info("✨ Adding record_updated_ts column")
silver_df = silver_df.withColumn("record_updated_ts", current_timestamp())

# Render the DataFrame schema as "name TYPE" lines for the CREATE TABLE statement
logger.info("🛠️ Extracting schema for table creation")
schema_extracted = ",\n".join(
    "    {} {}".format(field.name, field.dataType.simpleString().upper())
    for field in silver_df.schema
)

# Primary key column name comes from the dimension table config
primary_key = dim_table_config["dim_visits"]["primary_key"]

# First run: create the Gold table with its PK constraint and load it.
# Later runs: upsert the current silver rows into it (SCD Type 1).
logger.info("🛠️ Checking if Gold table exists")
table_exists = spark.catalog.tableExists(gold_table)

if not table_exists:
    logger.info("📐 Creating managed Gold table with PK constraint")
    # NOTE(review): the generated column list declares no nullability, while
    # Databricks documents primary-key columns as NOT NULL — confirm this DDL
    # is accepted for the configured key column.
    create_sql = f"""
        CREATE TABLE IF NOT EXISTS {gold_table} (
{schema_extracted},
            CONSTRAINT pk_dim_visits PRIMARY KEY ({primary_key})
        )
        USING DELTA
    """
    spark.sql(create_sql)

    logger.info("📂 Writing initial data")
    # Append rather than overwrite: the table was created empty just above, so
    # appending loads the same rows while leaving the table definition (and the
    # PK constraint declared on it) untouched.
    silver_df.write.format("delta") \
        .mode("append") \
        .option("mergeSchema", "true") \
        .saveAsTable(gold_table)

else:
    logger.info("🔄 Table exists. Applying MERGE INTO for SCD Type 1")
    delta_table = DeltaTable.forName(spark, gold_table)

    # Match on the primary key: matched rows are overwritten with the source
    # values, unmatched source rows are inserted.
    delta_table.alias("target").merge(
        silver_df.alias("source"),
        f"target.{primary_key} = source.{primary_key}"
    ).whenMatchedUpdateAll() \
     .whenNotMatchedInsertAll() \
     .execute()

# Final view: show the prepared rows, or log that there was nothing to load.
# isEmpty() only needs to find a single row, so it avoids counting the whole
# DataFrame just to test for emptiness.
if silver_df.isEmpty():
    logger.info("⚠️ No active data found for dim_visits")
else:
    display(silver_df)
