In [0]:
%run ../.././start_up 

In [0]:
# The header of the main cell identifies this notebook as fact_treatments.py, so the
# logger is registered under that name rather than the copy-pasted "landing_to_bronze".
logger = create_logger(notebook_name="fact_treatments", log_level="DEBUG")
logger.info("🚀 Initializing fact_treatments notebook")

# Extract frequently used config values into variables
catalog = pipeline_config["catalog"]
bronze_schema = pipeline_config["schemas"]["bronze"]
bronze_path = pipeline_config["paths"]["bronze_path"]
bronze_volume_path = pipeline_config["paths"]["bronze_volume_path"]
silver_schema = pipeline_config["schemas"]["silver"]
silver_path = pipeline_config["paths"]["silver_path"]
# gold_schema is required by the setup cell below (CREATE SCHEMA and gold_table name);
# without it the notebook fails with NameError on a fresh kernel.
# NOTE(review): assumes the config exposes the Gold schema under schemas["gold"],
# mirroring the bronze/silver/landing/logs keys — confirm against pipeline_config.
gold_schema = pipeline_config["schemas"]["gold"]
landing_schema = pipeline_config["schemas"]["landing"]
landing_path = pipeline_config["paths"]["landing_path"]
logs_schema = pipeline_config["schemas"]["logs"]
table_name = "fact_treatments"
logger.info("Extracted frequently used config values into variables")

In [0]:

# --- Setup ---
# Fully qualified namespace (catalog.schema) that hosts the Gold table
gold_namespace = f"{catalog}.{gold_schema}"

# Source (Silver) and target (Gold) table identifiers
silver_table = f"{catalog}.{silver_schema}.treatments"
gold_table = f"{gold_namespace}.{table_name}"

# Make sure the Gold schema exists before any table is created in it
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {gold_namespace}")


In [0]:
# fact_treatments.py (Gold Layer - SCD Type 1 with Derived Columns, Composite PK - Managed Table)
from pyspark.sql.functions import col, when, current_timestamp
from delta.tables import DeltaTable

# Load the Silver table, keeping only the rows flagged as current
logger.info(f"📥 Reading silver table: {silver_table}")
silver_df = spark.table(silver_table).filter("is_current = true")

# SCD2 bookkeeping columns are removed; only those actually present are dropped
columns_to_drop = ["is_current", "valid_from", "valid_to"]
present_scd2_columns = [name for name in columns_to_drop if name in silver_df.columns]
silver_df = silver_df.drop(*present_scd2_columns)

# Derived columns: a cost bucket plus a load timestamp
logger.info("✨ Adding derived column: treatment_cost_bucket")
treatment_cost = col("treatment_cost")
# Below 1000 -> "Low", below 10000 -> "Medium", everything else -> "High".
# A NULL treatment_cost matches neither comparison and therefore lands in "High".
cost_bucket = (
    when(treatment_cost < 1000, "Low")
    .when(treatment_cost < 10000, "Medium")
    .otherwise("High")
)
silver_df = (
    silver_df
    .withColumn("treatment_cost_bucket", cost_bucket)
    .withColumn("record_updated_ts", current_timestamp())
)

# Get composite primary key
primary_keys = dim_table_config["fact_treatments"]["primary_key"]  # ["treatment_id", "visit_id"]
primary_key_str = ", ".join(primary_keys)

# Extract schema string
# Each DataFrame field becomes one "<name> <TYPE>" line of the CREATE TABLE column list.
# Primary-key columns are additionally declared NOT NULL: Unity Catalog only accepts a
# PRIMARY KEY constraint on columns defined as NOT NULL, so without it the CREATE TABLE
# that carries the constraint is rejected. Matching is case-insensitive because Spark
# resolves column names case-insensitively by default.
logger.info("🧱 Extracting schema for table creation")
primary_key_lookup = {pk.lower() for pk in primary_keys}
schema_extracted = ",\n".join(
    f"    {field.name} {field.dataType.simpleString().upper()}"
    + (" NOT NULL" if field.name.lower() in primary_key_lookup else "")
    for field in silver_df.schema.fields
)

# Choose between an incremental upsert (table present) and first-time creation
logger.info("🧰 Checking if Gold table exists")
table_exists = spark.catalog.tableExists(gold_table)

if table_exists:
    # SCD Type 1: matched rows are overwritten in place, unmatched rows are inserted
    logger.info("🔁 Table exists. Applying MERGE INTO for SCD Type 1")
    delta_table = DeltaTable.forName(spark, gold_table)

    # Equality on every primary-key column, combined with AND
    merge_condition = " AND ".join(f"target.{pk} = source.{pk}" for pk in primary_keys)

    upsert = (
        delta_table.alias("target")
        .merge(silver_df.alias("source"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()
else:
    # First run: create the table with its primary-key constraint, then load the data
    logger.info("📐 Creating managed Gold table with composite PK")
    create_sql = f"""
        CREATE TABLE IF NOT EXISTS {gold_table} (
{schema_extracted},
            CONSTRAINT pk_fact_treatments PRIMARY KEY ({primary_key_str})
        )
        USING DELTA
    """
    spark.sql(create_sql)

    logger.info("💾 Writing initial data")
    initial_writer = (
        silver_df.write.format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
    )
    initial_writer.saveAsTable(gold_table)

# Final view
# isEmpty() only needs to find a single row, whereas count() == 0 evaluates the whole
# read/filter/derive pipeline just to answer an emptiness question.
if silver_df.isEmpty():
    logger.info("⚠️ No active data found for fact_treatments")
else:
    # Show the prepared DataFrame (the data that was written/merged above)
    display(silver_df)
