In [0]:
%run ../.././start_up 

In [0]:
# Notebook identity: used for the logger name and, further down, the Gold table name.
table_name = "dim_patients"

# create_logger and pipeline_config are expected to come from the %run start_up notebook.
dim_patients_logger = create_logger(notebook_name=table_name, log_level="DEBUG")
dim_patients_logger.info(f"🚀 Initializing {table_name} notebook")

# Extract frequently used config values into variables
catalog = pipeline_config["catalog"]
bronze_schema = pipeline_config["schemas"]["bronze"]
bronze_path = pipeline_config["paths"]["bronze_path"]
bronze_volume_path = pipeline_config["paths"]["bronze_volume_path"]
silver_schema = pipeline_config["schemas"]["silver"]
silver_path = pipeline_config["paths"]["silver_path"]
# The setup cell builds the Gold schema/table names from gold_schema, so it must be
# defined here for a fresh-kernel run.
# NOTE(review): assumes the config key is "gold", mirroring the other layers — confirm.
gold_schema = pipeline_config["schemas"]["gold"]
landing_schema = pipeline_config["schemas"]["landing"]
landing_path = pipeline_config["paths"]["landing_path"]
logs_schema = pipeline_config["schemas"]["logs"]
dim_patients_logger.info("Extracted frequently used config values into variables")

In [0]:

# --- Setup ---
# Fully qualified names of the Silver source table and the Gold target table
silver_table = f"{catalog}.{silver_schema}.patients"
gold_table = f"{catalog}.{gold_schema}.{table_name}"

# Make sure the Gold schema exists before anything is written into it
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{gold_schema}")


In [0]:
# dim_patients.py (Gold Layer - SCD Type 1 with Derived Columns, Surrogate Key, and PK - Managed Table)
#
# Flow of this cell:
#   1. Read the current (is_current = true) patient rows from the Silver table.
#   2. Drop the SCD2 bookkeeping columns and add the surrogate key plus derived columns.
#   3. If the Gold table does not exist: create it with a PK constraint and load it.
#      Otherwise: MERGE the rows into it (SCD Type 1 overwrite of attributes),
#      keeping the surrogate key of rows that already exist in Gold.
from pyspark.sql.functions import (
    coalesce,
    col,
    current_timestamp,
    lit,
    max as spark_max,
    monotonically_increasing_id,
    trim,
    upper,
    when,
)
from delta.tables import DeltaTable

# Name of the surrogate key column generated for the dimension
surrogate_key = "patient_sk"

# Read only active records from silver layer
dim_patients_logger.info(f"📥 Reading silver table: {silver_table}")
silver_df = spark.table(silver_table).filter("is_current = true")

# Drop SCD2 columns (DataFrame.drop silently ignores names that are not present)
columns_to_drop = ["is_current", "valid_from", "valid_to"]
silver_df = silver_df.drop(*columns_to_drop)

# Add surrogate key and derived columns
dim_patients_logger.info("✨ Adding surrogate key and derived columns")
silver_df = (
    silver_df
    # Unique within this DataFrame only; NOT stable across runs (handled in the MERGE branch)
    .withColumn(surrogate_key, monotonically_increasing_id())
    # when/otherwise (rather than the bare comparison) maps a NULL age to False
    .withColumn("is_minor", when(col("age") < 18, lit(True)).otherwise(lit(False)))
    .withColumn("is_senior", when(col("age") >= 60, lit(True)).otherwise(lit(False)))
    .withColumn(
        "region",
        when(upper(col("country")).isin("INDIA", "PAKISTAN", "SRI LANKA"), "Asia")
        .when(upper(col("country")).isin("USA", "CANADA"), "North America")
        .otherwise("Other"),
    )
    .withColumn("record_updated_ts", current_timestamp())
)

# Reorder columns to place surrogate key first
cols = [surrogate_key] + [c for c in silver_df.columns if c != surrogate_key]
silver_df = silver_df.select(cols)

# Extract schema string (one "name TYPE" line per column) for the CREATE TABLE statement
dim_patients_logger.info("🧱 Extracting schema for table creation")
schema_extracted = ",\n".join(
    f"    {field.name} {field.dataType.simpleString().upper()}"
    for field in silver_df.schema.fields
)

# Primary key from config (dim_table_config is expected to come from the start_up notebook)
primary_key = dim_table_config["dim_patients"]["primary_key"]

# Create table if not exists
dim_patients_logger.info("🧰 Checking if Gold table exists")
table_exists = spark.catalog.tableExists(gold_table)

if not table_exists:
    dim_patients_logger.info("📐 Creating managed Gold table with PK constraint")
    create_sql = f"""
        CREATE TABLE IF NOT EXISTS {gold_table} (
{schema_extracted},
            CONSTRAINT pk_dim_patients PRIMARY KEY ({primary_key})
        )
        USING DELTA
    """
    spark.sql(create_sql)

    dim_patients_logger.info("💾 Writing initial data")
    silver_df.write.format("delta") \
        .mode("overwrite") \
        .option("mergeSchema", "true") \
        .saveAsTable(gold_table)

else:
    dim_patients_logger.info("🔁 Table exists. Applying MERGE INTO for SCD Type 1")
    delta_table = DeltaTable.forName(spark, gold_table)

    # Surrogate keys must stay stable across loads. The freshly generated ids restart
    # on every run, so feeding them straight into whenMatchedUpdateAll() would rewrite
    # the key of every existing row and could hand a new row an id that is already
    # used in Gold. Instead:
    #   * rows whose primary key already exists in Gold reuse the stored surrogate key
    #   * new rows get their generated id shifted past the current maximum key
    existing_keys = delta_table.toDF().select(
        col(primary_key).alias("__existing_pk"),
        col(surrogate_key).alias("__existing_sk"),
    )
    max_existing_sk = existing_keys.agg(spark_max("__existing_sk")).first()[0]
    sk_offset = 0 if max_existing_sk is None else max_existing_sk + 1

    merge_source_df = (
        silver_df
        .join(
            existing_keys,
            silver_df[primary_key] == existing_keys["__existing_pk"],
            "left",
        )
        .withColumn(
            surrogate_key,
            coalesce(col("__existing_sk"), col(surrogate_key) + lit(sk_offset)),
        )
        .drop("__existing_pk", "__existing_sk")
    )

    delta_table.alias("target").merge(
        merge_source_df.alias("source"),
        f"target.{primary_key} = source.{primary_key}"
    ).whenMatchedUpdateAll() \
     .whenNotMatchedInsertAll() \
     .execute()

# Final view
# limit(1) answers "is there any row?" without counting the whole DataFrame.
if silver_df.limit(1).count() == 0:
    dim_patients_logger.info("⚠️ No active data found for dim_patients")
else:
    # Preview of the transformed source rows. The DataFrame is re-evaluated here, so
    # the surrogate keys shown are not necessarily the persisted ones — query the Gold
    # table to inspect those.
    display(silver_df)
