In [0]:
%pip install dotenv
%restart_python

### Importing helper functions with reload, so that changes in that file are reflected here

In [0]:
# Import the helper module, then force a reload so that edits made to it since the
# last import are picked up without restarting the Python process.
# On a fresh interpreter the reload is redundant but harmless.
import helper_functions
from importlib import reload
reload(helper_functions)

# Bind the helper names from the freshly reloaded module
from helper_functions import sanitize_name, add_ingestion_columns


### Importing libraries

In [0]:
import os
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql import types as T
import sys
from dotenv import load_dotenv
from functools import reduce
from delta.tables import DeltaTable
from pyspark.sql.window import Window

In [0]:
# Load key/value pairs from the .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(override=True)

### Loading variable values from .env

In [0]:
# Deployment settings come from the environment (populated from .env);
# any key that is not set yields None.
catalog, bronze_schema, silver_schema, container, storage_account = (
    os.getenv(setting_key)
    for setting_key in (
        "catalog",
        "bronze_schema",
        "silver_schema",
        "container",
        "storage_account",
    )
)

### Creating Schema

In [0]:
%sql
-- Create the Silver schema (if it does not exist yet) at an explicit abfss:// storage location.
-- NOTE(review): catalog, schema, container and storage account are hard-coded in this cell,
-- while the Python cells read them from .env -- confirm the two stay in sync.
CREATE SCHEMA IF NOT EXISTS veersadatabricks.silver
LOCATION 'abfss://veersacontainer@storage12092004.dfs.core.windows.net/silver/';

### Tables

In [0]:
# Names of every Bronze table whose type is MANAGED or EXTERNAL; other table types are skipped.
bronze_tables = [
    table.name
    for table in spark.catalog.listTables(f"{catalog}.{bronze_schema}")
    if table.tableType in ("MANAGED", "EXTERNAL")
]

In [0]:
# from delta.tables import DeltaTable
# import pyspark.sql.functions as F

# def run_scd2_for_patientinfo(catalog, bronze_schema, silver_schema):
#     # Read Bronze patientinfo
#     bronze_df = spark.read.table(f"{catalog}.{bronze_schema}.patientinfo")

#     # Basic cleaning (rename cols, lowercase)
#     for col in bronze_df.columns:
#         new_col = sanitize_name(col)
#         if new_col != col:
#             bronze_df = bronze_df.withColumnRenamed(col, new_col)
#     bronze_df = bronze_df.select([F.col(c).alias(c.lower()) for c in bronze_df.columns])

#     # Add start_date (use ingested_at or _source_file_mod_ts from Bronze)
#     bronze_df = bronze_df.withColumn("effective_start_date", F.col("_ingested_at")) \
#                         .withColumn("effective_end_date", F.lit(None).cast("timestamp")) \
#                         .withColumn("is_current", F.lit(True))

#     silver_table_path = f"{catalog}.{silver_schema}.patientinfo_silver"

#     # If table exists, perform SCD2 merge
#     if DeltaTable.isDeltaTable(spark, f"/user/hive/warehouse/{silver_schema}.db/patientinfo_silver"):
#         silver_tbl = DeltaTable.forName(spark, silver_table_path)

#         # Join condition on natural key
#         join_cond = "tgt.patient_id = src.patient_id"

#         # Columns to compare for changes (exclude metadata columns)
#         compare_cols = [
#         "sex", "age", "country", "province", "city", "infection_case",
#         "infected_by", "contact_number", "symptom_onset_date",
#         "confirmed_date", "released_date", "deceased_date", "state"
#         ]

#         # Expression to detect any change
#         change_cond = " OR ".join([f"tgt.{c} <> src.{c}" for c in compare_cols])

#         (
#             silver_tbl.alias("tgt")
#             .merge(bronze_df.alias("src"), join_cond)
#             .whenMatchedUpdate(
#                 condition=change_cond,
#                 set={
#                     "effective_end_date": F.col("src.effective_start_date"),
#                     "is_current": F.lit(False)
#                 }
#             )
#             .whenNotMatchedInsertAll()
#             .execute()
#         )
#         # After this, you separately insert the new changed rows
#         changed_records = bronze_df.join(silver_tbl.toDF().filter("is_current = false"),join_cond, "inner").select("src.*")

#         changed_records.write.format("delta").mode("append").saveAsTable(silver_table_path)

#     else:
#         # First load → initialize Silver table
#         (bronze_df.write.format("delta")
#                 .mode("overwrite")
#                 .option("overwriteSchema","true")
#                 .saveAsTable(silver_table_path))


In [0]:
# %sql
# drop table veersadatabricks.silver.patientinfo_silver

### Function for scd2
run_scd2_for_patientinfo reads the Bronze patientinfo table, normalizes column names, deduplicates identical business records (keeping the latest ingestion), converts the result into SCD-2 style rows (effective_start/end, is_current), writes that dataset into a Silver Delta table (overwrite mode), then updates older rows in that Silver table to mark them as historical (is_current = False and set an effective_end_date).
- `_ingested_at` is used as `effective_start_date` because it represents the time at which this snapshot became visible to our system.

In [0]:
def run_scd2_for_patientinfo(catalog, bronze_schema, silver_schema):
    """Rebuild the SCD-2 style Silver table for ``patientinfo`` from Bronze.

    Steps:
      1. Read ``{catalog}.{bronze_schema}.patientinfo`` and normalise its column
         names (each name is passed through ``sanitize_name`` and then lower-cased).
      2. Collapse rows whose business columns are identical, keeping the one with
         the latest ``_ingested_at``.
      3. Overwrite ``{catalog}.{silver_schema}.patientinfo_silver`` with every
         remaining version, initially written as open/current rows.
      4. Close every version except the newest one per ``patient_id``:
         ``is_current`` becomes False and ``effective_end_date`` becomes the
         ``effective_start_date`` of the version that superseded it.

    Args:
        catalog: Catalog that holds both the Bronze and the Silver schema.
        bronze_schema: Schema name of the Bronze ``patientinfo`` table.
        silver_schema: Schema name in which ``patientinfo_silver`` is written.

    Returns:
        None. The Silver table is overwritten and a completion message is printed.

    Notes:
        Relies on the notebook-level ``spark`` session and on the Bronze table
        exposing an ``_ingested_at`` column after name normalisation.
    """
    # Read Bronze patientinfo
    bronze_df = spark.read.table(f"{catalog}.{bronze_schema}.patientinfo")

    # Basic cleaning (rename cols, lowercase). The loop variable is deliberately
    # not called `col`, to avoid shadowing pyspark's `col` function.
    for column_name in bronze_df.columns:
        clean_name = sanitize_name(column_name)
        if clean_name != column_name:
            bronze_df = bronze_df.withColumnRenamed(column_name, clean_name)
    bronze_df = bronze_df.select([F.col(c).alias(c.lower()) for c in bronze_df.columns])

    # Business columns (exclude metadata)
    business_cols = [
        "patient_id", "sex", "age", "country", "province", "city",
        "infection_case", "infected_by", "contact_number",
        "symptom_onset_date", "confirmed_date",
        "released_date", "deceased_date", "state"
    ]

    # Dedup -> keep latest ingestion for identical business data
    window_spec = Window.partitionBy(*business_cols).orderBy(F.col("_ingested_at").desc())
    bronze_df = (
        bronze_df
        .withColumn("rn", F.row_number().over(window_spec))
        .filter(F.col("rn") == 1)
        .drop("rn")
    )

    # Add SCD2 metadata: every version starts out open and current; superseded
    # versions are closed by the merge below.
    bronze_df = (
        bronze_df
        .withColumn("effective_start_date", F.col("_ingested_at"))
        .withColumn("effective_end_date", F.lit(None).cast("timestamp"))
        .withColumn("is_current", F.lit(True))
    )

    # Fully qualified Silver table name (catalog.schema.table)
    silver_table_path = f"{catalog}.{silver_schema}.patientinfo_silver"

    # (Re)create the Silver table with all versions
    (
        bronze_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(silver_table_path)
    )
    # DeltaTable handle on the table just written; required for the merge below
    silver_tbl = DeltaTable.forName(spark, silver_table_path)

    # Newest version first within each patient_id
    window_patient = Window.partitionBy("patient_id").orderBy(F.col("effective_start_date").desc())

    # rn == 1 is the current version. With the descending order, lag() returns the
    # start date of the next-newer version, i.e. the moment this version was superseded.
    silver_versions = (
        silver_tbl.toDF()
        .withColumn("rn", F.row_number().over(window_patient))
        .withColumn("successor_start_date", F.lag("effective_start_date").over(window_patient))
    )
    superseded = (
        silver_versions
        .filter(F.col("rn") > 1)
        .select("patient_id", "effective_start_date", "successor_start_date")
    )

    # Close historical rows (rn > 1). The end date is the successor's start date
    # rather than the job's wall-clock time, so it stays stable across re-runs.
    # NOTE(review): the merge key assumes (patient_id, effective_start_date) is unique
    # per version -- confirm no two versions of a patient share an _ingested_at value.
    silver_tbl.alias("tgt").merge(
        superseded.alias("src"),
        "tgt.patient_id = src.patient_id AND tgt.effective_start_date = src.effective_start_date"
    ).whenMatchedUpdate(
        set={
            "is_current": F.lit(False),
            "effective_end_date": F.col("src.successor_start_date").cast("timestamp"),
        }
    ).execute()
    print("SCD2 merge for patientinfo complete.")

## WORKING PERFECTLY


In [0]:
# Promote every Bronze table to Silver. patientinfo goes through the SCD2 routine;
# all other tables are copied across with a full overwrite.
for bronze_table in bronze_tables:
    # The Silver name is derived mechanically from the Bronze name
    silver_table = f"{sanitize_name(bronze_table)}_silver"
    print(f"Processing {bronze_table} → {silver_table}")

    if bronze_table == "patientinfo":
        # Handled by the dedicated SCD2 function instead of the plain copy below.
        # TODO(review): the original note said "maybe run this twice" -- confirm whether
        # a second run is ever required.
        run_scd2_for_patientinfo(catalog, bronze_schema, silver_schema)
        continue

    source_name = f"{catalog}.{bronze_schema}.{bronze_table}"
    target_name = f"{catalog}.{silver_schema}.{silver_table}"

    # Column-name standardization (sanitize_name + lowercase) is intentionally not
    # applied on this path; only ingestion metadata is added by the helper.
    # NOTE(review): the exact columns added by add_ingestion_columns are defined in
    # helper_functions -- presumably ingestion timestamp / current flag; verify there.
    df = add_ingestion_columns(spark.read.table(source_name))

    # Replace the Silver table (data and schema) with the fresh copy
    silver_writer = (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    silver_writer.saveAsTable(target_name)

In [0]:
from pyspark.sql.functions import *

# Sanity check of the SCD2 output. Table names are built from the same .env-driven
# settings the pipeline writes with, so this cell always inspects the table that
# was just produced rather than a hard-coded one.
df = spark.table(f"{catalog}.{silver_schema}.patientinfo_silver")
print(df.count())
df_bronze = spark.table(f"{catalog}.{bronze_schema}.patientinfo")
#print(df_bronze.count())

# All stored versions of one sample patient, followed by the whole table
df.filter(F.col("patient_id") == 1000000001).display()
df.display()


In [0]:
# patientinfo = spark.read.table("veersadatabricks.silver.patientinfo_silver")
# patientinfo.printSchema()
# print(patientinfo.count())
# case_table_silver = spark.read.table("veersadatabricks.Silver.case_table_silver")
# case_table_silver.printSchema()

In [0]:
# case_bronze = spark.table("veersadatabricks.bronze.case_table")
# #case_bronze.printSchema()
# case_silver = (
#     case_bronze
#     .withColumnRenamed("_case_id", "caseId")
#     .withColumnRenamed("confirmed", "confirmedCases")
#     .withColumn("confirmedCases", col("confirmedCases").cast("int"))
#     .withColumn("city", trim(col("city")))
#     .filter(
#         col("caseId").isNotNull() &
#         col("confirmedCases").isNotNull() 
#         #&
#         #(col("city").isNotNull() & (col("city") != "-"))
#     )
#     .dropDuplicates()
# )

In [0]:
# def run_scd2_for_patientinfo(catalog, bronze_schema, silver_schema):
#     # Read and process bronze data (same as before)
#     bronze_df = spark.read.table(f"{catalog}.{bronze_schema}.patientinfo")
    
#     # ... (same cleaning and dedup logic as original)
#     for col in bronze_df.columns:
#         new_col = sanitize_name(col)
#         if new_col != col:
#             bronze_df = bronze_df.withColumnRenamed(col, new_col)
#     bronze_df = bronze_df.select([F.col(c).alias(c.lower()) for c in bronze_df.columns])

#     business_cols = ["patient_id", "sex", "age", "country", "province", "city", "infection_case", "infected_by", "contact_number", "symptom_onset_date", "confirmed_date", "released_date", "deceased_date", "state"]

#     window_spec = Window.partitionBy(*business_cols).orderBy(F.col("_ingested_at").desc())
#     # Add SCD2 metadata
#     bronze_df = (
#         bronze_df
#         .withColumn("effective_start_date", F.col("_ingested_at"))
#         .withColumn("effective_end_date", F.lit(None).cast("timestamp"))
#         .withColumn("is_current", F.lit(True))
#     )

#     silver_table_path = f"{catalog}.{silver_schema}.patientinfo_silver"

#     if spark.catalog.tableExists(silver_table_path):
#         silver_tbl = DeltaTable.forName(spark, silver_table_path)
        
#         # Get current active records from silver
#         current_silver = spark.read.table(silver_table_path).filter(F.col("is_current") == True)
        
#         # Identify what's changed by comparing business columns
#         changed_patients = (
#             bronze_df.alias("bronze")
#             .join(current_silver.alias("silver"), "patient_id", "inner")
#             .filter(
#                 reduce(lambda x, y: x | y, [
#                     ~F.col(f"bronze.{col}").eqNullSafe(F.col(f"silver.{col}")) 
#                     for col in business_cols if col != "patient_id"
#                 ])
#             )
#             .select("bronze.patient_id")
#             .distinct()
#         )
        
#         changed_patient_ids = [row.patient_id for row in changed_patients.collect()]
        
#         if changed_patient_ids:
#             print(f"Found {len(changed_patient_ids)} changed patients")
            
#             # Close old versions
#             silver_tbl.update(
#                 condition=F.col("patient_id").isin(changed_patient_ids) & (F.col("is_current") == True),
#                 set={
#                     "effective_end_date": F.current_timestamp(),
#                     "is_current": F.lit(False)
#                 }
#             )
            
#             # Insert new versions (changed records)
#             new_changed_records = bronze_df.filter(F.col("patient_id").isin(changed_patient_ids))
#             new_changed_records.write.format("delta").mode("append").saveAsTable(silver_table_path)
        
#         # Insert completely new records
#         existing_patient_ids = current_silver.select("patient_id").distinct().rdd.map(lambda x: x[0]).collect()
#         new_records = bronze_df.filter(~F.col("patient_id").isin(existing_patient_ids))
        
#         if new_records.count() > 0:
#             print(f"Inserting {new_records.count()} new patient records")
#             new_records.write.format("delta").mode("append").saveAsTable(silver_table_path)
    
#     else:
#         # First load
#         bronze_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(silver_table_path)

#     print("✅ SCD2 processing complete.")

In [0]:
# case_silver.write.format("delta") \
#     .mode("overwrite") \
#     .option("path", f"abfss://{container}@{storage_account}.dfs.core.windows.net/Silver/your_table") \
#     .saveAsTable("veersadatabricks.Silver.case_silver")

In [0]:
# from delta.tables import DeltaTable
# import pyspark.sql.functions as F
# from pyspark.sql.window import Window

# def run_scd2_for_patientinfo(catalog, bronze_schema, silver_schema):
#     # Read Bronze patientinfo
#     bronze_df = spark.read.table(f"{catalog}.{bronze_schema}.patientinfo")

#     # Basic cleaning (rename cols, lowercase)
#     for col in bronze_df.columns:
#         new_col = sanitize_name(col)
#         if new_col != col:
#             bronze_df = bronze_df.withColumnRenamed(col, new_col)
#     bronze_df = bronze_df.select([F.col(c).alias(c.lower()) for c in bronze_df.columns])

#     # Business columns (exclude metadata)
#     business_cols = [
#         "patient_id", "sex", "age", "country", "province", "city",
#         "infection_case", "infected_by", "contact_number",
#         "symptom_onset_date", "confirmed_date",
#         "released_date", "deceased_date", "state"
#     ]

#     # Dedup → keep latest ingestion for identical business data
#     window_spec = Window.partitionBy(*business_cols).orderBy(F.col("_ingested_at").desc())
#     bronze_df = (
#         bronze_df
#         .withColumn("rn", F.row_number().over(window_spec))
#         .filter(F.col("rn") == 1)
#         .drop("rn")
#     )

#     # Add SCD2 metadata
#     bronze_df = (
#         bronze_df
#         .withColumn("effective_start_date", F.col("_ingested_at"))
#         .withColumn("effective_end_date", F.lit(None).cast("timestamp"))
#         .withColumn("is_current", F.lit(True))
#     )

#     # Silver table path (catalog + schema)
#     silver_table_path = f"{catalog}.{silver_schema}.patientinfo_silver"
#     silver_table_fs_path = f"/user/hive/warehouse/{silver_schema}.db/patientinfo_silver"

#     if spark.catalog.tableExists(silver_table_path):
#         silver_tbl = DeltaTable.forName(spark, silver_table_path)
#         print("in here!!!!!!!")
#         # Natural key
#         join_cond = "tgt.patient_id = src.patient_id AND tgt.is_current = true"

#         # Change detection (null-safe)
#         compare_cols = [c for c in business_cols if c != "patient_id"]
#         # compare_cols.append(F.col("src._ingested_at") > F.col("tgt.effective_start_date"))

#         # # Combine all conditions with OR
#         # from functools import reduce
#         # from operator import or_
#         # change_cond = reduce(or_, compare_cols)
#         # #change_cond = " OR ".join([f"NOT (tgt.{c} <=> src.{c})" for c in compare_cols])

#         compare_exprs = [~(F.expr(f"tgt.{c} <=> src.{c}")) for c in compare_cols]

#         # If you also want to include _ingested_at check
#         compare_exprs.append(F.col("src._ingested_at") > F.col("tgt.effective_start_date"))

#         # Combine all conditions into a single OR expression
#         change_cond = reduce(or_, compare_exprs)

#         (
#             silver_tbl.alias("tgt")
#             .merge(bronze_df.alias("src"), join_cond)
#             .whenMatchedUpdate(
#                 condition=change_cond,
#                 set={
#                     "effective_end_date": F.col("src.effective_start_date"),
#                     "is_current": F.lit(False)
#                 }
#             )
#             .whenNotMatchedInsertAll()
#             .execute()
#         )

#     else:
#         # First load → create Silver table
#         (
#             bronze_df.write.format("delta")
#             .mode("overwrite")
#             .option("overwriteSchema", "true")
#             .saveAsTable(silver_table_path)
#         )

#     print("✅ SCD2 merge for patientinfo complete.")

# ## WORKING PERFECTLY


In [0]:
# from delta.tables import DeltaTable
# import pyspark.sql.functions as F
# from pyspark.sql.window import Window

# def run_scd2_for_patientinfo(catalog, bronze_schema, silver_schema):
#     # Read Bronze patientinfo
#     bronze_df = spark.read.table(f"{catalog}.{bronze_schema}.patientinfo")

#     # Basic cleaning (rename cols, lowercase)
#     for col in bronze_df.columns:
#         new_col = sanitize_name(col)
#         if new_col != col:
#             bronze_df = bronze_df.withColumnRenamed(col, new_col)
#     bronze_df = bronze_df.select([F.col(c).alias(c.lower()) for c in bronze_df.columns])

#     # Business columns
#     business_cols = [
#         "patient_id", "sex", "age", "country", "province", "city",
#         "infection_case", "infected_by", "contact_number",
#         "symptom_onset_date", "confirmed_date",
#         "released_date", "deceased_date", "state"
#     ]

#     # Dedup → keep latest ingestion for identical business data
#     window_spec = Window.partitionBy(*business_cols).orderBy(F.col("_ingested_at").desc())
#     bronze_df = (
#         bronze_df
#         .withColumn("rn", F.row_number().over(window_spec))
#         .filter(F.col("rn") == 1)
#         .drop("rn")
#     )

#     # Add SCD2 metadata
#     bronze_df = (
#         bronze_df
#         .withColumn("effective_start_date", F.col("_ingested_at"))
#         .withColumn("effective_end_date", F.lit(None).cast("timestamp"))
#         .withColumn("is_current", F.lit(True))
#     )

#     # Silver table path
#     silver_table_path = f"{catalog}.{silver_schema}.patientinfo_silver"
#     silver_table_fs_path = f"/user/hive/warehouse/{silver_schema}.db/patientinfo_silver"
#     compare_cols = [c for c in business_cols if c != "patient_id"]
#     if DeltaTable.isDeltaTable(spark, silver_table_fs_path):
#         silver_tbl = DeltaTable.forName(spark, silver_table_fs_path)

#         join_cond = "tgt.patient_id = src.patient_id AND tgt.is_current = true"

#         # compare_cols = [c for c in business_cols if c != "patient_id"]
#         # change_cond = " OR ".join([f"NOT (tgt.{c} <=> src.{c})" for c in compare_cols])
#         print("Got here!!!!!!!!!!!!!!")
#         compare_exprs = [f"NOT (tgt.{c} <=> src.{c})" for c in compare_cols]
#        # change_cond_col = reduce(or_, compare_exprs)  # combine with OR
#         change_cond_col = " OR ".join(compare_exprs)
#         (
#             silver_tbl.alias("tgt")
#             .merge(bronze_df.alias("src"), "tgt.patient_id = src.patient_id AND tgt.is_current = true")
#             # 1. Close existing record if any change
#             .whenMatchedUpdate(
#                 condition=change_cond_col,
#                 set={
#                     "effective_end_date": F.col("src.effective_start_date"),
#                     "is_current": F.lit(False)
#                 }
#             )
#             # 2. Always insert new record (if not exact duplicate)
#             .whenNotMatchedInsertAll()
#             .whenMatchedInsert(
#                 condition=change_cond_col,
#                 values={c: F.col(f"src.{c}") for c in bronze_df.columns}
#             )
#             .execute()
#         )

#     else:
#         (
#             bronze_df.write.format("delta")
#             .mode("overwrite")
#             .option("overwriteSchema", "true")
#             .saveAsTable(silver_table_path)
#         )

#     print("✅ SCD2 merge for patientinfo complete.")


In [0]:
# from delta.tables import DeltaTable
# import pyspark.sql.functions as F
# from pyspark.sql.window import Window

# def run_scd2_for_patientinfo(catalog, bronze_schema, silver_schema):
#     # Read Bronze patientinfo
#     bronze_df = spark.read.table(f"{catalog}.{bronze_schema}.patientinfo")

#     # Basic cleaning (rename cols, lowercase)
#     for col in bronze_df.columns:
#         new_col = sanitize_name(col)
#         if new_col != col:
#             bronze_df = bronze_df.withColumnRenamed(col, new_col)
#     bronze_df = bronze_df.select([F.col(c).alias(c.lower()) for c in bronze_df.columns])

#     # Business columns (exclude metadata)
#     business_cols = [
#         "patient_id", "sex", "age", "country", "province", "city",
#         "infection_case", "infected_by", "contact_number",
#         "symptom_onset_date", "confirmed_date",
#         "released_date", "deceased_date", "state"
#     ]

#     # Dedup → keep latest ingestion for identical business data
#     window_spec = Window.partitionBy(*business_cols).orderBy(F.col("_ingested_at").desc())
#     bronze_df = (
#         bronze_df
#         .withColumn("rn", F.row_number().over(window_spec))
#         .filter(F.col("rn") == 1)
#         .drop("rn")
#     )

#     # Add SCD2 metadata
#     bronze_df = (
#         bronze_df
#         .withColumn("effective_start_date", F.col("_ingested_at"))
#         .withColumn("effective_end_date", F.lit(None).cast("timestamp"))
#         .withColumn("is_current", F.lit(True))
#     )

#     # Silver table path
#     silver_table_path = f"{catalog}.{silver_schema}.patientinfo_silver"

#     if spark.catalog.tableExists(silver_table_path):
#         print("Processing existing silver table...")
        
#         # Read current silver data
#         silver_df = spark.read.table(silver_table_path)
#         current_silver = silver_df.filter(F.col("is_current") == True)
        
#         # Find what records have changed by comparing business columns only
#         compare_cols = [c for c in business_cols if c != "patient_id"]
        
#         # Get records that exist in both bronze and current silver
#         existing_patients = (
#             bronze_df.alias("bronze")
#             .join(current_silver.alias("silver"), "patient_id", "inner")
#             .select(
#                 "bronze.*",
#                 # Add flags to check if data changed
#                 *[
#                     (~F.col(f"bronze.{col}").eqNullSafe(F.col(f"silver.{col}"))).alias(f"{col}_changed")
#                     for col in compare_cols
#                 ]
#             )
#         )
        
#         # Create a single change flag
#         change_conditions = [F.col(f"{col}_changed") for col in compare_cols]
#         combined_change_flag = change_conditions[0]
#         for condition in change_conditions[1:]:
#             combined_change_flag = combined_change_flag | condition
            
#         existing_patients = existing_patients.withColumn("has_changes", combined_change_flag)
        
#         # Separate changed vs unchanged patients
#         changed_patients = existing_patients.filter(F.col("has_changes") == True)
#         unchanged_patients = existing_patients.filter(F.col("has_changes") == False)
#         new_patients = bronze_df.alias("bronze").join(
#             current_silver.alias("silver"), "patient_id", "left_anti"
#         )
        
#         print(f"Changed patients: {changed_patients.count()}")
#         print(f"Unchanged patients: {unchanged_patients.count()}")  
#         print(f"New patients: {new_patients.count()}")
        
#         if changed_patients.count() > 0:
#             # Get the patient IDs that changed
#             changed_patient_ids = [row.patient_id for row in changed_patients.select("patient_id").distinct().collect()]
            
#             # Step 1: Close the old versions (set is_current = False, effective_end_date)
#             silver_tbl = DeltaTable.forName(spark, silver_table_path)
            
#             # Get the new effective_start_date for each changed patient
#             patient_new_dates = {
#                 row.patient_id: row.effective_start_date 
#                 for row in changed_patients.select("patient_id", "effective_start_date").collect()
#             }
            
#             # Update old records one by one to avoid confusion
#             for patient_id in changed_patient_ids:
#                 new_start_date = patient_new_dates[patient_id]
#                 silver_tbl.update(
#                     condition=(F.col("patient_id") == patient_id) & (F.col("is_current") == True),
#                     set={
#                         "effective_end_date": F.lit(new_start_date),
#                         "is_current": F.lit(False)
#                     }
#                 )
            
#             # Step 2: Insert the new versions (drop the helper columns first)
#             cols_to_drop = [f"{col}_changed" for col in compare_cols] + ["has_changes"]
#             new_versions = changed_patients.drop(*cols_to_drop)
            
#             new_versions.write.format("delta").mode("append").saveAsTable(silver_table_path)
#             print(f"Inserted {new_versions.count()} new versions of changed records")
        
#         # Insert completely new patients
#         if new_patients.count() > 0:
#             new_patients.write.format("delta").mode("append").saveAsTable(silver_table_path)
#             print(f"Inserted {new_patients.count()} completely new patient records")
            
#     else:
#         # First load → create Silver table
#         print("Creating new silver table...")
#         (
#             bronze_df.write.format("delta")
#             .mode("overwrite")
#             .option("overwriteSchema", "true")
#             .saveAsTable(silver_table_path)
#         )

#     print("✅ SCD2 merge for patientinfo complete.")

# # Corrected approach using two separate operations
# def run_scd2_for_patientinfo_clean(catalog, bronze_schema, silver_schema):
#     # Read and clean bronze data (same as above)
#     bronze_df = spark.read.table(f"{catalog}.{bronze_schema}.patientinfo")
    
#     # Basic cleaning (rename cols, lowercase)
#     for col in bronze_df.columns:
#         new_col = sanitize_name(col)
#         if new_col != col:
#             bronze_df = bronze_df.withColumnRenamed(col, new_col)
#     bronze_df = bronze_df.select([F.col(c).alias(c.lower()) for c in bronze_df.columns])

#     # Business columns (exclude metadata)
#     business_cols = [
#         "patient_id", "sex", "age", "country", "province", "city",
#         "infection_case", "infected_by", "contact_number",
#         "symptom_onset_date", "confirmed_date",
#         "released_date", "deceased_date", "state"
#     ]

#     # Dedup → keep latest ingestion for identical business data
#     window_spec = Window.partitionBy(*business_cols).orderBy(F.col("_ingested_at").desc())
#     bronze_df = (
#         bronze_df
#         .withColumn("rn", F.row_number().over(window_spec))
#         .filter(F.col("rn") == 1)
#         .drop("rn")
#     )
    
#     # Add SCD2 metadata
#     bronze_df = (
#         bronze_df
#         .withColumn("effective_start_date", F.col("_ingested_at"))
#         .withColumn("effective_end_date", F.lit(None).cast("timestamp"))
#         .withColumn("is_current", F.lit(True))
#     )

#     silver_table_path = f"{catalog}.{silver_schema}.patientinfo_silver"

#     if spark.catalog.tableExists(silver_table_path):
#         # STEP 1: Clear any existing duplicates first
#         print("Cleaning existing table...")
#         silver_df = spark.read.table(silver_table_path)
        
#         # Keep only the latest version of each patient (by effective_start_date)
#         window_spec_clean = Window.partitionBy("patient_id").orderBy(F.col("effective_start_date").desc())
#         cleaned_silver = (
#             silver_df
#             .withColumn("rn", F.row_number().over(window_spec_clean))
#             .filter(F.col("rn") == 1)
#             .drop("rn")
#             .withColumn("is_current", F.lit(True))
#             .withColumn("effective_end_date", F.lit(None).cast("timestamp"))
#         )
        
#         # Overwrite with cleaned data
#         cleaned_silver.write.format("delta").mode("overwrite").saveAsTable(silver_table_path)
#         print("Table cleaned - removed duplicates")
        
#         # STEP 2: Now do the SCD2 process with two operations
#         silver_tbl = DeltaTable.forName(spark, silver_table_path)
        
#         compare_cols = [c for c in business_cols if c != "patient_id"]
#         change_cond = " OR ".join([f"NOT (tgt.{c} <=> src.{c})" for c in compare_cols])

#         # Operation 1: Close old records when changes detected
#         print("Step 1: Closing old records...")
#         (
#             silver_tbl.alias("tgt")
#             .merge(bronze_df.alias("src"), "tgt.patient_id = src.patient_id AND tgt.is_current = true")
#             .whenMatchedUpdate(
#                 condition=change_cond,
#                 set={
#                     "effective_end_date": "src.effective_start_date",
#                     "is_current": "false"
#                 }
#             )
#             .execute()
#         )
        
#         # Operation 2: Insert all records (new + changed versions)
#         print("Step 2: Inserting new records...")
#         (
#             silver_tbl.alias("tgt")
#             .merge(bronze_df.alias("src"), "tgt.patient_id = src.patient_id AND tgt.is_current = true")
#             .whenNotMatchedInsertAll()
#             .execute()
#         )
        
#     else:
#         # First load
#         print("Creating new silver table...")
#         bronze_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(silver_table_path)

#     print("✅ SCD2 processing complete.")