In [3]:
# Databricks notebook source
# MAGIC %md
# MAGIC # PHASE 1 - STEP 1: ADD AUDIT COLUMNS TO PERSON TABLE
# MAGIC 
# MAGIC **Objective:** Add watermark columns for incremental load pattern
# MAGIC 
# MAGIC **Approach:** Spark-native column addition (no SQL `ALTER TABLE ADD COLUMNS` - Fabric compatible); the final table swap uses `ALTER TABLE ... RENAME TO`
# MAGIC 
# MAGIC **Safety:**
# MAGIC - ✅ Non-destructive: Reads existing data, adds columns, rewrites
# MAGIC - ✅ DAMA compliant: Preserves all 15.7M rows
# MAGIC - ✅ Audit trail: Logs all operations
# MAGIC - ✅ Rollback: Original table backed up before change
# MAGIC 
# MAGIC **Estimated Time:** 3-5 minutes for 15.7M rows

# COMMAND ----------

# MAGIC %md
# MAGIC ## CONFIGURATION

# COMMAND ----------

import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime
import time

# Table names shared by every step of this notebook
SOURCE_TABLE = "Lake24.dbo.person"                 # table that receives the audit columns
BACKUP_TABLE = "Lake24.dbo.person_backup_phase1"  # Safety backup
TEMP_TABLE = "Lake24.dbo.person_temp_phase1"       # Temporary work table

# Startup banner: which tables are involved and which Spark build is running
for banner_line in (
    "=" * 80,
    "PHASE 1 - STEP 1: ADD AUDIT COLUMNS",
    "=" * 80,
    f"Source Table: {SOURCE_TABLE}",
    f"Backup Table: {BACKUP_TABLE}",
    f"Spark Version: {spark.version}",
    "=" * 80,
):
    print(banner_line)

# COMMAND ----------

# MAGIC %md
# MAGIC ## PRE-FLIGHT CHECKS

# COMMAND ----------

def preflight_checks():
    """
    Inspect the source table before any schema change is attempted.

    Checks performed, in order: the source table exists, its row count and
    column names can be read, which audit columns are already present,
    whether the backup table already exists, and a rough duration estimate
    based on ~100K rows/second.

    Returns: (success: bool, current_count: int, current_columns: list of str)
             Any failure yields (False, 0, []).
    """
    failure = (False, 0, [])
    rule = "-" * 80

    print("\nüîç PRE-FLIGHT CHECKS:")
    print(rule)

    try:
        # 1. The source table has to be there
        if not spark.catalog.tableExists(SOURCE_TABLE):
            print(f"‚ùå ERROR: Table {SOURCE_TABLE} does not exist!")
            return failure
        print(f"‚úÖ Source table exists: {SOURCE_TABLE}")

        # 2. Row count of the table as it stands now
        person_df = spark.table(SOURCE_TABLE)
        current_count = person_df.count()
        print(f"‚úÖ Current record count: {current_count:,}")

        # 3. Column names, taken from the table schema
        current_columns = [field.name for field in person_df.schema.fields]
        print(f"‚úÖ Current column count: {len(current_columns)}")

        # 4. Which audit columns are already in place?
        audit_columns = ['created_timestamp', 'updated_timestamp', 'is_deleted']
        existing_audit = [name for name in audit_columns if name in current_columns]

        if not existing_audit:
            print(f"‚úÖ No audit columns exist yet (will add: {audit_columns})")
        else:
            print(f"‚ö†Ô∏è  WARNING: Some audit columns already exist: {existing_audit}")
            print(f"   This script will preserve existing values.")

        # 5. Warn when a previous backup is about to be replaced
        backup_present = spark.catalog.tableExists(BACKUP_TABLE)
        if backup_present:
            print(f"‚ö†Ô∏è  WARNING: Backup table already exists: {BACKUP_TABLE}")
            print(f"   Will be overwritten with current data.")

        # 6. Rough duration estimate
        estimated_time = current_count / 100000  # ~100K rows/second estimate
        print(f"üìä Estimated processing time: {estimated_time:.1f} seconds ({estimated_time/60:.1f} minutes)")

        print(rule)
        print("‚úÖ ALL PRE-FLIGHT CHECKS PASSED")
        print("=" * 80)

        return True, current_count, current_columns

    except Exception as e:
        print(f"‚ùå PRE-FLIGHT CHECK FAILED: {str(e)}")
        return failure

# Execute the pre-flight checks and keep their results for the later steps
preflight_result = preflight_checks()
checks_passed, record_count, existing_columns = preflight_result

# Halt the notebook as soon as any check has failed
if not checks_passed:
    raise Exception("Pre-flight checks failed. Aborting.")

# COMMAND ----------

# MAGIC %md
# MAGIC ## STEP 1.1: CREATE BACKUP (SAFETY)

# COMMAND ----------

def create_backup():
    """
    Create a verified backup of the original table before modification.

    Any existing backup table is dropped, the source table is written to
    BACKUP_TABLE as a Delta table, and the backup's row count is compared
    with the source's row count. A mismatch is treated as a failed backup.

    DAMA Best Practice: Always backup before schema changes

    Returns:
        bool: True when the backup was written and its row count equals the
        source row count; False when any step raised or the counts differ.
    """
    print("\nüì¶ CREATING BACKUP:")
    print("-" * 80)
    
    start_time = time.time()
    
    try:
        # Drop backup if exists (overwrite with fresh backup)
        if spark.catalog.tableExists(BACKUP_TABLE):
            print(f"   Dropping existing backup: {BACKUP_TABLE}")
            spark.sql(f"DROP TABLE IF EXISTS {BACKUP_TABLE}")
        
        # Create backup (exact copy)
        print(f"   Creating backup: {BACKUP_TABLE}")
        source_df = spark.table(SOURCE_TABLE)
        # Counted here so the backup can be checked against it below
        source_count = source_df.count()
        
        source_df.write \
            .format("delta") \
            .mode("overwrite") \
            .saveAsTable(BACKUP_TABLE)
        
        # Verify backup: a count alone proves nothing, it must match the source
        backup_count = spark.table(BACKUP_TABLE).count()
        if backup_count != source_count:
            raise Exception(
                f"Backup count mismatch! Source: {source_count:,}, Backup: {backup_count:,}"
            )
        
        duration = time.time() - start_time
        print(f"‚úÖ Backup created successfully")
        print(f"   Records backed up: {backup_count:,}")
        print(f"   Duration: {duration:.2f} seconds")
        print(f"   Location: {BACKUP_TABLE}")
        print("-" * 80)
        
        return True
        
    except Exception as e:
        # Also reached on a count mismatch, so callers abort on a partial backup
        print(f"‚ùå BACKUP FAILED: {str(e)}")
        print(f"   ABORTING: Cannot proceed without backup")
        return False

# Take the safety backup before anything is changed
backup_success = create_backup()

# No backup, no schema change: stop the notebook here
if not backup_success:
    raise Exception("Backup creation failed. Aborting for safety.")

# COMMAND ----------

# MAGIC %md
# MAGIC ## STEP 1.2: ADD AUDIT COLUMNS

# COMMAND ----------

def add_audit_columns():
    """
    Build a copy of the person table with audit columns in TEMP_TABLE.

    Method:
    1. Read existing table
    2. Add each missing audit column with its default
    3. Write the result to the temporary table (dropped first if present)
    4. Check that the temporary table has as many rows as the source

    The source table itself is not modified here; replacing it with the
    temporary table is a separate step.

    Columns Added:
    - created_timestamp: When record was first created
    - updated_timestamp: When record was last modified
    - is_deleted: Soft delete flag

    Default Values:
    - Existing records: NULL for timestamps (unknown), False for is_deleted
    - Columns that already exist are left untouched

    Returns:
        (success: bool, temp_count: int) - (True, rows written) on success,
        (False, 0) when any step raised or the row counts differ. On failure
        the temporary table is dropped on a best-effort basis.
    """
    print("\nüîß ADDING AUDIT COLUMNS:")
    print("-" * 80)
    
    start_time = time.time()
    
    try:
        # 1. Read existing table
        print("   Step 1: Reading source table...")
        source_df = spark.table(SOURCE_TABLE)
        original_count = source_df.count()
        print(f"   ‚úÖ Read {original_count:,} records")
        
        # 2. Add audit columns
        print("   Step 2: Adding audit columns...")
        
        existing_cols = source_df.columns
        # Compare case-insensitively: with Spark's default (case-insensitive)
        # resolution, withColumn on a differently-cased existing name would
        # replace that column's values instead of adding a new column.
        existing_lower = {name.lower() for name in existing_cols}
        
        # (column name, default for existing rows, description printed when added)
        audit_specs = [
            ("created_timestamp", F.lit(None).cast(TimestampType()), "TIMESTAMP, NULL for existing"),
            ("updated_timestamp", F.lit(None).cast(TimestampType()), "TIMESTAMP, NULL for existing"),
            ("is_deleted", F.lit(False).cast(BooleanType()), "BOOLEAN, False for existing"),
        ]
        for col_name, default_value, description in audit_specs:
            if col_name.lower() in existing_lower:
                print(f"   ‚ö†Ô∏è  Skipped: {col_name} (already exists)")
                continue
            source_df = source_df.withColumn(col_name, default_value)
            print(f"   ‚úÖ Added: {col_name} ({description})")
        
        # 3. Verify schema (compare against the columns read above, not a
        #    module-level variable set by an earlier cell)
        new_columns = source_df.columns
        print(f"   ‚úÖ New schema has {len(new_columns)} columns (was {len(existing_cols)})")
        
        # 4. Write to temporary table
        print("   Step 3: Writing to temporary table...")
        
        # Drop temp table if exists
        if spark.catalog.tableExists(TEMP_TABLE):
            spark.sql(f"DROP TABLE IF EXISTS {TEMP_TABLE}")
        
        source_df.write \
            .format("delta") \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(TEMP_TABLE)
        
        # Verify temp table
        temp_count = spark.table(TEMP_TABLE).count()
        print(f"   ‚úÖ Wrote {temp_count:,} records to temp table")
        
        # 5. Verify record count matches
        if temp_count != original_count:
            raise Exception(f"Record count mismatch! Original: {original_count:,}, Temp: {temp_count:,}")
        print(f"   ‚úÖ Record count verified: {temp_count:,} = {original_count:,}")
        
        duration = time.time() - start_time
        print(f"‚úÖ Audit columns added successfully")
        print(f"   Duration: {duration:.2f} seconds")
        print("-" * 80)
        
        return True, temp_count
        
    except Exception as e:
        print(f"‚ùå ADDING COLUMNS FAILED: {str(e)}")
        print(f"   Rolling back...")
        
        # Cleanup temp table (best effort: report a failure, never raise)
        try:
            if spark.catalog.tableExists(TEMP_TABLE):
                spark.sql(f"DROP TABLE IF EXISTS {TEMP_TABLE}")
                print("   ‚úÖ Cleaned up temporary table")
        except Exception as cleanup_error:
            print(f"   ‚ö†Ô∏è  Could not clean up temporary table: {cleanup_error}")
        
        return False, 0

# Build the temp table that carries the audit columns
add_result = add_audit_columns()
add_success, new_count = add_result

if not add_success:
    # The source table was never written to, and the backup is in place
    print("\nüîÑ ROLLBACK: Restoring from backup...")
    print("‚úÖ Original table unchanged. Backup available at:", BACKUP_TABLE)
    raise Exception("Adding columns failed. Original table is safe.")

# COMMAND ----------

# MAGIC %md
# MAGIC ## STEP 1.3: ATOMIC TABLE SWAP

# COMMAND ----------

def swap_tables():
    """
    Replace the source table with the temp table using two renames.

    Strategy:
    1. Rename source -> person_old
    2. Rename temp -> source
    3. Compare the new source's row count with person_old
    4. Drop person_old only when the counts match

    The renames are separate SQL statements, so the swap as a whole is not
    atomic: if step 2 fails, the source name does not exist until someone
    renames person_old back. On any failure person_old is left in place.

    Returns:
        bool: True when both renames succeeded, the row counts matched and
        person_old was dropped; False when any step raised or the counts
        differ (the current table locations are printed for manual recovery).
    """
    old_table = "Lake24.dbo.person_old"
    
    print("\nüîÑ ATOMIC TABLE SWAP:")
    print("-" * 80)
    
    try:
        # 1. Rename source to old
        print(f"   Step 1: Renaming {SOURCE_TABLE} ‚Üí person_old...")
        spark.sql(f"ALTER TABLE {SOURCE_TABLE} RENAME TO {old_table}")
        print("   ‚úÖ Source renamed to person_old")
        
        # 2. Rename temp to source
        print(f"   Step 2: Renaming {TEMP_TABLE} ‚Üí {SOURCE_TABLE}...")
        spark.sql(f"ALTER TABLE {TEMP_TABLE} RENAME TO {SOURCE_TABLE}")
        print(f"   ‚úÖ Temp renamed to {SOURCE_TABLE}")
        
        # 3. Verify new table against the original before anything is dropped
        new_count = spark.table(SOURCE_TABLE).count()
        old_count = spark.table(old_table).count()
        if new_count != old_count:
            raise Exception(
                f"Record count mismatch after swap! Old: {old_count:,}, New: {new_count:,}"
            )
        print(f"   ‚úÖ Verified: {SOURCE_TABLE} has {new_count:,} records")
        
        # 4. Drop old table (only reached when the counts match)
        print(f"   Step 3: Dropping person_old...")
        spark.sql(f"DROP TABLE IF EXISTS {old_table}")
        print("   ‚úÖ Old table dropped")
        
        print("-" * 80)
        print("‚úÖ TABLE SWAP COMPLETED SUCCESSFULLY")
        print("=" * 80)
        
        return True
        
    except Exception as e:
        # Nothing is dropped or renamed back here; the state is reported as-is
        print(f"‚ùå TABLE SWAP FAILED: {str(e)}")
        print(f"   CRITICAL: Manual intervention required!")
        print(f"   Current state:")
        print(f"   - Original table may be at: Lake24.dbo.person_old")
        print(f"   - Temp table may be at: {TEMP_TABLE}")
        print(f"   - Backup available at: {BACKUP_TABLE}")
        return False

# Put the rewritten table in place of the original
swap_success = swap_tables()

if not swap_success:
    # Show the operator the rename that puts the original table back, then stop
    for recovery_line in (
        "\n‚ö†Ô∏è  MANUAL RECOVERY NEEDED:",
        "   Run: spark.sql('ALTER TABLE Lake24.dbo.person_old RENAME TO Lake24.dbo.person')",
    ):
        print(recovery_line)
    raise Exception("Table swap failed. Backup available.")

# COMMAND ----------

# MAGIC %md
# MAGIC ## STEP 1.4: POST-OPERATION VERIFICATION

# COMMAND ----------

def verify_completion():
    """
    Verify the source table after the schema change.

    Verifies:
    1. The table exists and is queryable
    2. All three audit columns are present
    3. The record count equals the pre-flight count (module-level record_count)
    4. A five-row sample can be selected and shown
    5. Audit column statistics (NULL timestamps, deleted rows)

    Reads the module-level `existing_columns` and `record_count` set by the
    pre-flight cell.

    Returns:
        bool: True when every check passed; False when the table or an audit
        column is missing, the record count differs, or any step raised.
    """
    print("\n‚úÖ POST-OPERATION VERIFICATION:")
    print("=" * 80)
    
    try:
        # 1. Check table exists
        if not spark.catalog.tableExists(SOURCE_TABLE):
            print(f"‚ùå ERROR: {SOURCE_TABLE} does not exist!")
            return False
        print(f"‚úÖ Table exists: {SOURCE_TABLE}")
        
        # 2. Get new schema
        new_df = spark.table(SOURCE_TABLE)
        new_columns = new_df.columns
        
        print(f"\nüìä SCHEMA VERIFICATION:")
        print(f"   Original columns: {len(existing_columns)}")
        print(f"   New columns: {len(new_columns)}")
        print(f"   Added: {len(new_columns) - len(existing_columns)}")
        
        # 3. Verify audit columns exist
        audit_columns = ['created_timestamp', 'updated_timestamp', 'is_deleted']
        for audit_col in audit_columns:
            if audit_col in new_columns:
                print(f"   ‚úÖ {audit_col} present")
            else:
                print(f"   ‚ùå {audit_col} MISSING!")
                return False
        
        # 4. Verify record count
        final_count = new_df.count()
        print(f"\nüìä RECORD COUNT VERIFICATION:")
        print(f"   Original: {record_count:,}")
        print(f"   Final: {final_count:,}")
        
        if final_count != record_count:
            print(f"   ‚ùå COUNT MISMATCH! Lost {record_count - final_count:,} records!")
            return False
        print(f"   ‚úÖ All {final_count:,} records preserved")
        
        # 5. Sample data check
        print(f"\nüìä SAMPLE DATA CHECK:")
        sample = new_df.select(
            "person_id", 
            "gender_concept_id",
            "created_timestamp",
            "updated_timestamp", 
            "is_deleted"
        ).limit(5)
        
        sample.show(truncate=False)
        
        # 6. Check audit column values in one pass over the table.
        #    count(when(cond, 1)) counts the rows where cond is true and
        #    yields 0 (not NULL) for an empty table.
        stats = new_df.agg(
            F.count(F.when(F.col("created_timestamp").isNull(), 1)).alias("null_created"),
            F.count(F.when(F.col("updated_timestamp").isNull(), 1)).alias("null_updated"),
            F.count(F.when(F.col("is_deleted") == F.lit(True), 1)).alias("deleted_count"),
        ).first()
        null_created = stats["null_created"]
        null_updated = stats["null_updated"]
        deleted_count = stats["deleted_count"]
        
        def pct(part):
            # An empty table would otherwise divide by zero and be reported
            # as a verification failure
            return part / final_count * 100 if final_count else 0.0
        
        print(f"\nüìä AUDIT COLUMN STATISTICS:")
        print(f"   created_timestamp = NULL: {null_created:,} ({pct(null_created):.1f}%)")
        print(f"   updated_timestamp = NULL: {null_updated:,} ({pct(null_updated):.1f}%)")
        print(f"   is_deleted = True: {deleted_count:,} ({pct(deleted_count):.1f}%)")
        
        print("\n" + "=" * 80)
        print("‚úÖ‚úÖ‚úÖ PHASE 1 - STEP 1 COMPLETED SUCCESSFULLY ‚úÖ‚úÖ‚úÖ")
        print("=" * 80)
        print(f"‚úÖ Table: {SOURCE_TABLE}")
        print(f"‚úÖ Records: {final_count:,} (preserved)")
        print(f"‚úÖ Columns: {len(new_columns)} (added {len(new_columns) - len(existing_columns)})")
        print(f"‚úÖ Backup: {BACKUP_TABLE} (available for 7 days)")
        print("=" * 80)
        
        return True
        
    except Exception as e:
        print(f"‚ùå VERIFICATION FAILED: {str(e)}")
        return False

# Check the swapped table; a failed check is reported but does not stop the notebook
verification_passed = verify_completion()

if not verification_passed:
    for warning_line in (
        "\n‚ö†Ô∏è  VERIFICATION FAILED BUT TABLE UPDATED",
        "   Table may be in inconsistent state",
        "   Backup available for recovery",
    ):
        print(warning_line)

# COMMAND ----------

# MAGIC %md
# MAGIC ## CLEANUP & SUMMARY

# COMMAND ----------

print("\nüßπ CLEANUP:")
print("-" * 80)

# Cleanup temp table (if still exists)
try:
    if spark.catalog.tableExists(TEMP_TABLE):
        spark.sql(f"DROP TABLE IF EXISTS {TEMP_TABLE}")
        print(f"‚úÖ Cleaned up: {TEMP_TABLE}")
except:
    pass

print("\nüìã SUMMARY:")
print("=" * 80)
print("PHASE 1 - STEP 1: ADD AUDIT COLUMNS")
print("=" * 80)
print(f"Status: {'‚úÖ SUCCESS' if verification_passed else '‚ùå FAILED'}")
print(f"Table: {SOURCE_TABLE}")
print(f"Records: {record_count:,} ‚Üí {spark.table(SOURCE_TABLE).count():,}")
print(f"Columns Added:")
print(f"  - created_timestamp (TIMESTAMP)")
print(f"  - updated_timestamp (TIMESTAMP)")
print(f"  - is_deleted (BOOLEAN)")
print(f"\nBackup Location: {BACKUP_TABLE}")
print(f"Retention: Keep for 7 days")
print("=" * 80)

print("\nüìå NEXT STEPS:")
print("1. ‚úÖ Audit columns added to person table")
print("2. ‚è≠Ô∏è  Update synthetic data generator (Step 2)")
print("3. ‚è≠Ô∏è  Create ETL control table (Step 3)")
print("4. ‚è≠Ô∏è  Modify ETL for incremental load (Step 4)")

# COMMAND ----------

# MAGIC %md
# MAGIC ---
# MAGIC ## ✅ PHASE 1 - STEP 1 COMPLETE
# MAGIC 
# MAGIC **What Was Done:**
# MAGIC - ✅ Added 3 audit columns to person table
# MAGIC - ✅ Preserved all 15.7M records
# MAGIC - ✅ Created backup (safety)
# MAGIC - ✅ Atomic table swap (no downtime)
# MAGIC - ✅ Verified integrity
# MAGIC 
# MAGIC **Rollback (if needed):**
# MAGIC ```python
# MAGIC # Restore from backup
# MAGIC spark.sql(f"DROP TABLE IF EXISTS {SOURCE_TABLE}")
# MAGIC spark.sql(f"ALTER TABLE {BACKUP_TABLE} RENAME TO {SOURCE_TABLE}")
# MAGIC ```
# MAGIC 
# MAGIC **Impact on Your ETL v3.2:**
# MAGIC - Your SchemaInspector will detect new columns
# MAGIC - Will use ALTER TABLE ADD COLUMNS on downstream tables
# MAGIC - No FORCE_RECREATE needed (new columns compatible)
# MAGIC - Next ETL run will auto-evolve bronze/silver/gold/dim

StatementMeta(, 23c0b5c7-6d7d-4afa-8adc-b5d549b56141, 5, Finished, Available, Finished, False)

PHASE 1 - STEP 1: ADD AUDIT COLUMNS
Source Table: Lake24.dbo.person
Backup Table: Lake24.dbo.person_backup_phase1
Spark Version: 3.5.5.5.4.20260109.1

üîç PRE-FLIGHT CHECKS:
--------------------------------------------------------------------------------
‚úÖ Source table exists: Lake24.dbo.person
‚úÖ Current record count: 15,712,818
‚úÖ Current column count: 18
‚úÖ No audit columns exist yet (will add: ['created_timestamp', 'updated_timestamp', 'is_deleted'])
üìä Estimated processing time: 157.1 seconds (2.6 minutes)
--------------------------------------------------------------------------------
‚úÖ ALL PRE-FLIGHT CHECKS PASSED

üì¶ CREATING BACKUP:
--------------------------------------------------------------------------------
   Creating backup: Lake24.dbo.person_backup_phase1
‚úÖ Backup created successfully
   Records backed up: 15,712,818
   Duration: 23.08 seconds
   Location: Lake24.dbo.person_backup_phase1
--------------------------------------------------------------------