In [0]:
# ============================================
# FULLY DYNAMIC CONFIGURATION
# Auto-discovers databases AND tables
# ============================================
#
# Purpose: build `all_database_configs`, a list holding one dict per source
# database with the keys "database_name", "bronze_path", "silver_path",
# "checkpoint_path" and "tables" (a list of table folder names).
#
# Discovery lists `source_base_path` with dbutils.fs.ls, treats every
# sub-directory as a database and every sub-directory of a database as a
# table. If anything in the discovery phase raises, the cell falls back to a
# single manually configured database.

# Base storage account and container
storage_account = "datamigrationsathya"
container = "datalake"

# Path structure configuration
layer = "bronze"  # bronze, silver, gold
source_system = "mysql"  # mysql, postgres, etc.

# Root URI of the container; every path below is derived from it so the
# account/container pair is spelled out in exactly one place.
storage_root = f"abfss://{container}@{storage_account}.dfs.core.windows.net/"

# Base path for the source system
source_base_path = f"{storage_root}{layer}/{source_system}/"

print("=" * 70)
print("AUTO-DISCOVERING DATABASES AND TABLES")
print("=" * 70)
print(f"\nSource system: {source_system}")
print(f"Base path: {source_base_path}")

# ============================================
# AUTO-DISCOVER ALL DATABASES
# ============================================

all_database_configs = []

try:
    # List every entry directly under the source-system base path
    database_folders = dbutils.fs.ls(source_base_path)

    # Only directories are databases. Filtering up front keeps the count
    # printed below in line with what the loop actually visits (stray files
    # in the base path used to inflate it).
    database_dirs = [entry for entry in database_folders if entry.isDir()]

    print(f"\nFound {len(database_dirs)} database(s):\n")

    for db_folder in database_dirs:
        database_name = db_folder.name.rstrip('/')

        # Construct paths for this database
        bronze_path = f"{source_base_path}{database_name}/"
        silver_path = f"{storage_root}silver/{source_system}/{database_name}/"
        checkpoint_path = f"{storage_root}checkpoints/bronze_to_silver/{source_system}/{database_name}/"

        # Auto-discover tables in this database. A failure here is reported
        # and only skips this database; the remaining ones are still scanned.
        try:
            table_folders = dbutils.fs.ls(bronze_path)
            tables = [t.name.rstrip('/') for t in table_folders if t.isDir()]

            if tables:  # Only add if tables exist
                all_database_configs.append({
                    "database_name": database_name,
                    "bronze_path": bronze_path,
                    "silver_path": silver_path,
                    "checkpoint_path": checkpoint_path,
                    "tables": tables
                })

                print(f"  ‚úì {database_name}: {len(tables)} table(s)")
                for table in tables:
                    print(f"      - {table}")
                print()
            else:
                print(f"  ‚ö† {database_name}: No tables found (skipping)\n")

        except Exception as e:
            print(f"  ‚úó {database_name}: Error reading tables - {str(e)}\n")

    if not all_database_configs:
        print("\n‚ö†Ô∏è  No databases with tables found!")
        print("Please check your bronze layer structure.")
    else:
        print("=" * 70)
        print(f"SUMMARY: {len(all_database_configs)} database(s) ready to process")
        total_tables = sum(len(config['tables']) for config in all_database_configs)
        print(f"Total tables across all databases: {total_tables}")
        print("=" * 70)

except Exception as e:
    print(f"\n‚ùå Error discovering databases: {str(e)}")
    print("\nFalling back to manual configuration...")

    # Fallback: Manual configuration
    database_name = "retail_db"
    bronze_base_path = f"{source_base_path}{database_name}/"
    silver_base_path = f"{storage_root}silver/{source_system}/{database_name}/"
    checkpoint_base_path = f"{storage_root}checkpoints/bronze_to_silver/{source_system}/{database_name}/"

    # Auto-discover tables
    try:
        folders = dbutils.fs.ls(bronze_base_path)
        tables_to_process = [folder.name.rstrip('/') for folder in folders if folder.isDir()]
        print(f"Using manual database: {database_name}")
        print(f"Found {len(tables_to_process)} tables: {tables_to_process}")
    except Exception as list_error:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still stop the cell, and report the cause instead of
        # hiding it behind the hard-coded table list.
        print(f"Could not list tables for {database_name}: {str(list_error)}")
        tables_to_process = ["customer_details"]
        print(f"Using fallback table list: {tables_to_process}")

    # Create single database config for backward compatibility
    all_database_configs = [{
        "database_name": database_name,
        "bronze_path": bronze_base_path,
        "silver_path": silver_base_path,
        "checkpoint_path": checkpoint_base_path,
        "tables": tables_to_process
    }]

print("\n‚úÖ Configuration complete!")
print("\nNote: Auto Loader will recursively process all files in subdirectories (e.g., load_date partitions)")

In [0]:
# Verification: walk every configured database/table and report what is
# readable from the silver layer (record count, plus the number of distinct
# load dates when the table has a `load_date` column).

separator = "=" * 70

print(separator)
print("SILVER LAYER VERIFICATION")
print(separator)

# The membership test runs first so a missing variable never raises NameError.
configs_ready = 'all_database_configs' in dir() and bool(all_database_configs)

if configs_ready:
    total_processed = 0
    total_records = 0

    for db_config in all_database_configs:
        # Pull the three fields in a fixed order before printing anything.
        db_name, silver_root, table_names = (
            db_config[key] for key in ('database_name', 'silver_path', 'tables')
        )

        print(f"\nüìä Database: {db_name}")
        print(separator)

        for table_name in table_names:
            try:
                # Read from silver layer
                silver_df = spark.read.format("delta").load(f"{silver_root}{table_name}/")
                record_count = silver_df.count()

                if "load_date" not in silver_df.columns:
                    print(f"  ‚úì {table_name}: {record_count:,} records")
                else:
                    # Column present: also report how many distinct load dates exist
                    load_date_count = silver_df.select("load_date").distinct().count()
                    print(f"  ‚úì {table_name}: {record_count:,} records, {load_date_count} load date(s)")

                # Only tables that were read and counted contribute to the totals
                total_processed += 1
                total_records += record_count

            except Exception as e:
                # The error text is cut to 50 characters to keep one line per table
                print(f"  ‚úó {table_name}: Not processed or error - {str(e)[:50]}")

        print()

    summary_lines = (
        separator,
        "SUMMARY",
        separator,
        f"Databases: {len(all_database_configs)}",
        f"Tables processed: {total_processed}",
        f"Total records: {total_records:,}",
        separator,
    )
    for summary_line in summary_lines:
        print(summary_line)
else:
    print("\n‚ùå Please run Cell 1 first!")

In [0]:
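# # DISABLED CELL: single-table smoke test to verify the bronze -> silver setup works.
# # Everything below is commented out; uncomment the whole cell to run it.
# # It expects `all_database_configs` from the configuration cell to be populated.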

# from pyspark.sql.functions import current_timestamp, input_file_name, lit

# # Use first database and first table for testing
# test_config = all_database_configs[0]
# test_db = test_config['database_name']
# test_table = test_config['tables'][0]

# print(f"üìä Testing with: {test_db}/{test_table}")
# print("=" * 70)

# bronze_path = f"{test_config['bronze_path']}{test_table}/"
# silver_path = f"{test_config['silver_path']}{test_table}/"
# checkpoint_path = f"{test_config['checkpoint_path']}{test_table}/"

# print(f"\nBronze: {bronze_path}")
# print(f"Silver: {silver_path}")
# print(f"Checkpoint: {checkpoint_path}")

# # Check bronze layer has data
# print(f"\nüîç Checking bronze layer...")
# try:
#     bronze_files = dbutils.fs.ls(bronze_path)
#     print(f"   ‚úì Found {len(bronze_files)} item(s) in bronze layer")
#     for item in bronze_files[:3]:
#         print(f"     - {item.name}")
# except Exception as e:
#     print(f"   ‚ùå Error: {str(e)}")
#     raise

# # Process with Auto Loader
# print(f"\n‚è≥ Processing with Auto Loader...")

# try:
#     df_stream = (spark.readStream
#         .format("cloudFiles")
#         .option("cloudFiles.format", "parquet")
#         .option("cloudFiles.schemaLocation", f"{checkpoint_path}schema")
#         .option("cloudFiles.inferColumnTypes", "true")
#         .option("recursiveFileLookup", "true")
#         .load(bronze_path)
#     )
    
#     print(f"   ‚úì Stream created")
    
#     # Add metadata
#     df_enriched = (df_stream
#         .withColumn("processing_timestamp", current_timestamp())
#         .withColumn("source_file", input_file_name())
#         .withColumn("source_table", lit(test_table))
#         .withColumn("source_database", lit(test_db))
#     )
    
#     print(f"   ‚úì Metadata columns added")
    
#     # Write to silver
#     query = (df_enriched.writeStream
#         .format("delta")
#         .option("checkpointLocation", f"{checkpoint_path}checkpoint")
#         .option("mergeSchema", "true")
#         .outputMode("append")
#         .trigger(availableNow=True)
#         .start(silver_path)
#     )
    
#     print(f"   ‚úì Write stream started")
#     print(f"   ‚è≥ Waiting for completion...")
    
#     query.awaitTermination()
    
#     print(f"\n‚úÖ SUCCESS! Processed {test_db}/{test_table}")
    
#     # Verify silver layer
#     print(f"\nüîç Verifying silver layer...")
#     df_silver = spark.read.format("delta").load(silver_path)
#     count = df_silver.count()
#     print(f"   ‚úì Silver layer has {count:,} records")
    
#     print(f"\nüéâ Test successful! The pipeline is working correctly.")
    
# except Exception as e:
#     print(f"\n‚ùå ERROR: {str(e)}")
#     import traceback
#     traceback.print_exc()