In [0]:
# NOTE: the wildcard import stays first so the explicit imports below win on any name clash.
from pyspark.sql.functions import *
from datetime import datetime
import json
import logging

In [0]:
# Configure the root logger: INFO and above, with a timestamped record layout.
_LOG_LAYOUT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_LAYOUT, level=logging.INFO)

# Logger used by every cell in this notebook.
logger = logging.getLogger(__name__)

In [0]:
# Declare the notebook parameters as text widgets (set by DAB), then read them back.
# Each entry is (widget name, default value, label).
for _widget_name, _widget_default, _widget_label in (
    ("environment", "dev", "Environment"),
    ("catalog_name", "cargo_fleet_dev", "Catalog Name"),
):
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

# Resolved values; the cells below interpolate them into SQL and log messages.
environment, catalog_name = (
    dbutils.widgets.get(key) for key in ("environment", "catalog_name")
)

logger.info(f"Setting up Unity Catalog for environment: {environment}")
logger.info(f"Catalog name: {catalog_name}")

### Create Catalog

In [0]:
# Create the environment catalog and its bronze/silver/gold schemas.
# Every CREATE uses IF NOT EXISTS, so objects that already exist are left in place.
# Any failure is logged and re-raised so the notebook run stops.
try:
    # Create catalog
    spark.sql(f"""
        CREATE CATALOG IF NOT EXISTS {catalog_name}
        COMMENT 'Cargo fleet management catalog for {environment} environment'
    """)
    logger.info(f"✓ Created catalog: {catalog_name}")
    
    # Make the new catalog the session default for any unqualified names used later
    spark.sql(f"USE CATALOG {catalog_name}")
    
    # Create schemas with comments
    schemas = {
        "bronze": "Raw data landing zone - preserves original data",
        "silver": "Curated data layer - cleaned, validated, deduplicated",
        "gold": "Business aggregations - optimized for analytics"
    }
    
    for schema, comment in schemas.items():
        # Qualify the schema with the catalog so the statement does not depend on
        # the session's current catalog (matches the volume and table cells).
        spark.sql(f"""
            CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema}
            COMMENT '{comment}'
        """)
        logger.info(f"✓ Created schema: {catalog_name}.{schema}")
        
except Exception as e:
    logger.error(f"Failed to create catalog/schemas: {str(e)}")
    raise

### Create Volumes for File Storage

In [0]:
# Create one volume per landing/checkpoint/schema area inside <catalog>.bronze.
# Failures are logged and re-raised so the notebook run stops.
try:
    # Volume name -> comment stored on the volume
    volumes = dict(
        telemetry_landing="Landing zone for streaming ship telemetry data",
        manifest_landing="Landing zone for batch cargo manifest files",
        status_landing="Landing zone for container status changes",
        maintenance_landing="Landing zone for ship maintenance records",
        checkpoints="Checkpoint storage for streaming queries",
        schemas="Schema storage for evolution tracking",
    )

    for volume_name in volumes:
        comment = volumes[volume_name]
        volume_fqn = f"{catalog_name}.bronze.{volume_name}"
        create_volume_sql = f"""
            CREATE VOLUME IF NOT EXISTS {volume_fqn}
            COMMENT '{comment}'
        """
        spark.sql(create_volume_sql)
        logger.info(f"✓ Created volume: {volume_fqn}")

except Exception as e:
    logger.error(f"Failed to create volumes: {e!s}")
    raise

### Create Utility Tables

In [0]:
# Create the two utility tables: bronze.pipeline_logs and gold.data_quality_metrics.
# Failures are logged and re-raised so the notebook run stops.
try:
    # Centralized pipeline log records
    pipeline_logs_ddl = f"""
        CREATE TABLE IF NOT EXISTS {catalog_name}.bronze.pipeline_logs (
            log_id STRING,
            pipeline_name STRING,
            log_level STRING,
            message STRING,
            details STRING,
            timestamp TIMESTAMP,
            environment STRING
        )
        COMMENT 'Centralized logging for all pipelines'
    """

    # Data quality check results
    data_quality_ddl = f"""
        CREATE TABLE IF NOT EXISTS {catalog_name}.gold.data_quality_metrics (
            check_id STRING,
            table_name STRING,
            check_type STRING,
            check_result STRING,
            metric_value DOUBLE,
            threshold DOUBLE,
            check_timestamp TIMESTAMP,
            environment STRING
        )
        COMMENT 'Data quality monitoring metrics'
    """

    # Run the statements in the same order as they are defined above
    for ddl_statement in (pipeline_logs_ddl, data_quality_ddl):
        spark.sql(ddl_statement)

    logger.info("✓ Created utility tables for logging and monitoring")

except Exception as e:
    logger.error(f"Failed to create utility tables: {e!s}")
    raise

### Enable Delta Features

In [0]:
# Record the Delta features intended for the pipeline tables.
# NOTE: this cell runs no SQL and changes no table properties; it only logs
# the plan. The tables listed here are not created by the cells visible in
# this notebook, so the properties must be applied wherever they are created.
try:
    # Tables that should have Change Data Feed enabled
    cdf_tables = [
        f"{catalog_name}.bronze.container_status_raw",
        f"{catalog_name}.silver.container_tracking_cdc"
    ]
    
    for table in cdf_tables:
        # Will be enabled when tables are created
        logger.info(f"CDF will be enabled for: {table}")
    
    # Table properties intended for write optimization
    optimization_settings = {
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true"
    }
    
    # Log the planned properties so the dictionary is visible in the run
    # output instead of being silently unused.
    for property_name, property_value in optimization_settings.items():
        logger.info(f"Planned optimization property (not applied here): {property_name}={property_value}")
    
    # Say what actually happened: settings were recorded, not applied.
    logger.info("✓ Delta feature settings recorded (no table properties changed by this notebook)")
    
except Exception as e:
    logger.error(f"Failed to configure Delta features: {str(e)}")
    raise

### Verify Setup

In [0]:
# Display catalog structure: schemas, bronze volumes, then tables per schema.
print(f"\n{'='*60}")
print(f"CATALOG STRUCTURE FOR: {catalog_name}")
print(f"{'='*60}\n")

# Show schemas
schemas_df = spark.sql(f"SHOW SCHEMAS IN {catalog_name}")
display(schemas_df)

# Show volumes
print("\nVolumes in bronze schema:")
volumes_df = spark.sql(f"SHOW VOLUMES IN {catalog_name}.bronze")
display(volumes_df)

# Show tables
for schema in ["bronze", "silver", "gold"]:
    print(f"\nTables in {schema} schema:")
    try:
        tables_df = spark.sql(f"SHOW TABLES IN {catalog_name}.{schema}")
        display(tables_df)
    except Exception as e:
        # Best-effort listing: keep going with the remaining schemas, but report
        # the real failure. A bare `except:` would also trap KeyboardInterrupt
        # and would label every error (missing schema, permissions, ...) as
        # "no tables yet".
        logger.warning(f"Could not list tables in {catalog_name}.{schema}: {e}")
        print(f"Could not list tables in {schema}")

logger.info("✓ Unity Catalog setup completed successfully")

In [0]:
# Configuration dictionary for other notebooks: catalog and environment names,
# fully qualified schema names, volume paths and the time this setup ran.

# Config key -> volume directory name under /Volumes/<catalog>/bronze
_volume_dirs = {
    "telemetry": "telemetry_landing",
    "manifest": "manifest_landing",
    "status": "status_landing",
    "maintenance": "maintenance_landing",
    "checkpoints": "checkpoints",
    "schemas": "schemas",
}

config = {
    "catalog_name": catalog_name,
    "environment": environment,
    # Expands to bronze_schema / silver_schema / gold_schema, in that order
    **{
        f"{layer}_schema": f"{catalog_name}.{layer}"
        for layer in ("bronze", "silver", "gold")
    },
    "volumes": {
        key: f"/Volumes/{catalog_name}/bronze/{directory}"
        for key, directory in _volume_dirs.items()
    },
    "setup_timestamp": datetime.now().isoformat(),
}

In [0]:
# Return the configuration to the calling job/notebook as a JSON string.
# str(config) would emit a Python repr (single-quoted keys) that json.loads
# rejects. The payload holds only strings and a nested dict of strings, so the
# JSON form can still be read with ast.literal_eval by callers that did so.
dbutils.notebook.exit(json.dumps(config))