# Bronze Layer: Ship Telemetry with Auto Loader

**Ingestion Pattern**: Auto Loader (cloudFiles)

**Features**:
- Automatic schema inference and evolution
- Scalable to millions of files
- Exactly-once processing with checkpoints
- Built-in error handling

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import logging

In [0]:
# Make the shared helper modules importable before importing from them.
# NOTE(review): "../utils" is a relative path, so it resolves against the
# current working directory -- confirm that is this notebook's folder in the
# target runtime.
import sys
sys.path.append("../utils")
from logging_utils import get_logger
from error_handlers import handle_streaming_error

# Notebook-wide logger; the cells below report progress and failures through it.
logger = get_logger("bronze_telemetry_autoloader")

In [0]:
# Notebook parameters, exposed as text widgets.
# An empty location widget means "fall back to the default path under the
# catalog's bronze volume".
dbutils.widgets.text("catalog_name", "cargo_fleet_dev", "Catalog")
dbutils.widgets.text("checkpoint_location", "", "Checkpoint Location")
dbutils.widgets.text("schema_location", "", "Schema Location")

catalog_name = dbutils.widgets.get("catalog_name")

# Checkpoint directory handed to the streaming writer.
checkpoint_location = dbutils.widgets.get("checkpoint_location")
if not checkpoint_location:
    checkpoint_location = (
        f"/Volumes/{catalog_name}/bronze/checkpoints/telemetry_autoloader"
    )

# Directory handed to Auto Loader as cloudFiles.schemaLocation.
schema_location = dbutils.widgets.get("schema_location")
if not schema_location:
    schema_location = f"/Volumes/{catalog_name}/bronze/schemas/telemetry"

logger.info(f"Starting Auto Loader ingestion for catalog: {catalog_name}")

In [0]:
# Expected shape of one telemetry record.
# NOTE: the Auto Loader reader in this notebook does not pass this schema to
# .schema(...); column types are inferred by Auto Loader instead, so this
# definition currently serves as a reference for the expected columns.
telemetry_schema = (
    StructType()
    # Identity and event time are the only required (non-nullable) fields.
    .add("ship_id", StringType(), nullable=False)
    # Arrives as a string; converted to TimestampType in the enrichment step.
    .add("timestamp", StringType(), nullable=False)
    # Position and movement
    .add("latitude", DoubleType(), nullable=True)
    .add("longitude", DoubleType(), nullable=True)
    .add("speed_knots", DoubleType(), nullable=True)
    .add("course_degrees", DoubleType(), nullable=True)
    # Fuel and engine
    .add("fuel_level_percent", DoubleType(), nullable=True)
    .add("fuel_consumption_rate", DoubleType(), nullable=True)
    .add("engine_status", StringType(), nullable=True)
    # Environment and transmission
    .add("weather_condition", StringType(), nullable=True)
    .add("wave_height_meters", DoubleType(), nullable=True)
    .add("transmission_delay_minutes", IntegerType(), nullable=True)
)

logger.info("Schema defined for telemetry ingestion")

In [0]:
# Landing location that Auto Loader reads telemetry files from.
source_path = f"/Volumes/{catalog_name}/bronze/telemetry_landing"

try:
    # Build the streaming reader on the Auto Loader ("cloudFiles") source.
    # No explicit schema is supplied: column types are inferred and the
    # inferred schema is tracked under schema_location.
    df_telemetry_stream = (
        spark.readStream
        .format("cloudFiles")
        .options(**{
            # Input files are JSON.
            "cloudFiles.format": "json",
            "cloudFiles.schemaLocation": schema_location,
            "cloudFiles.inferColumnTypes": "true",
            # Schema evolution: pick up columns that appear later.
            "cloudFiles.schemaEvolutionMode": "addNewColumns",
            # Cap each micro-batch at 100 files.
            "cloudFiles.maxFilesPerTrigger": 100,
        })
        .load(source_path)
    )

    logger.info(f"Auto Loader configured for source: {source_path}")

except Exception as exc:
    # Log for the job run, then let the failure propagate to the caller.
    logger.error(f"Failed to configure Auto Loader: {exc!s}")
    raise

In [0]:
# Enrich the raw stream with a typed event timestamp, ingestion metadata and a
# coordinate data-quality flag. These are lazy transformations; nothing is
# read until the streaming writer starts.
try:
    df_telemetry_bronze = (
        df_telemetry_stream
        # Convert the timestamp string to TimestampType (replaces the column).
        # NOTE(review): values that cannot be parsed presumably become NULL and
        # the raw string is not kept -- confirm that is acceptable for bronze.
        .withColumn("timestamp", to_timestamp(col("timestamp")))

        # Ingestion metadata
        .withColumn("ingestion_timestamp", current_timestamp())
        # File path taken from the _metadata column (UC-compatible).
        .withColumn("source_file", col("_metadata.file_path"))
        .withColumn("ingestion_date", current_date())

        # Data quality flag: True only when BOTH coordinates are present and
        # in range. Without coalesce the flag follows SQL three-valued logic:
        # a missing coordinate yields NULL when the other one is in range but
        # False when it is out of range, and NULL rows match neither
        # `WHERE is_valid_coordinates` nor `WHERE NOT is_valid_coordinates`.
        .withColumn(
            "is_valid_coordinates",
            coalesce(
                col("latitude").between(-90, 90)
                & col("longitude").between(-180, 180),
                lit(False),
            ),
        )

        # Processing metadata (same expression as ingestion_timestamp).
        .withColumn("processing_timestamp", current_timestamp())
    )

    logger.info("Ingestion metadata added successfully")

except Exception as e:
    # Log for the job run, then let the failure propagate to the caller.
    logger.error(f"Failed to add metadata: {str(e)}")
    raise


In [0]:
# Fully qualified Delta table that receives the bronze records.
bronze_table = f"{catalog_name}.bronze.ship_telemetry_raw"

try:
    # Start the append-only streaming write into the bronze table.
    query = (
        df_telemetry_bronze.writeStream
        .format("delta")
        .outputMode("append")
        .options(
            checkpointLocation=checkpoint_location,
            mergeSchema="true",  # let new columns be added to the table
        )
        # One partition per ingestion day.
        .partitionBy("ingestion_date")
        # Process what is currently available, then stop (no continuous run).
        .trigger(availableNow=True)
        .toTable(bronze_table)
    )

    logger.info("✓ Streaming query started successfully")
    logger.info(f"✓ Target table: {bronze_table}")
    logger.info(f"✓ Checkpoint: {checkpoint_location}")
    logger.info(f"✓ Query ID: {query.id}")

    # Snapshot of the query state right after start-up.
    print("\n" + "=" * 60)
    print("STREAMING QUERY STATUS")
    print("=" * 60)
    print(f"Status: {query.status}")
    print(f"Recent Progress: {query.recentProgress}")

except Exception as exc:
    # Log, hand the error to the shared streaming error handler, then
    # re-raise so the failure still propagates.
    logger.error(f"Failed to start streaming query: {exc!s}")
    handle_streaming_error(exc, query_name="telemetry_autoloader")
    raise

In [0]:
# Block until the streaming query terminates. The writer is started with
# trigger(availableNow=True), so the query stops by itself once the data
# available at start has been processed; this is a wait for completion, not a
# keep-alive for a continuously running stream.
query.awaitTermination()