## BRONZE LAYER (Delta Table Compliant)
 - Incremental ingestion (explicit Auto Loader options)
 - Partitioning by ingestion date
 - Record count tracking (an always-true DLT expectation on each table, so row counts are logged with the expectation metrics; no rows are rejected)
 - Raw schema preservation (no transformations on business columns)

In [0]:
# Bronze Sales Transactions
import dlt
from pyspark.sql import functions as F
# Store data in original/raw format
@dlt.table(
    name="bronze_sales_transactions",
    comment="Raw sales transactions ingested incrementally from CSV files",
    table_properties={"quality": "bronze"},
    partition_cols=["ingestion_date"],  # bronze data is partitioned by ingestion date
)
@dlt.expect("record_count_check", "transaction_id IS NOT NULL OR transaction_id IS NULL")
def bronze_sales_transactions():
    """Stream raw sales-transaction CSV files into the bronze table.

    Files are read through the ``cloudFiles`` streaming source with a header
    row and inferred column types. Source columns are passed through
    unchanged; only three ingestion-metadata columns are appended.

    The ``record_count_check`` predicate is true for every row (a value is
    always either NULL or not NULL), so it never flags a record.

    Returns:
        A streaming DataFrame holding the source columns plus
        ``ingestion_timestamp``, ``ingestion_date`` and ``source_system``.
    """
    autoloader_options = {
        "cloudFiles.format": "csv",
        "cloudFiles.schemaLocation": "/Volumes/main/default/_schemas/bronze_sales_transactions",
        "cloudFiles.inferColumnTypes": "true",
        "header": "true",
    }
    landing_path = "/Volumes/main/default/enterprise_sales_data/sales_transactions/"

    raw_stream = (
        spark.readStream
        .format("cloudFiles")
        .options(**autoloader_options)
        .load(landing_path)
    )

    # Ingestion metadata only: the timestamp expression is built once and
    # reused so the date column is derived from the same expression.
    ingested_at = F.current_timestamp()
    with_timestamp = raw_stream.withColumn("ingestion_timestamp", ingested_at)
    with_date = with_timestamp.withColumn("ingestion_date", F.to_date(ingested_at))
    return with_date.withColumn("source_system", F.lit("POS_SYSTEM"))


In [0]:
# Bronze Product Master
@dlt.table(
    name="bronze_product_master",
    comment="Raw product master data ingested incrementally",
    table_properties={"quality": "bronze"},
    partition_cols=["ingestion_date"],
)
@dlt.expect("record_count_check", "product_id IS NOT NULL OR product_id IS NULL")
def bronze_product_master():
    """Stream raw product-master CSV files into the bronze table.

    Files are read through the ``cloudFiles`` streaming source with a header
    row and inferred column types. Source columns are passed through
    unchanged; only three ingestion-metadata columns are appended.

    The ``record_count_check`` predicate is true for every row (a value is
    always either NULL or not NULL), so it never flags a record.

    Returns:
        A streaming DataFrame holding the source columns plus
        ``ingestion_timestamp``, ``ingestion_date`` and ``source_system``.
    """
    autoloader_options = {
        "cloudFiles.format": "csv",
        "cloudFiles.schemaLocation": "/Volumes/main/default/_schemas/bronze_product_master",
        "cloudFiles.inferColumnTypes": "true",
        "header": "true",
    }
    landing_path = "/Volumes/main/default/enterprise_sales_data/product_master/"

    raw_stream = (
        spark.readStream
        .format("cloudFiles")
        .options(**autoloader_options)
        .load(landing_path)
    )

    # Ingestion metadata only: the timestamp expression is built once and
    # reused so the date column is derived from the same expression.
    ingested_at = F.current_timestamp()
    with_timestamp = raw_stream.withColumn("ingestion_timestamp", ingested_at)
    with_date = with_timestamp.withColumn("ingestion_date", F.to_date(ingested_at))
    return with_date.withColumn("source_system", F.lit("PRODUCT_MASTER"))


In [0]:
# Bronze Store / Region Reference
@dlt.table(
    name="bronze_store_region",
    comment="Raw store and region reference data ingested incrementally",
    table_properties={"quality": "bronze"},
    partition_cols=["ingestion_date"],
)
@dlt.expect("record_count_check", "store_id IS NOT NULL OR store_id IS NULL")
def bronze_store_region():
    """Stream raw store/region reference CSV files into the bronze table.

    Files are read through the ``cloudFiles`` streaming source with a header
    row and inferred column types. Source columns are passed through
    unchanged; only three ingestion-metadata columns are appended.

    The ``record_count_check`` predicate is true for every row (a value is
    always either NULL or not NULL), so it never flags a record.

    Returns:
        A streaming DataFrame holding the source columns plus
        ``ingestion_timestamp``, ``ingestion_date`` and ``source_system``.
    """
    autoloader_options = {
        "cloudFiles.format": "csv",
        "cloudFiles.schemaLocation": "/Volumes/main/default/_schemas/bronze_store_region",
        "cloudFiles.inferColumnTypes": "true",
        "header": "true",
    }
    landing_path = "/Volumes/main/default/enterprise_sales_data/store_region/"

    raw_stream = (
        spark.readStream
        .format("cloudFiles")
        .options(**autoloader_options)
        .load(landing_path)
    )

    # Ingestion metadata only: the timestamp expression is built once and
    # reused so the date column is derived from the same expression.
    ingested_at = F.current_timestamp()
    with_timestamp = raw_stream.withColumn("ingestion_timestamp", ingested_at)
    with_date = with_timestamp.withColumn("ingestion_date", F.to_date(ingested_at))
    return with_date.withColumn("source_system", F.lit("STORE_REFERENCE"))
