In [0]:
%run ../.././start_up

In [0]:
# Notebook set-up: create the logger and pull frequently used settings out of
# `pipeline_config` into plain variables for the cells below.
# NOTE(review): `create_logger` and `pipeline_config` are not defined in this
# notebook; presumably they are provided by the `%run` start-up notebook — confirm.

# Imported here because `datetime.today()` is used below; without this the cell
# relies on `datetime` already being in scope from somewhere else.
from datetime import datetime

logger = create_logger(notebook_name="landing_to_bronze", log_level="DEBUG")
logger.info("🚀 Initializing landing_to_bronze notebook")

# Extract frequently used config values into variables
catalog = pipeline_config["catalog"]
bronze_schema = pipeline_config["schemas"]["bronze"]
bronze_path = pipeline_config["paths"]["bronze_path"]
landing_schema = pipeline_config["schemas"]["landing"]
landing_path = pipeline_config["paths"]["landing_path"]
logs_schema = pipeline_config["schemas"]["logs"]

# Date-stamped volume: one volume per run day, named bronze_<YYYYMMDD>,
# addressed under /Volumes/<catalog>/<bronze schema>/.
today_str = datetime.today().strftime('%Y%m%d')
volume_name = f"bronze_{today_str}"
volume_path = f"/Volumes/{catalog}/{bronze_schema}/{volume_name}"

logger.info("Extracted frequently used config values into variables")

In [0]:
# Ingest data from landing volume
from pyspark.sql.functions import current_timestamp
# For every entry in `tables_to_process`: load "<landing_path>/<name>.csv",
# add an `ingestion_date` column, show the frame, and overwrite the Delta table
# "<catalog>.<bronze_schema>.<name>" with it.
for file in tables_to_process:
    source_csv = f"{landing_path}/{file}.csv"
    target_table = f"{catalog}.{bronze_schema}.{file}"

    # Read the landing CSV (first row treated as the header) and stamp each row
    # with the current timestamp.
    csv_reader = spark.read.format("csv").option("header", "true")
    df = csv_reader.load(source_csv).withColumn("ingestion_date", current_timestamp())

    # Render the frame in the notebook so the load can be checked by eye.
    display(df)

    # Overwrite the bronze Delta table with the freshly loaded data.
    delta_writer = (
        df.write.format("delta")
        .option("mergeschema", "true")
        .mode("overwrite")
    )
    delta_writer.saveAsTable(target_table)

    logger.info(f"Data ingested from {file} into {bronze_schema}.{file}")

In [0]:
from datetime import datetime
from pyspark.sql.functions import current_timestamp


# === Snapshot landing files into the dated bronze volume ===
# `volume_name` and `volume_path` (bronze_<YYYYMMDD>) come from the configuration
# cell near the top of this notebook. They are deliberately not recomputed here,
# so the volume created below and the paths written to always agree.

# === Create Volume ===
spark.sql(f"""
    CREATE VOLUME IF NOT EXISTS {catalog}.{bronze_schema}.{volume_name}
""")

# === Loop through each file and ingest ===
for file in tables_to_process:
    input_path = f"{landing_path}/{file}.csv"
    output_path = f"{volume_path}/{file}.csv"

    df = (
        spark.read.format("csv")
        .option("header", "true")
        .load(input_path)
        .withColumn("ingestion_date", current_timestamp())
    )

    display(df)

    # Write a CSV snapshot into the volume. Spark's save() creates a directory
    # at `output_path` (so the folder is literally named "<file>.csv");
    # coalesce(1) reduces the frame to one partition before writing.
    # header=true is required on the write side as well: the CSV writer emits
    # no header row by default, which would drop the column names read above.
    (
        df.coalesce(1)
        .write.format("csv")
        .option("header", "true")
        .mode("overwrite")
        .save(output_path)
    )

    logger.info(f"✅ Ingested {file}.csv into volume {volume_name} under folder {file}.csv")
