In [0]:
%run ../.././start_up

In [0]:
# Create the notebook logger at DEBUG level and announce start-up.
logger = create_logger(log_level="DEBUG", notebook_name="landing_to_bronze")
logger.info("🚀 Initializing landing_to_bronze notebook")

# Pull the commonly used settings out of pipeline_config once so later cells can
# use short names instead of repeating nested lookups.
# NOTE(review): pipeline_config is not defined in this notebook; presumably it is
# provided by the start_up notebook executed in the first cell - confirm.
_schemas = pipeline_config["schemas"]
_paths = pipeline_config["paths"]

catalog = pipeline_config["catalog"]
bronze_schema, landing_schema, logs_schema = (
    _schemas["bronze"],
    _schemas["landing"],
    _schemas["logs"],
)
bronze_path, landing_path = _paths["bronze_path"], _paths["landing_path"]
# NOTE(review): bronze_volume_path is reassigned later in this notebook when the
# date-stamped bronze volume is created, so this configured value is overridden.
bronze_volume_path = _paths["bronze_volume_path"]
logger.info("Extracted frequently used config values into variables")

In [0]:

"""
Landing to Bronze Layer (CSV volume ingestion).

Ingests CSV files from the landing zone into a date-stamped bronze volume
(bronze_YYYYMMDD):
- Skips tables that were already ingested unless `force_reprocess` is true.
- Adds an `ingestion_date` timestamp column.
- Writes the result back as CSV.
"""

# COMMAND ----------
from datetime import datetime
from pyspark.sql.functions import current_timestamp
import os


# COMMAND ----------
# Notebook parameters, exposed as Databricks widgets:
#   table_name      - table to ingest when mode is "single"
#   mode            - "all" processes every entry of tables_to_process,
#                     "single" processes only table_name
#   force_reprocess - "true" re-ingests a table even if bronze output already exists
dbutils.widgets.text("table_name", "")
dbutils.widgets.dropdown("mode", "all", ["all", "single"])
dbutils.widgets.dropdown("force_reprocess", "true", ["true", "false"])

# Strip stray whitespace so a padded widget value still maps to the right file name.
table_name = dbutils.widgets.get("table_name").strip()
mode = dbutils.widgets.get("mode")
force_reprocess = dbutils.widgets.get("force_reprocess").lower() == "true"

if mode == "single" and table_name:
    tables = [table_name]
else:
    if mode == "single":
        # Keep the original fallback to the full list, but make it visible instead of silent.
        logger.warning("⚠️ mode is 'single' but table_name is empty; processing all tables instead")
    tables = tables_to_process

# COMMAND ----------
# Name today's bronze volume after the current local date (bronze_YYYYMMDD).
today_str = datetime.now().strftime("%Y%m%d")
bronze_volume_name = f"bronze_{today_str}"
bronze_volume_path = f"/Volumes/{catalog}/{bronze_schema}/{bronze_volume_name}"

# IF NOT EXISTS makes this statement safe to repeat on the same day.
_create_volume_sql = f"CREATE VOLUME IF NOT EXISTS {catalog}.{bronze_schema}.{bronze_volume_name}"
spark.sql(_create_volume_sql)
logger.info(f"✅ Volume ready: {bronze_volume_path}")

# COMMAND ----------
@log_execution(logger)
def read_landing_csv(table_name: str):
    """Load ``<landing_path>/<table_name>.csv`` and stamp it with an ingestion time.

    The file is read with its first row treated as the header, and an
    ``ingestion_date`` column holding the current timestamp is appended.

    Args:
        table_name: Base name of the CSV file in the landing path (without ``.csv``).

    Returns:
        The Spark DataFrame read from the file, including the ``ingestion_date`` column.

    Raises:
        Exception: Any error raised while reading is logged and re-raised unchanged.
    """
    input_path = f"{landing_path}/{table_name}.csv"
    logger.info(f"📥 Reading file: {input_path}")
    try:
        csv_reader = spark.read.option("header", "true")
        landing_df = csv_reader.csv(input_path)
        stamped_df = landing_df.withColumn("ingestion_date", current_timestamp())
        log_dataframe_info(stamped_df, f"{table_name}_read", logger)
    except Exception as exc:
        logger.error(f"❌ Error reading {input_path}: {exc}")
        raise
    return stamped_df

# COMMAND ----------
@log_execution(logger)
def write_csv_to_bronze(df, table_name: str):
    """Write ``df`` as CSV with a header row into the table's folder in the bronze volume.

    The DataFrame is coalesced to a single partition before writing, and any
    existing output at the destination is overwritten.

    Args:
        df: Spark DataFrame to persist.
        table_name: Folder name under ``bronze_volume_path`` to write into.

    Raises:
        Exception: Any error raised while writing is logged and re-raised unchanged.
    """
    output_path = f"{bronze_volume_path}/{table_name}"
    logger.info(f"💾 Writing to: {output_path}")
    try:
        single_partition_df = df.coalesce(1)
        csv_writer = (
            single_partition_df.write
            .option("header", "true")
            .mode("overwrite")
            .format("csv")
        )
        csv_writer.save(output_path)
        logger.info(f"✅ Successfully written {table_name} to bronze volume")
    except Exception as exc:
        logger.error(f"❌ Failed to write {table_name}: {exc}")
        raise

# COMMAND ----------
def already_ingested(table_name: str) -> bool:
    """Report whether the bronze volume already holds CSV output for ``table_name``.

    Lists ``<bronze_volume_path>/<table_name>`` and looks for at least one file
    whose name ends in ``.csv``.

    Args:
        table_name: Folder name under ``bronze_volume_path`` to inspect.

    Returns:
        True if a ``.csv`` file is present in the folder; False if none is found
        or the folder cannot be listed.
    """
    output_path = f"{bronze_volume_path}/{table_name}"
    try:
        entries = dbutils.fs.ls(output_path)
    except Exception as exc:
        # A folder that cannot be listed is treated as "not ingested yet" (best effort).
        # Catch Exception rather than using a bare except so KeyboardInterrupt and
        # SystemExit still propagate, and record the reason for troubleshooting.
        logger.debug(f"Could not list {output_path}; treating as not ingested: {exc}")
        return False
    # Match on the suffix so sidecar files such as "*.csv.crc" do not count as data.
    return any(entry.name.endswith(".csv") for entry in entries)

# COMMAND ----------
def process_table(table_name: str) -> str:
    """Ingest one table from the landing path into the bronze volume.

    Skips the table when ``force_reprocess`` is false and bronze output already
    exists; otherwise reads the landing CSV and writes it to the bronze volume.
    Errors are logged and reported through the return value rather than raised,
    so one failing table does not stop the caller.

    Args:
        table_name: Name of the table (landing file base name) to process.

    Returns:
        ``"SKIPPED"``, ``"SUCCESS"`` or ``"FAILED"``.
    """
    logger.info(f"🚀 Processing: {table_name}")

    if not force_reprocess and already_ingested(table_name):
        logger.info(f"⏩ Skipping {table_name}: already exists in bronze volume")
        return "SKIPPED"

    try:
        df = read_landing_csv(table_name)
        write_csv_to_bronze(df, table_name)
    except Exception as e:
        logger.error(f"❌ Failed processing {table_name}: {e}")
        return "FAILED"

    # The row count is informational only. It runs as a separate Spark action after
    # the write has finished, so a failure here must not turn a completed write
    # into a "FAILED" status.
    try:
        row_count = df.count()
        logger.info(f"✅ Completed {table_name}: {row_count} rows written")
    except Exception as e:
        logger.warning(f"✅ Completed {table_name}, but the row count could not be computed: {e}")

    return "SUCCESS"

# COMMAND ----------
# Process every selected table in turn and remember how each one ended.
status_map = {}
for tbl in tables:
    status = process_table(tbl)
    status_map[tbl] = status

logger.info(f"🏁 Bronze ingestion completed. Status: {status_map}")

# process_table reports failures through its return value instead of raising, so
# call them out at ERROR level; otherwise a run with failures looks like a clean one.
# The notebook still completes so the successfully ingested tables are kept.
failed_tables = [name for name, outcome in status_map.items() if outcome == "FAILED"]
if failed_tables:
    logger.error(f"❌ {len(failed_tables)} table(s) failed bronze ingestion: {failed_tables}")



In [0]:

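# NOTE: legacy single-pass implementation, kept commented out for reference only.
# It is superseded by the process_table loop in the previous cell.
# # === Create Volume ===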
# spark.sql(f"""
#     CREATE VOLUME IF NOT EXISTS {catalog}.{bronze_schema}.{bronze_volume_name}
# """)

# # === Loop through each file and ingest ===
# for file in tables_to_process:
#     input_path = f"{landing_path}/{file}.csv"
    
#     df = (
#         spark.read.format("csv")
#         .option("header", "true")
#         .load(input_path)
#         .withColumn("ingestion_date", current_timestamp())
#     )

#     # display(df)

#     # Write as CSV inside the volume (one folder per table)
#     df.coalesce(1).write.format("csv") \
#         .option("header","true") \
#         .mode("overwrite") \
#         .save(f"{bronze_volume_path}/{file}")

#     logger.info(f"✅ Ingested {file}.csv into volume {bronze_volume_name} under folder {file}")
