In [0]:
# ============================================================
# 01_bronze_ingest.py (FULL SCRIPT + REQUESTED IMPROVEMENTS)
# Unity Catalog + Volumes compatible + Incremental scaffolding
#
# Changes applied (per your request):
#   A) Split catalog/schema names for clarity (olist.bronze and olist.ops)
#   B) Update ingest-log comment to match renamed bronze tables (orders_raw etc.)
#   C) Add .option("overwriteSchema","true") to Bronze writes for safer overwrite runs
#
# What this notebook does:
#   - Reads raw CSVs from a Databricks Volume:
#       /Volumes/olist/stage/olist_stage/olist_raw
#   - Adds Bronze metadata columns:
#       ingested_at, source_file, source_system
#   - Writes to Unity Catalog managed Bronze tables using Delta + saveAsTable()
#   - Maintains an ingest log table to support incremental runs:
#       olist.ops.bronze_ingest_log
#   - Incremental logic: when WRITE_MODE="append", ingest only NEW source files
#
# How to run:
#   - First run: WRITE_MODE = "overwrite"  (clean baseline)
#   - Future incremental runs: WRITE_MODE = "append" (only new files)
# ============================================================


# ----------------------------
# 0) Imports
# ----------------------------

from pyspark.sql.functions import (
    current_timestamp,  # Adds an ingestion timestamp column
    lit,                # Adds constant values as columns
    col                 # References columns (including _metadata.file_path)
)


# ----------------------------
# 1) Source location (Raw CSVs) - in Databricks Volume
# ----------------------------

# Volume folder that holds the raw CSV files (already downloaded from Azure into this Volume)
RAW_VOLUME_DIR = "/Volumes/olist/stage/olist_stage/olist_raw"



# ----------------------------
# 2) Catalog/Schema configuration (explicit and clear)
# ----------------------------

# Bronze layer: catalog and schema (database) that hold the raw landing tables
BRONZE_CATALOG = "olist"
BRONZE_DB = "bronze"
BRONZE_SCHEMA = ".".join((BRONZE_CATALOG, BRONZE_DB))  # -> "olist.bronze"

# Ops layer: same catalog, separate schema (database) for control tables
OPS_CATALOG = "olist"
OPS_DB = "ops"
OPS_SCHEMA = ".".join((OPS_CATALOG, OPS_DB))  # -> "olist.ops"

# Control table recording which source files were already ingested per bronze table
INGEST_LOG_TABLE = ".".join((OPS_SCHEMA, "bronze_ingest_log"))  # -> "olist.ops.bronze_ingest_log"


# ----------------------------
# 3) Write mode configuration
# ----------------------------

# Write mode:
#   - "overwrite" = initial full load (clean baseline)
#   - "append"    = incremental loads (only new files)
WRITE_MODE = "append"  # Incremental run; set to "overwrite" for a clean full reload

# Fail fast on any other value: the helpers below treat everything except
# "overwrite" as an incremental run and hand the value straight to
# DataFrameWriter.mode(), so a typo would otherwise go unnoticed.
if WRITE_MODE not in ("overwrite", "append"):
    raise ValueError(
        f"WRITE_MODE must be 'overwrite' or 'append', got {WRITE_MODE!r}"
    )


# ----------------------------
# 4) Create schemas (databases) if they don't exist
# ----------------------------

# Ensure both target schemas exist (Bronze first, then Ops); IF NOT EXISTS makes this a no-op on re-runs
for _schema_fqn in (BRONZE_SCHEMA, OPS_SCHEMA):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {_schema_fqn}")


# ----------------------------
# 5) Create the ingest log table (control table) if it does not exist
# ----------------------------

# Control table: one row per (bronze_table, source_file) pair that has been ingested.
# Incremental runs consult it so the same file is not ingested a second time.
_INGEST_LOG_DDL = f"""
CREATE TABLE IF NOT EXISTS {INGEST_LOG_TABLE} (
  bronze_table STRING,      -- e.g. 'orders_raw'
  source_file  STRING,      -- UC-safe file path from _metadata.file_path
  ingested_at  TIMESTAMP    -- timestamp when we ingested the file
)
USING DELTA
"""
spark.sql(_INGEST_LOG_DDL)


# ----------------------------
# 6) Helper: Read CSV and add Unity Catalog-safe metadata columns
# ----------------------------

def read_csv_with_metadata(csv_filename: str):
    """
    Read one raw CSV from the Volume and attach the standard Bronze metadata columns.

    The file is parsed as quoted CSV in which a field may span several physical
    lines and an embedded quote is written as two quotes. With Spark's default
    options (multiLine off, backslash escape) such a record is broken apart at
    every line break, which produces extra, misaligned rows.
    NOTE(review): free-text columns such as review comments are the likely
    place for embedded line breaks - confirm row counts against the source files.

    Args:
        csv_filename: File name relative to RAW_VOLUME_DIR.

    Returns:
        DataFrame with the inferred CSV columns plus:
          - ingested_at   : current_timestamp() of the read
          - source_file   : _metadata.file_path (Unity Catalog compatible lineage)
          - source_system : constant "olist_kaggle"
    """

    # Full path of the CSV inside the raw Volume folder
    csv_path = f"{RAW_VOLUME_DIR}/{csv_filename}"

    raw_df = (
        spark.read
        .option("header", True)       # First row holds the column names
        .option("inferSchema", True)  # Infer datatypes (ok for Bronze)
        .option("multiLine", True)    # Quoted fields may contain line breaks
        .option("quote", '"')         # Explicit quote character
        .option("escape", '"')        # Embedded quotes are doubled ("") rather than backslash-escaped
        .csv(csv_path)
    )

    # Standard Bronze metadata: ingestion time, file lineage, constant source label
    return (
        raw_df
        .withColumn("ingested_at", current_timestamp())
        .withColumn("source_file", col("_metadata.file_path"))
        .withColumn("source_system", lit("olist_kaggle"))
    )


# ----------------------------
# 7) Helper: Filter only NEW files for incremental runs
# ----------------------------

def filter_new_files_only(df, bronze_table: str):
    """
    Drop rows that come from files already recorded in the ingest log.

    When WRITE_MODE is 'overwrite' the input is returned untouched (full reload).
    For any other WRITE_MODE, rows whose source_file is already logged for
    bronze_table in INGEST_LOG_TABLE are removed.
    """
    is_full_reload = WRITE_MODE == "overwrite"

    if not is_full_reload:
        # File paths previously logged for this bronze table (de-duplicated)
        ingest_log = spark.table(INGEST_LOG_TABLE)
        seen_files = (
            ingest_log
            .filter(col("bronze_table") == lit(bronze_table))
            .select(col("source_file").alias("logged_source_file"))
            .distinct()
        )

        # left_anti keeps only the rows of df that have no match among the logged paths
        already_logged = df["source_file"] == seen_files["logged_source_file"]
        return df.join(seen_files, on=already_logged, how="left_anti")

    # Full reload: nothing to filter
    return df


# ----------------------------
# 8) Helper: Write Bronze table as Unity Catalog MANAGED table (no LOCATION)
# ----------------------------

def write_bronze_table_managed(df, bronze_table: str):
    """
    Persist df as a Unity Catalog managed Delta table via saveAsTable().

    The target is BRONZE_SCHEMA.<bronze_table>; no LOCATION is given, which
    avoids CREATE TABLE ... LOCATION '/Volumes/...' and the resulting
      'Missing cloud file system scheme'
    error. The save mode comes from WRITE_MODE (overwrite = full rebuild,
    append = add new rows); the overwriteSchema option is set on every write.
    """
    # Fully-qualified UC name: catalog.schema.table
    target_table = f"{BRONZE_SCHEMA}.{bronze_table}"

    delta_writer = df.write.format("delta").mode(WRITE_MODE)
    # Lets an overwrite run replace the table schema as well as the data
    delta_writer = delta_writer.option("overwriteSchema", "true")
    delta_writer.saveAsTable(target_table)


# ----------------------------
# 9) Helper: Update ingest log after writing new rows
# ----------------------------

def log_ingested_files(df_written, bronze_table: str):
    """
    Append one ingest-log row per distinct source_file found in df_written.

    In 'overwrite' mode the existing log rows for bronze_table are deleted
    first, because the table itself was rebuilt from scratch. The new rows are
    always appended to INGEST_LOG_TABLE so later incremental runs can skip
    these files.
    """
    if WRITE_MODE == "overwrite":
        # Table was rebuilt: drop its stale log entries before logging again
        spark.sql(f"DELETE FROM {INGEST_LOG_TABLE} WHERE bronze_table = '{bronze_table}'")

    # One row per file: (bronze_table, source_file), then stamp the log time
    table_tag = lit(bronze_table).alias("bronze_table")
    distinct_files = df_written.select(table_tag, col("source_file")).distinct()
    log_rows = distinct_files.withColumn("ingested_at", current_timestamp())

    # The log is append-only from this function's point of view
    log_rows.write.format("delta").mode("append").saveAsTable(INGEST_LOG_TABLE)


# ----------------------------
# 10) Dataset map (bronze_table -> csv filename)
# ----------------------------

# Bronze landing table name (all suffixed "_raw") -> CSV file it is loaded from
datasets = dict(
    orders_raw="olist_orders_dataset.csv",
    order_items_raw="olist_order_items_dataset.csv",
    order_payments_raw="olist_order_payments_dataset.csv",
    customers_raw="olist_customers_dataset.csv",
    products_raw="olist_products_dataset.csv",
    sellers_raw="olist_sellers_dataset.csv",
    order_reviews_raw="olist_order_reviews_dataset.csv",
    geolocation_raw="olist_geolocation_dataset.csv",
    category_translation_raw="product_category_name_translation.csv",
)


# ----------------------------
# 11) Ingest loop (read -> incremental filter -> write -> log)
# ----------------------------

for bronze_table, csv_file in datasets.items():
    # Load the CSV with Bronze metadata, then drop rows from files that were already logged
    df = read_csv_with_metadata(csv_file)
    df_new = filter_new_files_only(df, bronze_table)

    # Emptiness probe: fetches at most one row, but still runs one small Spark job per table
    has_new_rows = df_new.limit(1).count() > 0

    if has_new_rows:
        # Write first, then record the files so later append runs skip them
        write_bronze_table_managed(df_new, bronze_table)
        log_ingested_files(df_new, bronze_table)
        print(f"OK: {BRONZE_SCHEMA}.{bronze_table} written with mode={WRITE_MODE} from {csv_file}")
    else:
        # Nothing new for this table: skip both the write and the log update
        print(f"SKIP: {BRONZE_SCHEMA}.{bronze_table} (no new files)")


# ----------------------------
# 12) Validation: show row counts + recent ingest log entries
# ----------------------------

# Sanity check: display one single-row result (table_name, row_count) per bronze table
for bronze_table in datasets:
    row_count_sql = (
        f"SELECT '{BRONZE_SCHEMA}.{bronze_table}' AS table_name, COUNT(*) AS row_count "
        f"FROM {BRONZE_SCHEMA}.{bronze_table}"
    )
    display(spark.sql(row_count_sql))

# Display the 50 newest ingest-log rows to confirm that logging works
recent_log_sql = f"SELECT * FROM {INGEST_LOG_TABLE} ORDER BY ingested_at DESC LIMIT 50"
display(spark.sql(recent_log_sql))


Migrating the Bronze and Ops tables from the `workspace` catalog to the new `olist` catalog.

In [0]:
%sql
-- Report the catalog the session is currently using (it determines where the
-- earlier, unqualified schemas were created).
SELECT current_catalog(); -- observed when last run: workspace

-- Report the schema the session is currently using.
-- NOTE(review): a multi-statement %sql cell may display only the last
-- statement's result - confirm, or split the two SELECTs into separate cells.
SELECT current_schema(); -- observed when last run: default

-- -- List all catalogs you have access to (to confirm 'olist' doesn't already exist)
-- SHOW CATALOGS;

-- -- List schemas in the current catalog (to see where 'olist_bronze' and 'olist_ops' exist)
-- SHOW SCHEMAS;


In [0]:
%sql
-- Create the target catalog for the migration; IF NOT EXISTS makes the cell safe to re-run.
CREATE CATALOG IF NOT EXISTS olist;

In [0]:
%sql
-- Create the Bronze and Ops schemas inside the olist catalog.
-- These are the same schemas the ingest cell creates from BRONZE_SCHEMA / OPS_SCHEMA,
-- so both statements are no-ops once that cell has run.
CREATE SCHEMA IF NOT EXISTS olist.bronze;
CREATE SCHEMA IF NOT EXISTS olist.ops;


In [0]:
%sql
-- Verify the catalog exists and list the schemas created inside it.
-- NOTE(review): a multi-statement %sql cell may display only the last
-- statement's result - confirm, or run SHOW CATALOGS in its own cell.
SHOW CATALOGS;
SHOW SCHEMAS IN olist;




In [0]:
%sql
-- One-time migration: deep-clone every Bronze table from the old
-- workspace.olist_bronze schema (brz_* names) into olist.bronze under the new
-- *_raw names.
-- IF NOT EXISTS keeps the cell re-runnable: a target that already exists
-- (from an earlier clone or from the ingest cell) is left untouched instead of
-- aborting the cell with a "table already exists" error.
CREATE TABLE IF NOT EXISTS olist.bronze.orders_raw
DEEP CLONE workspace.olist_bronze.brz_orders;

CREATE TABLE IF NOT EXISTS olist.bronze.order_items_raw
DEEP CLONE workspace.olist_bronze.brz_order_items;

CREATE TABLE IF NOT EXISTS olist.bronze.order_payments_raw
DEEP CLONE workspace.olist_bronze.brz_order_payments;

CREATE TABLE IF NOT EXISTS olist.bronze.customers_raw
DEEP CLONE workspace.olist_bronze.brz_customers;

CREATE TABLE IF NOT EXISTS olist.bronze.products_raw
DEEP CLONE workspace.olist_bronze.brz_products;

CREATE TABLE IF NOT EXISTS olist.bronze.sellers_raw
DEEP CLONE workspace.olist_bronze.brz_sellers;

CREATE TABLE IF NOT EXISTS olist.bronze.order_reviews_raw
DEEP CLONE workspace.olist_bronze.brz_order_reviews;

CREATE TABLE IF NOT EXISTS olist.bronze.geolocation_raw
DEEP CLONE workspace.olist_bronze.brz_geolocation;

CREATE TABLE IF NOT EXISTS olist.bronze.category_translation_raw
DEEP CLONE workspace.olist_bronze.brz_category_translation;


In [0]:
%sql
-- One-time migration of the ingest log from workspace.olist_ops into olist.ops.
-- NOTE(review): there is no IF NOT EXISTS here, so this statement errors when
-- olist.ops.bronze_ingest_log already exists (the ingest cell creates it with
-- CREATE TABLE IF NOT EXISTS). Presumably that loud failure is wanted: silently
-- skipping the clone would leave the log without the migrated history, and the
-- next append run would then re-ingest every file. Confirm before changing.
CREATE TABLE olist.ops.bronze_ingest_log
DEEP CLONE workspace.olist_ops.bronze_ingest_log;


In [0]:
%sql
-- Verify the migration: list the tables now present in olist.bronze.
SHOW TABLES IN olist.bronze;

In [0]:
# Sanity check: list the raw CSV folder once to confirm the files are visible in the Volume.
# The path is the same folder the ingest cell reads from (RAW_VOLUME_DIR).
display(dbutils.fs.ls("/Volumes/olist/stage/olist_stage/olist_raw"))

