In [0]:
import sys
import os

# Make the sibling ``config`` folder importable so that
# databricks_secret_setup.py can be loaded like a regular module.
# The path is resolved relative to the current working directory.
config_dir = os.path.abspath('../config')
if config_dir not in sys.path:
    # Guard keeps repeated runs of this cell from stacking duplicate entries
    sys.path.append(config_dir)

# Project-local helper module; it only becomes importable after the path tweak above
from databricks_secret_setup import AWS_CREDS, get_s3_base_path

# Resolve the base path right away so a configuration problem surfaces in this
# cell; the ingestion function reads s3_base_path as a notebook-level global.
s3_base_path = get_s3_base_path()

print("🚀 Success!")

In [0]:
%sql
-- Set up the catalog, schema and volume this notebook writes into.
-- Every statement uses IF NOT EXISTS, so re-running the cell is safe.
-- This creates a dedicated 'sandbox' for your project
CREATE CATALOG IF NOT EXISTS sales_project;
CREATE SCHEMA IF NOT EXISTS sales_project.bronze;

-- A Volume is a managed folder that bypasses the "Public DBFS" restriction.
-- The Python cells address it as /Volumes/sales_project/bronze/landing_zone_metadata.
CREATE VOLUME IF NOT EXISTS sales_project.bronze.landing_zone_metadata;

In [0]:
# Root of the volume under which table data, checkpoints and schemas are stored.
# Format: /Volumes/<catalog>/<schema>/<volume_name>/<folder>
volume_path = "/Volumes/sales_project/bronze/landing_zone_metadata"
# NOTE(review): neither imported name is referenced in the visible cells (the
# ingestion function derives both columns through selectExpr) - confirm before removing.
from pyspark.sql.functions import input_file_name, current_timestamp
def ingest_to_bronze(entity_name):
    """Ingest one entity's CSV files into its Bronze Delta table with Auto Loader.

    Reads ``{s3_base_path}{entity_name}/`` as a ``cloudFiles`` CSV stream, adds
    ``source_file`` and ``ingested_at`` columns, and appends the rows to a Delta
    table under ``{volume_path}/data/{entity_name}``. The call blocks until the
    streaming query terminates.

    Relies on the notebook-level globals ``spark``, ``s3_base_path`` and
    ``volume_path``.

    Args:
        entity_name: Folder name of the entity below the S3 base path; also
            used as the sub-folder name for the table, checkpoint and schema.

    Returns:
        None.

    Raises:
        Whatever the streaming query raises; nothing is caught here.
    """
    # NOTE(review): assumes s3_base_path already ends with "/" - confirm in
    # databricks_secret_setup.get_s3_base_path.
    source_path = f"{s3_base_path}{entity_name}/"

    # Target Delta table location
    target_path = f"{volume_path}/data/{entity_name}"

    # Metadata locations: streaming progress and the inferred schema are kept
    # in separate folders, matching the layout the reset cell cleans up.
    checkpoint_path = f"{volume_path}/checkpoints/{entity_name}"
    schema_path = f"{volume_path}/schemas/{entity_name}"

    print(f"🎬 Starting ingestion for: {entity_name}")

    bronze_rows = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("header", "true")
        # Previously pointed at checkpoint_path, which left schema_path unused.
        # NOTE(review): a stream that already ran with the old setting will
        # presumably re-infer its schema once into the new folder - confirm.
        .option("cloudFiles.schemaLocation", schema_path)
        .load(source_path)
        # Lineage columns: originating file and load timestamp
        .selectExpr("*", "_metadata.file_path AS source_file", "current_timestamp() AS ingested_at")
    )

    query = (
        bronze_rows.writeStream
        .format("delta")
        .option("checkpointLocation", checkpoint_path)
        .option("mergeSchema", "true")
        # Run as a bounded batch-style job instead of a continuously running stream
        .trigger(availableNow=True)
        .outputMode("append")
        .start(target_path)
    )

    # Block here so callers looping over entities process one table at a time
    query.awaitTermination()

In [0]:
# Entities to land in Bronze, processed one after another.
entities = [
    "customers",
    "orders",
    "order_items",
    "products",
]

# Best-effort run: a failing entity is reported and the loop carries on
# with the remaining ones.
for entity in entities:
    try:
        ingest_to_bronze(entity)
    except Exception as e:
        print(f"Failed to ingest {entity}: {e}")
    else:
        print(f"Successfully ingested {entity}")

In [0]:
# Preview the first 20 rows of each entity's Bronze Delta table.
for entity in entities:
    # Built from volume_path (rather than a second hard-coded copy of it) so the
    # preview always reads from the location ingest_to_bronze writes to.
    path = f"{volume_path}/data/{entity}"
    print(f"--- Displaying Bronze Table: {entity} ---")
    df = spark.read.format("delta").load(path)
    display(df.limit(20))

To clear the already existing table data, checkpoints, and schema metadata (a full reset before re-ingesting):

# Reset helper: recursively delete what was stored per entity under the volume.
entities = ["customers", "orders", "order_items", "products"]

for entity in entities:
    print(f" Clearing old metadata for {entity}...")
    # Removed in this order: table data, checkpoint, schema folder
    stale_paths = (
        f"{volume_path}/data/{entity}",
        f"{volume_path}/checkpoints/{entity}",
        # Only relevant when the schema location is kept separate from the checkpoint
        f"{volume_path}/schemas/{entity}",
    )
    for stale_path in stale_paths:
        dbutils.fs.rm(stale_path, recurse=True)

print("Workspace cleaned. Now re-run your ingestion loop.")