### Imports & constants

In [None]:
from pyspark.sql import functions as F

# Three-level table names are built as CATALOG.BRONZE_SCHEMA.<table>.
CATALOG = "olist_ecommerce"
BRONZE_SCHEMA = "bronze"

# Directory holding the raw CSV files. Derived from the constants above so the
# catalog and schema names are spelled in one place only; the resulting value is
# "/Volumes/olist_ecommerce/bronze/raw_data".
RAW_DATA_PATH = f"/Volumes/{CATALOG}/{BRONZE_SCHEMA}/raw_data"
# Environment label written to the `env` column of every ingested table.
ENV = "dev"

### Dataset configuration

In [None]:
# One entry per raw CSV file: "file" is the source file name and "table" is the
# target table name. The optional "multiline" flag is forwarded to the CSV reader
# and is treated as False when absent; only the reviews file sets it
# (presumably because its free-text fields contain line breaks - TODO confirm).
datasets = [
    dict(file="olist_orders_dataset.csv", table="orders"),
    dict(file="olist_customers_dataset.csv", table="customers"),
    dict(file="olist_order_items_dataset.csv", table="order_items"),
    dict(file="olist_order_payments_dataset.csv", table="payments"),
    dict(file="olist_products_dataset.csv", table="products"),
    dict(file="olist_sellers_dataset.csv", table="sellers"),
    dict(file="olist_geolocation_dataset.csv", table="geolocation"),
    dict(file="olist_order_reviews_dataset.csv", table="reviews", multiline=True),
    dict(file="product_category_name_translation.csv", table="product_category_en"),
]

### Bronze ingestion function

In [None]:
def ingest_csv_to_bronze(dataset):
    """Load one raw CSV file into a Delta table in the bronze schema.

    The file is read from ``RAW_DATA_PATH``, two audit columns
    (``ingestion_ts`` and ``env``) are added, and the result is written to
    ``CATALOG.BRONZE_SCHEMA.<table>`` in overwrite mode. A confirmation line
    is printed once the write has finished.

    Args:
        dataset: Mapping with the required keys ``"file"`` (CSV file name) and
            ``"table"`` (target table name), plus an optional ``"multiline"``
            flag that is passed to the CSV reader and defaults to ``False``.

    Raises:
        KeyError: If ``"file"`` or ``"table"`` is missing from ``dataset``.
    """
    source_file, target = dataset["file"], dataset["table"]
    full_table_name = f"{CATALOG}.{BRONZE_SCHEMA}.{target}"

    # The first row is the header; a double quote both delimits quoted fields
    # and escapes a quote inside them. No schema or inferSchema option is set,
    # so the reader's defaults decide the column types.
    reader_options = {
        "header": True,
        "quote": '"',
        "escape": '"',
        "multiline": dataset.get("multiline", False),
    }
    raw_df = spark.read.options(**reader_options).csv(f"{RAW_DATA_PATH}/{source_file}")

    # Audit columns: the load timestamp and the value of the ENV constant.
    audited_df = raw_df.withColumn("ingestion_ts", F.current_timestamp())
    audited_df = audited_df.withColumn("env", F.lit(ENV))

    # Overwrite mode replaces the table contents on every run.
    writer = audited_df.write.format("delta").mode("overwrite")
    writer.saveAsTable(full_table_name)

    print(f"✅ Loaded {source_file} → {full_table_name}")

### Run ingestion for all datasets

In [None]:
# Ingest every configured CSV in list order. Errors are not caught here, so an
# exception raised for one file stops the loop before the remaining files run.
for dataset in datasets:
    ingest_csv_to_bronze(dataset)

✅ Loaded olist_orders_dataset.csv → olist_ecommerce.bronze.orders
✅ Loaded olist_customers_dataset.csv → olist_ecommerce.bronze.customers
✅ Loaded olist_order_items_dataset.csv → olist_ecommerce.bronze.order_items
✅ Loaded olist_order_payments_dataset.csv → olist_ecommerce.bronze.payments
✅ Loaded olist_products_dataset.csv → olist_ecommerce.bronze.products
✅ Loaded olist_sellers_dataset.csv → olist_ecommerce.bronze.sellers
✅ Loaded olist_geolocation_dataset.csv → olist_ecommerce.bronze.geolocation
✅ Loaded olist_order_reviews_dataset.csv → olist_ecommerce.bronze.reviews
✅ Loaded product_category_name_translation.csv → olist_ecommerce.bronze.product_category_en


### Validation

In [None]:
%sql
-- List the tables in the bronze schema to confirm that the ingestion created them.
SHOW TABLES IN olist_ecommerce.bronze;

database,tableName,isTemporary
bronze,customers,False
bronze,geolocation,False
bronze,order_items,False
bronze,orders,False
bronze,payments,False
bronze,product_category_en,False
bronze,products,False
bronze,reviews,False
bronze,sellers,False
