In [0]:
# ============================================================
# 00_ingest_from_azure_to_dbfs (ONE-CELL VERSION)
# Purpose:
#   - Databricks Serverless (AWS) cannot reliably write to /dbfs via Python open()/Path().
#   - So we download the Azure blobs directly into a Databricks Volume (/Volumes/...),
#     where normal Python file I/O (open()/Path()) is supported.
#
# What you must provide:
#   Secrets tenant_id, client_id, client_secret and storage_account in the
#   "olist-secrets" secret scope (they are read with dbutils.secrets.get).
# ============================================================


# ----------------------------
# 0) Install dependencies (Azure SDK) and restart Python so imports work
# ----------------------------

# Install the Azure identity and Azure Blob Storage SDKs into the current notebook environment
# (-q keeps pip's output quiet).
# NOTE(review): the packages are unpinned, so different runs may resolve different releases —
# consider pinning exact versions for reproducible runs.
%pip install -q azure-identity azure-storage-blob

# Restart the Python process so the newly installed packages can be imported by later cells
dbutils.library.restartPython()


In [0]:
# Restart the Python process from its own cell.
# NOTE(review): the install cell already ends with restartPython(); this second restart
# looks redundant — confirm it is still needed before removing it.
dbutils.library.restartPython()

In [0]:
# Sanity check: importing both Azure SDK packages raises ImportError here
# if the %pip install did not take effect.
import azure.identity
import azure.storage.blob
# Reached only when both imports succeeded
print("Azure SDK imports OK")


In [0]:
# ============================================================
# Cell 2: Download Azure blobs directly into YOUR Databricks Volume
# Volume base path: /Volumes/olist/stage/olist_stage
# ============================================================

# Import datetime (and timezone, for timezone-aware UTC timestamps) used in log output
from datetime import datetime, timezone

# Import Path so we can create folders and write files inside the Volume
from pathlib import Path

# Import Azure credential class to authenticate using Service Principal (client id/secret)
from azure.identity import ClientSecretCredential

# Import Azure Blob client to list and download blobs (files) from the container
from azure.storage.blob import BlobServiceClient


# ----------------------------
# 1) Azure credentials + storage config (read from the "olist-secrets" secret scope)
# ----------------------------
# Nothing is hard-coded here: every credential comes from Databricks secrets.
# The values are deliberately never printed, so they cannot end up in saved
# notebook output.

# Tenant ID (Directory ID) from Microsoft Entra ID (Azure AD)
TENANT_ID = dbutils.secrets.get("olist-secrets", "tenant_id")

# Client ID (Application (client) ID / appId) for your Service Principal
CLIENT_ID = dbutils.secrets.get("olist-secrets", "client_id")

# Client Secret (password) for your Service Principal (keep private, do NOT commit)
CLIENT_SECRET = dbutils.secrets.get("olist-secrets", "client_secret")

# Storage account name created by Terraform (example: olistlake89321)
STORAGE_ACCOUNT_NAME = dbutils.secrets.get("olist-secrets", "storage_account")

# Confirm the lookups succeeded without echoing any secret value
print("Loaded tenant_id, client_id, client_secret and storage_account from secret scope 'olist-secrets'")


# Container name created by Terraform (example: datalake)
CONTAINER_NAME = "datalake"

# Prefix inside the container where your raw CSVs were uploaded
RAW_PREFIX = "raw/olist/"


# ----------------------------
# 2) Databricks Volume destination
# ----------------------------

# Root of the Volume that receives the files (lives under the olist catalog)
VOLUME_BASE = "/Volumes/olist/stage/olist_stage"

# Raw landing folder inside that Volume; kept as a plain string for later path building
VOLUME_DIR = VOLUME_BASE + "/olist_raw"

# Make sure the landing folder exists; a no-op when it is already there
Path(VOLUME_DIR).mkdir(exist_ok=True, parents=True)


# ----------------------------
# 3) Build Azure endpoint URL + authenticate (OAuth)
# ----------------------------

# HTTPS Blob endpoint of the storage account
ACCOUNT_URL = "https://{}.blob.core.windows.net".format(STORAGE_ACCOUNT_NAME)

# Service Principal credential (client-credentials flow): tenant, application id, application secret
credential = ClientSecretCredential(TENANT_ID, CLIENT_ID, CLIENT_SECRET)

# Account-level Blob Storage client authenticated with that credential
blob_service_client = BlobServiceClient(ACCOUNT_URL, credential=credential)

# Client scoped to the single container that holds the raw files
container_client = blob_service_client.get_container_client(CONTAINER_NAME)


# ----------------------------
# 4) List all blobs (files) under the prefix raw/olist/
# ----------------------------

# Collect blob names that begin with RAW_PREFIX (Azure uses prefixes like folders)
blob_names = [b.name for b in container_client.list_blobs(name_starts_with=RAW_PREFIX)]

# Current UTC time for the log line. datetime.utcnow() is deprecated (Python 3.12+), so take
# a timezone-aware "now" and drop tzinfo: isoformat() then emits no "+00:00" offset and the
# literal "Z" suffix below stays correct.
listed_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

# Print how many blobs we found (helps confirm prefix is correct)
print(f"[{listed_at}Z] Found {len(blob_names)} blobs under '{RAW_PREFIX}'")

# Print a few blob names as a quick sanity check
print("Sample blobs:", blob_names[:10])


# ----------------------------
# 5) Download each blob to the Volume folder
# ----------------------------

# Loop through each blob path returned by Azure
for blob_name in blob_names:
    # Skip placeholder blobs if present (some flows create raw/olist/.keep)
    if blob_name.endswith("/.keep"):
        continue

    # Extract the filename from the blob path (everything after the last slash)
    filename = blob_name.split("/")[-1]

    # A name ending in "/" (folder-style marker) leaves no filename; opening the
    # resulting path would target the landing directory itself and raise, so skip it.
    if not filename:
        print(f"Skipping {blob_name} (no file name component)")
        continue

    # Build the destination path in the Volume
    # (nested blobs are flattened: only the last path component is kept)
    local_path = str(Path(VOLUME_DIR) / filename)

    # Create a client object for this specific blob so we can download it
    blob_client = container_client.get_blob_client(blob_name)

    # Start the download before opening the destination, so a failed request
    # does not leave an empty file behind in the Volume
    downloader = blob_client.download_blob()

    # Stream the blob into the Volume file instead of buffering the whole blob in memory
    with open(local_path, "wb") as f:
        downloader.readinto(f)

    # Print progress so you can see what was downloaded
    print(f"Downloaded {blob_name} -> {local_path}")


# ----------------------------
# 6) Verify files exist in the Volume
# ----------------------------

# Names of the CSV files sitting directly in the landing folder, in sorted order
downloaded_csvs = sorted(csv_file.name for csv_file in Path(VOLUME_DIR).glob("*.csv"))

# Show them so the download can be checked by eye
print("Downloaded CSVs:", downloaded_csvs)


# ----------------------------
# 7) Spark read test from the Volume path (validates end-to-end)
# ----------------------------

# One CSV used to check that Spark can read what was downloaded
test_file = "olist_orders_dataset.csv"

# Full path of that file inside the landing folder
test_path = VOLUME_DIR + "/" + test_file

# Load it with Spark: first row is the header, column types are inferred (fine for a quick test)
df_test = spark.read.csv(test_path, header=True, inferSchema=True)

# Show the first rows in the notebook UI
display(df_test.limit(5))

# Count the rows, which makes Spark scan the file
print("Row count:", df_test.count())


In [0]:
# Confirm the secret scope is readable by fetching each secret and reporting only its
# length, so no secret value (including the storage account name) is ever printed.
for secret_key in ("tenant_id", "client_id", "client_secret", "storage_account"):
    print(f"{secret_key} length:", len(dbutils.secrets.get("olist-secrets", secret_key)))


In [0]:
# Diagnose a malformed tenant_id secret (e.g. stray whitespace or a trailing newline)
# without printing the value itself.
# NOTE(review): repr() escapes characters, so its output may not match the literal value
# that notebook secret redaction looks for — avoid printing transformed secrets.
tenant_id = dbutils.secrets.get("olist-secrets", "tenant_id")

print("tenant_id length:", len(tenant_id))
print("tenant_id has surrounding whitespace:", tenant_id != tenant_id.strip())
