In [0]:
# Coordinates of the volume to inspect: /Volumes/<CATALOG>/<SCHEMA>/<VOLUME>/
CATALOG = "workspace"
SCHEMA  = "material_master"
VOLUME  = "datastore"   # volume name; note that the literal contains no spaces

# Root directory of the volume (with trailing slash).
base = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}/"
# Sanity check: list the directory and render the result in the notebook.
display(dbutils.fs.ls(base))  # you should see material_master_1k.csv


In [0]:
# DLT notebook: dlt_material_master

import dlt
import pyspark.sql.functions as F

# Coordinates of the source file: /Volumes/<CATALOG>/<SCHEMA>/<VOLUME>/<FILE>
CATALOG = "workspace"
SCHEMA  = "material_master"
VOLUME  = "datastore"     # volume name
FILE    = "material_master_1k.csv"

# Full path of the CSV file that the bronze table reads.
SOURCE_CSV = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}/{FILE}"


# =========================
# BRONZE — RAW AS-IS (CSV|)
# =========================
@dlt.table(
    name="material_master_bronze",
    comment="Bronze: raw pipe-delimited Material Master from Volume (as-is)."
)
def material_master_bronze():
    """Load the Material Master file from SOURCE_CSV without transforming it.

    The first line of the file is treated as the header and fields are split
    on '|'. No schema and no schema-inference option is supplied here.

    Returns:
        The DataFrame produced by the CSV reader.
    """
    # Configure the reader step by step, then point it at the file.
    csv_reader = spark.read.option("header", True)
    csv_reader = csv_reader.option("delimiter", "|")
    return csv_reader.csv(SOURCE_CSV)


# ============================================
# SILVER — CLEAN, STANDARDIZE, FIX DATATYPES
# ============================================
def _quote_name(name):
    """Backtick-quote a column name so dots/backticks in it are taken literally."""
    return "`" + name.replace("`", "``") + "`"


def _cast_dropping_invalid(df, name, target_type):
    """Cast column `name` to `target_type`, keeping the original in `<name>_raw`.

    Rows where the original value is non-null but the cast result is null are
    filtered out; rows whose original value is null are kept.
    """
    raw_name = f"{name}_raw"
    cast_failed = F.col(raw_name).isNotNull() & F.col(name).isNull()
    return (
        df.withColumn(raw_name, F.col(name))
          .withColumn(name, F.col(name).cast(target_type))
          .filter(~cast_failed)
    )


@dlt.table(
    name="material_master_silver",
    comment="Silver: cleaned & standardized Material Master with datatype corrections and validation."
)
@dlt.expect_or_drop("not_null_material_id", "material_id IS NOT NULL")
def material_master_silver():
    """Build the silver table from `material_master_bronze`.

    Steps, each applied only when the relevant column exists:
      1. Lower-case all column names and trim every string column.
      2. Cast `unit_cost` to DECIMAL(18,2).
      3. Cast `lead_time_days`, `safety_stock`, `reorder_level` to INT.
      4. Parse `last_updated` as a date with pattern yyyy-MM-dd.

    For steps 2 and 3 the pre-cast value is kept in a `<column>_raw` column
    and rows whose non-null value turned into null after the cast are removed.

    NOTE(review): the `*_raw` columns are never dropped, so they are part of
    the returned DataFrame — confirm that this is intended.
    NOTE(review): unlike the numeric columns, `last_updated` has no filter
    after parsing, so rows are not removed based on its value.

    Returns:
        The cleaned DataFrame.
    """
    df = dlt.read("material_master_bronze")

    # 1) Normalize column names + trim strings.
    # toDF renames positionally, so header names containing dots or backticks
    # are not parsed as column expressions.
    df = df.toDF(*[c.lower() for c in df.columns])
    # One projection for all columns instead of one withColumn per string
    # column; column order is unchanged.
    df = df.select([
        F.trim(F.col(_quote_name(c))).alias(c) if t == "string"
        else F.col(_quote_name(c))
        for c, t in df.dtypes
    ])

    # Snapshot of the column names before any `_raw` helper column is added.
    cols = set(df.columns)

    # 2) unit_cost -> DECIMAL(18,2)
    # 3) lead_time_days, safety_stock, reorder_level -> INT
    casts = [("unit_cost", "decimal(18,2)")]
    casts += [(n, "int") for n in ("lead_time_days", "safety_stock", "reorder_level")]
    for name, target_type in casts:
        if name in cols:
            df = _cast_dropping_invalid(df, name, target_type)

    # 4) last_updated -> DATE (yyyy-MM-dd)
    if "last_updated" in cols:
        df = df.withColumn("last_updated", F.to_date(F.col("last_updated"), "yyyy-MM-dd"))

    return df
