In [0]:
import dlt
import re
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import current_timestamp

# --- LAPD Bronze ---
@dlt.table(
    name="lapd.bronze.lapd_bronze",
    comment="Raw LAPD data"
)
def lapd_bronze():
    """Load the LAPD CSV file and return it as the bronze dataset.

    The file is read with a header row and multi-line field support. A
    ``load_dt`` column holding ``current_timestamp()`` is added, and then
    every column name has all characters other than ASCII letters, digits
    and underscores removed.
    """
    source_path = "/Volumes/lapd/raw_dataset/raw_dataset/lapd_cleaned_csv.csv"

    reader = spark.read.format("csv")
    for option_name, option_value in (("header", True), ("multiLine", True)):
        reader = reader.option(option_name, option_value)

    # Add the load timestamp before renaming, so the new column goes through
    # the same name sanitising as the columns read from the file.
    stamped = reader.load(source_path).withColumn("load_dt", current_timestamp())

    disallowed = re.compile('[^A-Za-z0-9_]')
    return stamped.toDF(*(disallowed.sub('', name) for name in stamped.columns))

In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

# ============================================================
# EXPECTATIONS DICTIONARY  (all as expect_or_drop)
# ============================================================

# Maps each expectation name to its SQL constraint. Every rule has the form
# "<prefix>_not_null" -> "<Column> IS NOT NULL"; the pairs below list the
# rule prefix next to the column it checks.
expectations = {
    f"{rule_prefix}_not_null": f"{column_name} IS NOT NULL"
    for rule_prefix, column_name in (
        ("file_number", "File_Number"),
        ("date_reported", "Date_Reported"),
        ("date_occured", "Date_Occured"),
        ("time_occured", "Time_Occured"),
        ("crime_code", "Crime_Code"),
        ("victim_sex", "Victim_Sex"),
        ("victim_age", "Victim_Age"),
        ("victim_descent", "Victim_Descent"),
        ("premis_code", "Premis_Code"),
        ("weapon_code", "Weapon_Used_Code"),
        ("lat", "Latitude"),
        ("lon", "Longitude"),
    )
}

# ============================================================
# SILVER TABLE (DLT)
# ============================================================

@dlt.table(
    name="lapd.silver.lapd_silver",
    comment="Cleansed + Validated LAPD Silver Layer",
    table_properties={"quality": "silver"}
)
@dlt.expect_all_or_drop(expectations)
def lapd_silver():
    """Build the silver LAPD dataset from the bronze table.

    Reads ``lapd.bronze.lapd_bronze``, replaces missing or invalid values
    with placeholders, and then converts the date, time, code and coordinate
    columns to their target types. Each step overwrites its column in place
    with ``withColumn``; the cleaning steps run before the casting steps.

    NOTE(review): presumably the NOT NULL expectations are evaluated on the
    frame returned here, i.e. after the placeholder fills. If so, the checks
    on the filled columns could only fail when a later cast yields NULL.
    Confirm this is intended.
    """

    def is_blank(field):
        # True for SQL NULL or an empty string.
        return field.isNull() | (field == "")

    def substitute(column_name, is_invalid, placeholder):
        # Use `placeholder` where `is_invalid` holds; keep the value otherwise.
        field = col(column_name)
        return when(is_invalid(field), placeholder).otherwise(field)

    silver = dlt.read("lapd.bronze.lapd_bronze")

    # ------------------------------------------------------------
    # Cleaning rules, applied in the listed order.
    # ------------------------------------------------------------
    cleaning_steps = [
        # Victim_Sex: empty, 'H', '-' or NULL -> "X"
        (
            "Victim_Sex",
            substitute(
                "Victim_Sex",
                lambda field: field.isin("", "H", "-") | field.isNull(),
                "X",
            ),
        ),
        # Victim_Descent: empty, '-' or NULL -> "X"
        (
            "Victim_Descent",
            substitute(
                "Victim_Descent",
                lambda field: field.isin("", "-") | field.isNull(),
                "X",
            ),
        ),
        # Victim_Age: zero, negative or NULL -> 99999
        (
            "Victim_Age",
            substitute(
                "Victim_Age",
                lambda field: (field <= 0) | field.isNull(),
                lit(99999),
            ),
        ),
        # Premis_Code: NULL or empty -> -1
        ("Premis_Code", substitute("Premis_Code", is_blank, lit(-1))),
        # Premis_Description: NULL or empty -> "Unknown", then drop every "*"
        (
            "Premis_Description",
            regexp_replace(
                substitute("Premis_Description", is_blank, "Unknown"),
                "\\*",
                "",
            ),
        ),
        # Weapon_Used_Code: NULL or empty -> -1
        ("Weapon_Used_Code", substitute("Weapon_Used_Code", is_blank, lit(-1))),
        # Free-text columns: NULL or empty -> "Unknown"
        ("Weapon_Desc", substitute("Weapon_Desc", is_blank, "Unknown")),
        ("Arrest_Status", substitute("Arrest_Status", is_blank, "Unknown")),
        ("Cross_Street", substitute("Cross_Street", is_blank, "Unknown")),
        # Coordinates: a value of 0 is replaced by a sentinel.
        ("Latitude", substitute("Latitude", lambda field: field == 0, lit(99999))),
        ("Longitude", substitute("Longitude", lambda field: field == 0, lit(-99999))),
    ]

    # ------------------------------------------------------------
    # Datatype casting, applied after all cleaning steps.
    # ------------------------------------------------------------
    casting_steps = [
        ("Date_Reported", to_date("Date_Reported")),
        ("Date_Occured", to_date("Date_Occured")),
        # Parsed with the HH:mm:ss pattern and formatted back with the same pattern.
        (
            "Time_Occured",
            date_format(to_timestamp("Time_Occured", "HH:mm:ss"), "HH:mm:ss"),
        ),
    ]
    casting_steps += [
        (column_name, col(column_name).cast("INT"))
        for column_name in (
            "Area",
            "Reported_District_No",
            "Part_1_2",
            "Crime_Code",
            "Victim_Age",
            "Premis_Code",
            "Weapon_Used_Code",
        )
    ]
    casting_steps += [
        (column_name, col(column_name).cast("DOUBLE"))
        for column_name in ("Latitude", "Longitude")
    ]

    for column_name, expression in cleaning_steps + casting_steps:
        silver = silver.withColumn(column_name, expression)

    return silver
