In [0]:
# ============================================================
# 02_silver_transforms.py
#
# PURPOSE
# -------
# Transform Unity Catalog Bronze raw tables (all columns as STRING) into Silver tables:
#   - Clean IDs and text columns (trim, lower/upper)
#   - Safely cast numerics using try_cast (malformed -> NULL)
#   - Safely parse timestamps using to_timestamp (unparseable -> NULL)
#   - Deduplicate by business keys using latest ingested_at
#   - Write Unity Catalog managed Delta tables (saveAsTable)
#
# WHY SILVER
# ----------
# Bronze is raw and stable (no schema inference, no casting, append safe).
# Silver is curated: correct types, consistent formatting, fewer duplicates.
# ============================================================


# ----------------------------
# 0) Imports (Spark DataFrame column functions)
# ----------------------------

from pyspark.sql.functions import col  # Reference a column by name (builds an expression)
from pyspark.sql.functions import trim  # Remove leading/trailing whitespace from strings
from pyspark.sql.functions import lower  # Convert string to lowercase for consistency
from pyspark.sql.functions import upper  # Convert string to uppercase (e.g., state codes)
from pyspark.sql.functions import to_timestamp  # Convert a string into timestamp (bad parse -> NULL)
from pyspark.sql.functions import to_date  # Convert timestamp into date (for time-series grouping)
from pyspark.sql.functions import coalesce  # Pick first non-null value among expressions
from pyspark.sql.functions import current_timestamp  # Current processing time (fallback for ordering)
from pyspark.sql.functions import row_number  # Window function used for deduplication
from pyspark.sql.functions import expr  # Allows SQL expressions like try_cast(...)
from pyspark.sql.window import Window  # Defines partition/order rules for window functions


# ----------------------------
# 1) Configuration (catalog/schema + behavior)
# ----------------------------

# Source schema: the raw tables are read from here.
BRONZE_SCHEMA = "olist.bronze"

# Target schema: the curated silver tables are written here.
SILVER_SCHEMA = "olist.silver"

# Mode handed to the DataFrame writer; "overwrite" rebuilds each silver table on every run.
SILVER_WRITE_MODE = "overwrite"

# When True, the Bronze audit columns (ingested_at / source_file / source_system)
# are carried through into every silver table.
KEEP_LINEAGE_COLS = True


# ----------------------------
# 2) Create the Silver schema if it does not exist
# ----------------------------

# IF NOT EXISTS makes this statement safe to re-run.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {SILVER_SCHEMA}")


# ----------------------------
# 3) Helper: Write a managed UC Delta table
# ----------------------------

def write_silver_table(df, table_name: str):
    """
    Persist df as the Delta table <SILVER_SCHEMA>.<table_name> using saveAsTable().

    The table is addressed by name only (no LOCATION path is supplied).
    The write mode is taken from SILVER_WRITE_MODE, and overwriteSchema is
    switched on so an overwrite may replace the existing table schema.
    """

    target = f"{SILVER_SCHEMA}.{table_name}"  # Fully-qualified table name

    writer = df.write.format("delta")  # Delta Lake output
    writer = writer.mode(SILVER_WRITE_MODE)  # Mode is configured at module level
    writer = writer.option("overwriteSchema", "true")  # Let the overwrite replace the schema
    writer.saveAsTable(target)  # Create / replace the table registered under that name


# ----------------------------
# 4) Helper: Deduplicate keeping latest record by timestamp
# ----------------------------

def dedupe_latest(df, key_cols: list, ts_col: str):
    """
    Deduplicate df by key_cols, keeping the row with the greatest ts_col per key.

    Args:
        df: Input DataFrame; must contain every column named in key_cols plus ts_col.
        key_cols: Column names that together form the business key.
        ts_col: Column name used to decide which duplicate is the newest
            (callers in this file pass "ingested_at").

    Returns:
        A DataFrame with the same columns as df and exactly one row per distinct key.

    Notes:
        - Rows whose ts_col is NULL are ranked LAST. Substituting the current time
          for a missing timestamp would let a row with unknown ingestion time
          outrank every row with a real one.
        - If several rows of a key share the same ts_col value, this ordering does
          not specify which of them is kept.
    """

    # One partition per business key, newest timestamp first, NULL timestamps at the end
    newest_first = Window.partitionBy(*[col(c) for c in key_cols]).orderBy(
        col(ts_col).desc_nulls_last()
    )

    ranked = df.withColumn("_rn", row_number().over(newest_first))  # 1 = newest row of its key
    return ranked.filter(col("_rn") == 1).drop("_rn")  # Keep the winner, drop the helper column


# ============================================================
# 5) Load Bronze tables (all raw columns are strings)
# ============================================================

def _read_bronze(name):
    """Return the table <BRONZE_SCHEMA>.<name> as a DataFrame."""
    return spark.table(f"{BRONZE_SCHEMA}.{name}")


orders_raw = _read_bronze("orders_raw")  # Orders
customers_raw = _read_bronze("customers_raw")  # Customers
order_items_raw = _read_bronze("order_items_raw")  # Order items
order_payments_raw = _read_bronze("order_payments_raw")  # Payments
products_raw = _read_bronze("products_raw")  # Products
sellers_raw = _read_bronze("sellers_raw")  # Sellers
order_reviews_raw = _read_bronze("order_reviews_raw")  # Reviews
category_translation_raw = _read_bronze("category_translation_raw")  # Category translation
# geolocation_raw is intentionally not loaded: it stays bronze-only for now.


# ============================================================
# 6) SILVER: orders
# ============================================================

orders_silver = (  # Start orders transformations
    orders_raw  # Source data
    .withColumn("order_id", trim(col("order_id")))  # Clean IDs: remove whitespace
    .withColumn("customer_id", trim(col("customer_id")))  # Clean IDs
    .withColumn("order_status", lower(trim(col("order_status"))))  # Normalize status for consistent grouping

    # Parse timestamps with try_cast rather than to_timestamp(): under ANSI mode
    # to_timestamp() raises on an unparseable string, while try_cast returns NULL.
    # This matches how the order_reviews section of this file parses its timestamps.
    .withColumn("order_purchase_ts", expr("try_cast(order_purchase_timestamp as timestamp)"))  # Purchase timestamp
    .withColumn("order_approved_ts", expr("try_cast(order_approved_at as timestamp)"))  # Approved timestamp
    .withColumn("order_delivered_carrier_ts", expr("try_cast(order_delivered_carrier_date as timestamp)"))  # Carrier delivery timestamp
    .withColumn("order_delivered_customer_ts", expr("try_cast(order_delivered_customer_date as timestamp)"))  # Customer delivery timestamp
    .withColumn("order_estimated_delivery_ts", expr("try_cast(order_estimated_delivery_date as timestamp)"))  # Estimated delivery timestamp

    # Derive a date column for easy time-series analysis (daily aggregations).
    # Must come after order_purchase_ts, which it reads.
    .withColumn("order_purchase_date", to_date(col("order_purchase_ts")))
)

orders_cols = [  # Columns we keep in orders silver (fixed order = stable schema)
    "order_id",
    "customer_id",
    "order_status",
    "order_purchase_ts",
    "order_purchase_date",
    "order_approved_ts",
    "order_delivered_carrier_ts",
    "order_delivered_customer_ts",
    "order_estimated_delivery_ts",
]

if KEEP_LINEAGE_COLS:  # Keep Bronze audit columns if enabled
    orders_cols += ["ingested_at", "source_file", "source_system"]

orders_silver = orders_silver.select(*orders_cols)  # Drops the raw string timestamp columns
orders_silver = dedupe_latest(orders_silver, ["order_id"], "ingested_at")  # One row per order_id
write_silver_table(orders_silver, "orders")  # Write <SILVER_SCHEMA>.orders


# ============================================================
# 7) SILVER: customers
# ============================================================

# Bronze audit columns are appended only when lineage is enabled
_customers_lineage = ["ingested_at", "source_file", "source_system"] if KEEP_LINEAGE_COLS else []

# Output column order of the customers silver table
customers_cols = [
    "customer_id",
    "customer_unique_id",
    "customer_zip_code_prefix",
    "customer_city",
    "customer_state",
] + _customers_lineage

# Clean and project in one select; each alias reuses the source column name
customers_silver = customers_raw.select(
    trim(col("customer_id")).alias("customer_id"),  # Trim whitespace from the id
    trim(col("customer_unique_id")).alias("customer_unique_id"),  # Trim whitespace from the unique id
    trim(col("customer_zip_code_prefix")).alias("customer_zip_code_prefix"),  # Stays a string, trimmed only
    lower(trim(col("customer_city"))).alias("customer_city"),  # City in lowercase
    upper(trim(col("customer_state"))).alias("customer_state"),  # State code in uppercase
    *_customers_lineage,
)

customers_silver = dedupe_latest(customers_silver, ["customer_id"], "ingested_at")  # One row per customer_id
write_silver_table(customers_silver, "customers")  # Write <SILVER_SCHEMA>.customers


# ============================================================
# 8) SILVER: order_items
# ============================================================

order_items_silver = (
    order_items_raw
    .withColumn("order_id", trim(col("order_id")))  # Clean order_id
    .withColumn("order_item_id", expr("try_cast(order_item_id as int)"))  # Safe int cast; bad -> NULL
    .withColumn("product_id", trim(col("product_id")))  # Clean product_id
    .withColumn("seller_id", trim(col("seller_id")))  # Clean seller_id
    # try_cast instead of to_timestamp(): to_timestamp() raises under ANSI mode on an
    # unparseable string, try_cast returns NULL (same approach as the order_reviews section).
    .withColumn("shipping_limit_ts", expr("try_cast(shipping_limit_date as timestamp)"))  # Safe timestamp parse
    .withColumn("price", expr("try_cast(price as double)"))  # Safe numeric cast
    .withColumn("freight_value", expr("try_cast(freight_value as double)"))  # Safe numeric cast
)

order_items_cols = [  # Columns we keep in order_items silver (fixed order = stable schema)
    "order_id",
    "order_item_id",
    "product_id",
    "seller_id",
    "shipping_limit_ts",
    "price",
    "freight_value",
]

if KEEP_LINEAGE_COLS:  # Keep Bronze audit columns if enabled
    order_items_cols += ["ingested_at", "source_file", "source_system"]

order_items_silver = order_items_silver.select(*order_items_cols)  # Drops the raw shipping_limit_date string
# NOTE(review): order_item_id is part of the dedupe key and becomes NULL when the cast
# fails, so malformed items of one order would collapse into a single row - confirm acceptable.
order_items_silver = dedupe_latest(order_items_silver, ["order_id", "order_item_id"], "ingested_at")
write_silver_table(order_items_silver, "order_items")  # Write <SILVER_SCHEMA>.order_items


# ============================================================
# 9) SILVER: order_payments
# ============================================================

# Bronze audit columns are appended only when lineage is enabled
_payments_lineage = ["ingested_at", "source_file", "source_system"] if KEEP_LINEAGE_COLS else []

# Output column order of the order_payments silver table
payments_cols = [
    "order_id",
    "payment_sequential",
    "payment_type",
    "payment_installments",
    "payment_value",
] + _payments_lineage

# Clean, cast and project in one select; try_cast yields NULL for malformed numbers
order_payments_silver = order_payments_raw.select(
    trim(col("order_id")).alias("order_id"),  # Trim whitespace from the id
    expr("try_cast(payment_sequential as int)").alias("payment_sequential"),  # int or NULL
    lower(trim(col("payment_type"))).alias("payment_type"),  # Payment type in lowercase
    expr("try_cast(payment_installments as int)").alias("payment_installments"),  # int or NULL
    expr("try_cast(payment_value as double)").alias("payment_value"),  # double or NULL
    *_payments_lineage,
)

# One row per (order_id, payment_sequential), newest ingestion wins
order_payments_silver = dedupe_latest(order_payments_silver, ["order_id", "payment_sequential"], "ingested_at")
write_silver_table(order_payments_silver, "order_payments")  # Write <SILVER_SCHEMA>.order_payments


# ============================================================
# 10) SILVER: products
# ============================================================

# Columns cast to int. The misspelled "lenght" names are kept exactly as they
# appear in the source columns so downstream names do not change.
_product_int_cols = [
    "product_name_lenght",
    "product_description_lenght",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]

# Bronze audit columns are appended only when lineage is enabled
_products_lineage = ["ingested_at", "source_file", "source_system"] if KEEP_LINEAGE_COLS else []

# Output column order of the products silver table
products_cols = ["product_id", "product_category_name"] + _product_int_cols + _products_lineage

products_silver = products_raw.select(
    trim(col("product_id")).alias("product_id"),  # Trim whitespace from the id
    lower(trim(col("product_category_name"))).alias("product_category_name"),  # Category in lowercase
    # try_cast yields NULL (not an error) when a value is not a valid int
    *[expr(f"try_cast({c} as int)").alias(c) for c in _product_int_cols],
    *_products_lineage,
)

products_silver = dedupe_latest(products_silver, ["product_id"], "ingested_at")  # One row per product_id
write_silver_table(products_silver, "products")  # Write <SILVER_SCHEMA>.products


# ============================================================
# 11) SILVER: sellers
# ============================================================

# Output column order of the sellers silver table
sellers_cols = ["seller_id", "seller_zip_code_prefix", "seller_city", "seller_state"]
if KEEP_LINEAGE_COLS:
    sellers_cols.extend(["ingested_at", "source_file", "source_system"])  # Bronze audit columns

# Clean one column per step
_sellers_clean = sellers_raw.withColumn("seller_id", trim(col("seller_id")))  # Trim the id
_sellers_clean = _sellers_clean.withColumn(
    "seller_zip_code_prefix", trim(col("seller_zip_code_prefix"))  # Stays a string, trimmed only
)
_sellers_clean = _sellers_clean.withColumn("seller_city", lower(trim(col("seller_city"))))  # City in lowercase
_sellers_clean = _sellers_clean.withColumn("seller_state", upper(trim(col("seller_state"))))  # State in uppercase

# Project to the final columns, keep one row per seller_id, then persist
sellers_silver = dedupe_latest(_sellers_clean.select(*sellers_cols), ["seller_id"], "ingested_at")
write_silver_table(sellers_silver, "sellers")  # Write <SILVER_SCHEMA>.sellers


# ============================================================
# 12) SILVER: order_reviews (STRICT-CAST SAFE VERSION, safe casting with try_cast)
# Why try_cast is used here:
#   - Some rows contain free text in a field we try to interpret as a timestamp
#   - Under Photon/ANSI behavior, strict casts can throw errors
#   - try_cast(...) returns NULL instead of failing the job
# ============================================================

# Bronze audit columns are appended only when lineage is enabled, so each
# silver row can be traced back to its file and ingestion time
_reviews_lineage = ["ingested_at", "source_file", "source_system"] if KEEP_LINEAGE_COLS else []

# Exact output columns, in order (a fixed list keeps the schema stable)
reviews_cols = [
    "review_id",  # Review key
    "order_id",  # FK to orders
    "review_score",  # Score (int or NULL)
    "review_comment_title",  # Title (string)
    "review_comment_message",  # Message (string)
    "review_creation_ts",  # Creation timestamp (timestamp or NULL)
    "review_answer_ts",  # Answer timestamp (timestamp or NULL)
] + _reviews_lineage

# Build the order_reviews silver DataFrame straight from the bronze raw table.
# Selecting explicit expressions also keeps any other raw column out of the result.
order_reviews_silver = order_reviews_raw.select(
    # Ids: trimmed, explicitly typed as string
    trim(col("review_id")).cast("string").alias("review_id"),
    trim(col("order_id")).cast("string").alias("order_id"),

    # Score: try_cast gives NULL for malformed values instead of failing
    expr("try_cast(review_score as int)").alias("review_score"),

    # Free-text comment fields stay strings and are not otherwise transformed
    col("review_comment_title").cast("string").alias("review_comment_title"),
    col("review_comment_message").cast("string").alias("review_comment_message"),

    # Timestamps: try_cast tolerates malformed input (returns NULL)
    expr("try_cast(review_creation_date as timestamp)").alias("review_creation_ts"),
    expr("try_cast(review_answer_timestamp as timestamp)").alias("review_answer_ts"),

    *_reviews_lineage,
)

# Deduplicate by review_id, keeping the latest ingested record.
# NOTE(review): confirm review_id alone is unique in the source data; if one review
# can be attached to several orders, order_id would need to be part of the key.
order_reviews_silver = dedupe_latest(order_reviews_silver, ["review_id"], "ingested_at")

# Write the silver table
write_silver_table(order_reviews_silver, "order_reviews")


# ============================================================
# 13) SILVER: category_translation
# ============================================================

# Bronze audit columns are appended only when lineage is enabled
_translation_lineage = ["ingested_at", "source_file", "source_system"] if KEEP_LINEAGE_COLS else []

# Output column order of the category_translation silver table
translation_cols = [
    "product_category_name",
    "product_category_name_english",
] + _translation_lineage

# Both labels are trimmed and lowercased
category_translation_silver = category_translation_raw.select(
    lower(trim(col("product_category_name"))).alias("product_category_name"),  # Portuguese category
    lower(trim(col("product_category_name_english"))).alias("product_category_name_english"),  # English label
    *_translation_lineage,
)

# One row per Portuguese category name, newest ingestion wins
category_translation_silver = dedupe_latest(category_translation_silver, ["product_category_name"], "ingested_at")
write_silver_table(category_translation_silver, "category_translation")  # Write <SILVER_SCHEMA>.category_translation


# ============================================================
# 14) Validation: Row counts for Silver tables
# ============================================================

# Names of the silver tables written earlier in this script
silver_tables = [
    "orders",
    "customers",
    "order_items",
    "order_payments",
    "products",
    "sellers",
    "order_reviews",
    "category_translation",
]

# One single-row result (table_name, row_count) is displayed per table
for t in silver_tables:
    fq_table = f"{SILVER_SCHEMA}.{t}"  # Fully-qualified table name
    count_sql = f"SELECT '{fq_table}' AS table_name, COUNT(*) AS row_count FROM {fq_table}"
    display(spark.sql(count_sql))


# ============================================================
# 15) Quick function explanations (brief, practical)
# ============================================================
#
# trim(col("x"))
#   - Removes spaces at the start/end of strings (prevents join keys from mismatching).
#
# lower(col("x")) / upper(col("x"))
#   - Normalizes text to consistent case so grouping and filtering behave consistently.
#
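# to_timestamp(col("x"))
#   - Attempts to parse a string into a timestamp.
#   - If parsing fails, Spark returns NULL only when ANSI mode is off; with ANSI mode
#     on, the failure raises an error instead. expr("try_cast(x as timestamp)")
#     returns NULL in both cases.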
#
# expr("try_cast(x as int)")
#   - Safely casts string column x to int.
#   - If x is malformed (e.g., '2018-04-01 00:27:51'), returns NULL instead of error.
#
# Window + row_number()
#   - Used to deduplicate data:
#     partitionBy(keys) groups rows by business key
#     orderBy(latest ingested_at) chooses newest record
#     row_number == 1 keeps only one row per key
#
# saveAsTable()
#   - Writes a managed Unity Catalog table.
#   - You do NOT supply a storage path (UC handles it).
# ============================================================


In [0]:
# ============================================================
# 12A) DATA QUALITY CHECKS: order_reviews (NON-BLOCKING)
# ============================================================

# Reload the silver table we just wrote (ensures we validate persisted data)
order_reviews_silver_df = spark.table("olist.silver.order_reviews")

# ----------------------------
# 1-3) All metrics in ONE aggregation
# ----------------------------
# A single selectExpr scans the table once instead of running four separate
# count() jobs. A global aggregate always yields exactly one row, so first()
# is safe even when the table is empty (every metric is then 0).
#
# A NULL timestamp means try_cast could not parse the raw value OR the raw
# value was already missing; this check cannot tell the two apart.
# Valid review scores are 1 to 5; NULL (failed try_cast) counts as invalid.
_dq_row = order_reviews_silver_df.selectExpr(
    "count(*) AS total_reviews",
    "count_if(review_creation_ts IS NULL) AS bad_creation_ts",
    "count_if(review_answer_ts IS NULL) AS bad_answer_ts",
    "count_if(review_score IS NULL OR review_score < 1 OR review_score > 5) AS invalid_review_scores",
).first()

total_reviews = _dq_row["total_reviews"]  # Total number of review records
bad_creation_ts = _dq_row["bad_creation_ts"]  # Rows with NULL creation timestamp
bad_answer_ts = _dq_row["bad_answer_ts"]  # Rows with NULL answer timestamp
invalid_review_scores = _dq_row["invalid_review_scores"]  # Rows with NULL / out-of-range score

# ----------------------------
# 4) Print DQ summary (human-readable, non-blocking: nothing is raised)
# ----------------------------
# The range label uses a plain ASCII hyphen: the original en dash had been
# corrupted into mojibake by an encoding round-trip.

print("========== DATA QUALITY REPORT: olist.silver.order_reviews ==========")
print(f"Total rows                  : {total_reviews}")
print(f"Bad creation timestamps     : {bad_creation_ts}")
print(f"Bad answer timestamps       : {bad_answer_ts}")
print(f"Invalid review scores (1-5) : {invalid_review_scores}")
print("=====================================================================")


In [0]:
# Show up to 20 reviews whose creation timestamp is NULL, with the comment text
# alongside so the offending rows can be inspected by eye.
_null_creation_rows = order_reviews_silver_df.filter(col("review_creation_ts").isNull())
_null_creation_sample = _null_creation_rows.select(
    "review_id", "review_comment_message", "review_creation_ts"
).limit(20)
display(_null_creation_sample)
