In [1]:
from datetime import datetime

import polars as pl

import udfs

#### TRANSFORMATION FUNCTIONS ##
- Provides transformations specific to the staging layer's tables
- Load and transform customer, product, shipping-type and orders data from raw parquet.
- Operations:
    - Clean and standardize fields and flags
    - Replace null/empty/invalid values with defaults (e.g., 'Unknown').
    - Deduplicate by table's 'id' column.
    - Add load timestamp.

In [None]:
# ------------------ Function to Load Customer Table ------------------
def load_customer_table(raw_path: str) -> pl.DataFrame:
    """Build the staging customer table from a raw parquet file.

    Steps, in order:
      1. Read ``raw_path`` and drop rows whose ``customer_id`` is null or ''.
      2. Standardize every column (details in the inline comments).
      3. Keep one row per ``customer_id`` -- the last row of each group.
      4. Rename ``postal code`` to ``postal_code`` and select the final
         column order.

    Args:
        raw_path: Path of the raw customer parquet file. The columns are
            compared against '' and parsed with ``.str`` methods, so they
            are expected to be string-typed.

    Returns:
        A DataFrame with the columns customer_id, signup_date, gender,
        customer_dob, customer_name, marital_status, email, phone,
        customer_type, account_status, country, state, city, postal_code,
        region and load_timestamp, in that order. Row order is whatever
        ``group_by`` produces.
    """

    def _clean_or_unknown(name: str) -> pl.Expr:
        # Null or '' becomes 'Unknown'; anything else goes through udfs.clean.
        col = pl.col(name)
        return (
            pl.when(col.is_null() | (col == ""))
            .then(pl.lit("Unknown"))
            .otherwise(udfs.clean(col))
            .alias(name)
        )

    def _label_by_prefix(name: str, labels: dict) -> pl.Expr:
        # Map a value to a canonical label by its lower-cased first letter.
        # Null/'' becomes 'Unknown'; unmatched values go through udfs.clean.
        # The blank check is mutually exclusive with every prefix check, so
        # its position in the chain does not affect the result.
        col = pl.col(name)
        lowered = col.str.to_lowercase()
        chain = pl.when(col.is_null() | (col == "")).then(pl.lit("Unknown"))
        for prefix, label in labels.items():
            chain = chain.when(lowered.str.starts_with(prefix)).then(pl.lit(label))
        return chain.otherwise(udfs.clean(col)).alias(name)

    customer_id = pl.col("customer_id")
    phone = pl.col("phone")

    # Rows without a usable key cannot be deduplicated or joined later.
    raw = pl.read_parquet(raw_path).filter(
        customer_id.is_not_null() & (customer_id != "")
    )

    staged = raw.with_columns([
        # --- customer_id: cleaned, then numeric ---
        udfs.clean(customer_id).cast(pl.Int64).alias("customer_id"),

        # --- signup_date: unparsable/missing dates fall back to 1900-01-01 ---
        pl.col("signup_date")
          .str.strptime(pl.Datetime, "%Y-%m-%d", strict=False)
          .fill_null(datetime(1900, 1, 1))
          .alias("signup_date"),

        # --- customer_dob: unparsable/missing dates stay null ---
        pl.col("customer_dob")
          .str.strptime(pl.Date, "%Y-%m-%d", strict=False)
          .alias("customer_dob"),

        # --- customer_name / email ---
        udfs.clean(pl.col("customer_name")).alias("customer_name"),
        udfs.clean_validate_email(pl.col("email")).alias("email"),

        # --- phone: digits only, stored as an integer; blank -> null ---
        # The cast is non-strict so that a value which does not parse as an
        # integer becomes null instead of aborting the whole load.
        # NOTE(review): presumably udfs.remove_non_numeric can return '' for
        # a phone that contains no digits -- confirm against udfs.
        pl.when((phone == "") | phone.is_null())
          .then(None)
          .otherwise(udfs.remove_non_numeric(phone))
          .cast(pl.Int64, strict=False)
          .alias("phone"),

        # --- categorical columns recognised by their first letter ---
        _label_by_prefix("gender", {"m": "Male", "f": "Female", "o": "Other"}),
        _label_by_prefix("marital_status", {"s": "Single", "m": "Married"}),
        _label_by_prefix("customer_type", {"p": "Prime", "n": "Non-prime"}),

        # --- free-text columns: blank -> 'Unknown' ---
        # 'postal code' keeps its raw name here and is renamed further down.
        *[
            _clean_or_unknown(name)
            for name in (
                "account_status",
                "region",
                "country",
                "state",
                "city",
                "postal code",
            )
        ],

        # --- load_timestamp: one wall-clock value for the whole batch ---
        pl.lit(datetime.now()).alias("load_timestamp"),
    ])

    # Deduplicate like ROW_NUMBER ... PARTITION BY customer_id, keeping the
    # LAST row of each customer_id group.
    staged = staged.group_by("customer_id").agg(pl.all().last())

    staged = staged.rename({"postal code": "postal_code"})

    return staged.select([
        "customer_id",
        "signup_date",
        "gender",
        "customer_dob",
        "customer_name",
        "marital_status",
        "email",
        "phone",
        "customer_type",
        "account_status",
        "country",
        "state",
        "city",
        "postal_code",
        "region",
        "load_timestamp",
    ])


In [None]:
# ------------------ Function to Load Product Table ------------------
def load_product_table(raw_path: str) -> pl.DataFrame:
    """Build the staging product table from a raw parquet file.

    Rows whose ``product_id`` is null or '' are dropped, every column is
    standardized, one row is kept per ``product_id`` (the first row of each
    group) and ``no of ratings`` is renamed to ``no_of_ratings``.

    Args:
        raw_path: Path of the raw product parquet file. Columns are compared
            against '' and filled with the string "0", so they are expected
            to be string-typed.

    Returns:
        The deduplicated DataFrame: ``product_id`` first, then the remaining
        input columns, with ``load_timestamp`` added.
    """
    key = "product_id"
    key_col = pl.col(key)

    def _is_blank(name: str) -> pl.Expr:
        # True where the column is null or the empty string.
        return pl.col(name).is_null() | (pl.col(name) == "")

    def _clean_or_unknown(name: str) -> pl.Expr:
        # Blank -> 'Unknown'; everything else goes through udfs.clean.
        return (
            pl.when(_is_blank(name))
            .then(pl.lit("Unknown"))
            .otherwise(udfs.clean(pl.col(name)))
            .alias(name)
        )

    def _as_float(name: str) -> pl.Expr:
        # Treat null and '' as "0", then parse the text as a float.
        return pl.col(name).fill_null("0").replace("", "0").cast(pl.Float64)

    descriptive_columns = (
        "brand_tier",
        "brand_name",
        "brand_country",
        "main_category",
        "sub_category",
    )
    decimal_columns = ("discount_percent", "actual_price", "rating")

    # Only rows with a usable product_id are staged.
    bronze = pl.read_parquet(raw_path)
    bronze = bronze.filter(key_col.is_not_null() & (key_col != ""))

    expressions = [
        # --- product_id ---
        udfs.clean(key_col).alias(key),

        # --- product_name: blank -> 'Unknown', otherwise kept verbatim ---
        pl.when(_is_blank("product_name"))
          .then(pl.lit("Unknown"))
          .otherwise(pl.col("product_name"))
          .alias("product_name"),
    ]

    # --- brand / category text columns ---
    expressions += [_clean_or_unknown(name) for name in descriptive_columns]

    # --- numeric columns: never negative, rounded to two decimals ---
    expressions += [
        _as_float(name).clip(lower_bound=0.0).round(2).alias(name)
        for name in decimal_columns
    ]

    # --- no of ratings: parsed via float, stored as a non-negative integer ---
    expressions.append(
        _as_float("no of ratings")
        .cast(pl.Int64)
        .clip(lower_bound=0)
        .alias("no of ratings")
    )

    # --- load_timestamp ---
    expressions.append(pl.lit(datetime.now()).alias("load_timestamp"))

    staged = bronze.with_columns(expressions)

    # Deduplicate like ROW_NUMBER ... PARTITION BY product_id, keeping the
    # first row of each product_id group.
    deduplicated = staged.group_by(key).agg(pl.all().first())

    return deduplicated.rename({"no of ratings": "no_of_ratings"})


In [None]:
# ------------------ Function to Load Shipping-Type Table ------------------
def load_shipping_type_table(raw_path: str) -> pl.DataFrame:
    """Build the staging shipping-type table from a raw parquet file.

    Rows whose ``shipping_type_id`` is null or '' are dropped, the text
    columns are standardized, a ``load_timestamp`` is added and one row is
    kept per ``shipping_type_id`` (the last row of each group).

    Args:
        raw_path: Path of the raw shipping-type parquet file. Columns are
            compared against '', so they are expected to be string-typed.

    Returns:
        The deduplicated DataFrame with ``shipping_type_id`` as first column.
    """
    key = "shipping_type_id"
    key_col = pl.col(key)

    def _clean_or_unknown(name: str) -> pl.Expr:
        # Null or '' becomes 'Unknown'; anything else goes through udfs.clean.
        col = pl.col(name)
        return (
            pl.when(col.is_null() | (col == ""))
            .then(pl.lit("Unknown"))
            .otherwise(udfs.clean(col))
            .alias(name)
        )

    # Only rows with a usable shipping_type_id are staged.
    bronze = pl.read_parquet(raw_path)
    bronze = bronze.filter(key_col.is_not_null() & (key_col != ""))

    staged = bronze.with_columns([
        # --- shipping_type_id ---
        udfs.clean(key_col).alias(key),

        # --- shipping_type / delivery_estimate ---
        *[
            _clean_or_unknown(name)
            for name in ("shipping_type", "delivery_estimate")
        ],

        # --- load_timestamp ---
        pl.lit(datetime.now()).alias("load_timestamp"),
    ])

    # Deduplicate like ROW_NUMBER ... PARTITION BY shipping_type_id, keeping
    # the LAST row of each shipping_type_id group.
    return staged.group_by(key).agg(pl.all().last())


In [None]:
# ------------------ Function to Load Orders Table ------------------
def load_orders_table(raw_path: str) -> pl.DataFrame:
    """Build the staging orders table from a raw parquet file.

    Steps, in order:
      1. Read ``raw_path`` and drop rows whose ``orders_id`` is null or ''.
      2. Standardize every column (details in the inline comments).
      3. Keep one row per (``orders_id``, ``product_id``) pair.
      4. Rename ``payment_src`` to ``payment_source``.

    ``quantity`` and ``unit_price`` are parsed with a non-strict cast. Polars
    evaluates every branch of ``when/then/otherwise`` on all rows, so a
    strict cast inside a branch raises on '' even when another condition
    would have masked that row. With the non-strict cast, blank, unparsable
    and negative values all become 0 / 0.0.

    Args:
        raw_path: Path of the raw orders parquet file. Text columns are
            compared against '' and parsed with ``.str`` methods, so they
            are expected to be string-typed.

    Returns:
        The deduplicated DataFrame with ``load_timestamp`` added and
        ``payment_src`` renamed to ``payment_source``.
    """

    def _clean_or_unknown(name: str) -> pl.Expr:
        # Null or '' becomes 'Unknown'; anything else goes through udfs.clean.
        col = pl.col(name)
        return (
            pl.when(col.is_null() | (col == ""))
            .then(pl.lit("Unknown"))
            .otherwise(udfs.clean(col))
            .alias(name)
        )

    def _parse_datetime(name: str, default=None) -> pl.Expr:
        # Parse text as a datetime (format inferred by polars). Unparsable
        # values become null, or `default` when one is given.
        parsed = pl.col(name).str.strptime(pl.Datetime, strict=False)
        if default is not None:
            parsed = parsed.fill_null(default)
        return parsed.alias(name)

    def _non_negative(name: str, dtype, zero) -> pl.Expr:
        # Parse as `dtype`; null, blank, unparsable and negative values all
        # collapse to `zero`. The cast is non-strict because it is evaluated
        # on every row, including the ones the condition replaces.
        parsed = pl.col(name).cast(dtype, strict=False)
        return (
            pl.when(parsed.is_null() | (parsed < 0))
            .then(pl.lit(zero))
            .otherwise(parsed)
            .alias(name)
        )

    def _flag_with_value(flag: str, value: str) -> list:
        # The flag only counts when it equals 1 AND its companion text is
        # present; otherwise the flag is 0 and the companion text is ''.
        # Both expressions read the raw columns: with_columns evaluates all
        # of its expressions against the input frame.
        is_set = (
            (pl.col(flag).fill_null("0").replace("", "0").cast(pl.Int64) == 1)
            & pl.col(value).is_not_null()
            & (pl.col(value) != "")
        )
        return [
            pl.when(is_set).then(pl.lit(1)).otherwise(pl.lit(0)).alias(flag),
            pl.when(is_set).then(pl.col(value)).otherwise(pl.lit("")).alias(value),
        ]

    orders_id = pl.col("orders_id")
    epoch_default = datetime(1900, 1, 1)

    # Only rows with a usable orders_id are staged.
    raw = pl.read_parquet(raw_path).filter(
        orders_id.is_not_null() & (orders_id != "")
    )

    staged = raw.with_columns([
        # --- orders_id / customer_id / product_id ---
        udfs.clean(orders_id).alias("orders_id"),
        udfs.clean(pl.col("customer_id")).cast(pl.Int64).alias("customer_id"),
        udfs.clean(pl.col("product_id")).alias("product_id"),

        # --- shipping_type / payment_src / lead_type / order_status ---
        *[
            _clean_or_unknown(name)
            for name in ("shipping_type", "payment_src", "lead_type", "order_status")
        ],

        # --- dates: order/shipping default to 1900-01-01, others stay null ---
        _parse_datetime("order_date", epoch_default),
        _parse_datetime("shipping_date", epoch_default),
        _parse_datetime("expected_delivery_date"),
        _parse_datetime("delivery_date"),
        _parse_datetime("return_date"),
        _parse_datetime("refund_date"),

        # --- quantity / unit_price ---
        _non_negative("quantity", pl.Int64, 0),
        _non_negative("unit_price", pl.Float64, 0.0),

        # --- is_gift + gift_message / has_coupon + coupon_code ---
        *_flag_with_value("is_gift", "gift_message"),
        *_flag_with_value("has_coupon", "coupon_code"),

        # --- load_timestamp: one wall-clock value for the whole batch ---
        pl.lit(datetime.now()).alias("load_timestamp"),
    ])

    # Deduplicate like ROW_NUMBER ... PARTITION BY (orders_id, product_id),
    # keeping the first row of each pair after the descending sort.
    # NOTE(review): load_timestamp is the same literal on every row, so it
    # does not break ties between duplicates; which duplicate survives
    # depends on how the sort orders equal keys.
    staged = (
        staged
        .sort(["orders_id", "product_id", "load_timestamp"], descending=True)
        .unique(subset=["orders_id", "product_id"], keep="first")
    )

    return staged.rename({"payment_src": "payment_source"})
