# STAGING LAYER

In [143]:
import polars as pl
from datetime import datetime
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

sys.path.append(os.path.abspath(".."))
import udfs as udf

In [118]:
# Small in-memory sample with deliberately messy values, used below to
# smoke-test the cleaning UDFs before they are applied to real tables.
sample_rows = {
    "name": ["john doe", "JANE_doe!!"],
    "phone": ["123-45@abc", "98_76*"],
    "email": ["JOHN..DOE@@mail.comm", "valid.email@example.commm"],
    "date": ["2023-01-05", "invalid"],
}
df = pl.DataFrame(sample_rows)
df

name,phone,email,date
str,str,str,str
"""john doe""","""123-45@abc""","""JOHN..DOE@@mail.comm""","""2023-01-05"""
"""JANE_doe!!""","""98_76*""","""valid.email@example.commm""","""invalid"""


In [119]:
# Apply one cleaning UDF to each sample column and print the result so the
# effect of every UDF can be checked by eye.
cleaning_exprs = [
    udf.clean(pl.col("name")),
    udf.remove_special_chars(pl.col("phone")),
    udf.clean_validate_email(pl.col("email")),
    udf.date_to_int(pl.col("date")),
]
df_clean = df.with_columns(cleaning_exprs)
print(df_clean)

shape: (2, 4)
┌──────────┬───────────┬─────────────────────────┬─────────┐
│ name     ┆ phone     ┆ email                   ┆ date    │
│ ---      ┆ ---       ┆ ---                     ┆ ---     │
│ str      ┆ str       ┆ str                     ┆ i64     │
╞══════════╪═══════════╪═════════════════════════╪═════════╡
│ John Doe ┆ 123-45abc ┆ null                    ┆ 5012023 │
│ Jane_Doe ┆ 98_76     ┆ valid.email@example.com ┆ null    │
└──────────┴───────────┴─────────────────────────┴─────────┘


In [None]:
# Load variables from the .env file; override=True asks python-dotenv to
# replace values that are already present in the process environment.
load_dotenv(override=True)

# Fail fast with a readable message: Path(None) would raise an opaque
# TypeError, and an empty value would silently resolve to the current directory.
dwh_root = os.getenv("POLARS_DWH")
if not dwh_root:
    raise RuntimeError(
        "Environment variable POLARS_DWH is not set; "
        "define it in the .env file or in the shell."
    )

POLARS_DWH = Path(dwh_root)
print(POLARS_DWH)

# Layer directories underneath the warehouse root.
bronze_dir = POLARS_DWH / "bronze_layer"
staging_dir = POLARS_DWH / "staging_layer"
print(bronze_dir, staging_dir, sep="\n")

/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH
/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/bronze_layer
/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/staging_layer
/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/staging_hist_layer


In [121]:
# Inspect the raw bronze customer columns before writing the staging transform.
bronze_customer_file = f"{bronze_dir}/bronze_customer.parquet"
df_customer = pl.read_parquet(bronze_customer_file)
df_customer.columns

['customer_id',
 'gender',
 'signup_date',
 'customer_dob',
 'customer_name',
 'marital_status',
 'customer_type',
 'account_status',
 'region',
 'country',
 'state',
 'city',
 'postal code',
 'email',
 'phone',
 'load_timestamp']

In [122]:
def load_customer_table(raw_path: str) -> pl.DataFrame:
    """Build the staging customer table from the bronze customer parquet file.

    Rows whose ``customer_id`` is null or empty are dropped, every attribute
    is standardised, and the frame is reduced to one row per ``customer_id``.

    Args:
        raw_path: Location of the bronze customer parquet file.

    Returns:
        A frame with one row per ``customer_id`` (the last row of each group),
        the ``postal code`` column renamed to ``postal_code``, columns in the
        staging order, sorted by ``customer_id``.
    """
    unknown = pl.lit("Unknown")

    def is_blank(column: str) -> pl.Expr:
        # True where the value is null or the empty string.
        return pl.col(column).is_null() | (pl.col(column) == "")

    def cleaned_or_unknown(column: str) -> pl.Expr:
        # Blank values become "Unknown"; everything else goes through udf.clean.
        return (
            pl.when(is_blank(column))
            .then(unknown)
            .otherwise(udf.clean(pl.col(column)))
            .alias(column)
        )

    def label_by_prefix(column: str, labels: dict) -> pl.Expr:
        # Map values to a canonical label by case-insensitive first letter(s),
        # tested in dict order; blanks become "Unknown" and anything else
        # falls back to udf.clean.
        lowered = pl.col(column).str.to_lowercase()
        branches = None
        for prefix, label in labels.items():
            matches = lowered.str.starts_with(prefix)
            opened = pl.when(matches) if branches is None else branches.when(matches)
            branches = opened.then(pl.lit(label))
        return (
            branches.when(is_blank(column))
            .then(unknown)
            .otherwise(udf.clean(pl.col(column)))
            .alias(column)
        )

    customer_key = pl.col("customer_id")
    valid_rows = pl.read_parquet(raw_path).filter(
        customer_key.is_not_null() & (customer_key != "")
    )

    transformed = valid_rows.with_columns(
        udf.clean(customer_key).cast(pl.Int64).alias("customer_id"),
        # Unparseable signup dates fall back to the 1900-01-01 sentinel.
        pl.col("signup_date")
        .str.strptime(pl.Datetime, "%Y-%m-%d", strict=False)
        .fill_null(datetime(1900, 1, 1))
        .alias("signup_date"),
        label_by_prefix("gender", {"m": "Male", "f": "Female", "o": "Other"}),
        # No sentinel here: the fill_null used for signup_date is not applied.
        pl.col("customer_dob")
        .str.strptime(pl.Date, "%Y-%m-%d", strict=False)
        .alias("customer_dob"),
        udf.clean(pl.col("customer_name")).alias("customer_name"),
        label_by_prefix("marital_status", {"s": "Single", "m": "Married"}),
        udf.clean_validate_email(pl.col("email")).alias("email"),
        udf.remove_special_chars(pl.col("phone")).alias("phone"),
        label_by_prefix("customer_type", {"p": "Prime", "n": "Non-prime"}),
        # NOTE: udf.translate was considered as an alternative to udf.clean
        # for "state"; udf.clean is what is applied.
        *(
            cleaned_or_unknown(name)
            for name in (
                "account_status",
                "region",
                "country",
                "state",
                "city",
                "postal code",
            )
        ),
        pl.lit(datetime.now()).alias("load_timestamp"),
    )

    # Deduplicate in the spirit of ROW_NUMBER() OVER (PARTITION BY customer_id):
    # keep the last row of each customer_id group.
    deduplicated = transformed.group_by("customer_id").agg(pl.all().last())

    output_columns = [
        "customer_id",
        "signup_date",
        "gender",
        "customer_dob",
        "customer_name",
        "marital_status",
        "email",
        "phone",
        "customer_type",
        "account_status",
        "country",
        "state",
        "city",
        "postal_code",
        "region",
        "load_timestamp",
    ]
    return (
        deduplicated.rename({"postal code": "postal_code"})
        .select(output_columns)
        .sort("customer_id")
    )

In [123]:
def load_product_table(raw_path: str) -> pl.DataFrame:
    """Build the staging product table from the bronze product parquet file.

    Rows whose ``product_id`` is null or empty are dropped. Text attributes
    that are null or empty become ``"Unknown"`` and are otherwise passed
    through ``udf.clean``. Numeric attributes that are null, empty,
    non-numeric or negative become zero.

    Args:
        raw_path: Location of the bronze product parquet file.

    Returns:
        A frame with one row per ``product_id`` (the first row of each group),
        the ``no of ratings`` column renamed to ``no_of_ratings``, sorted by
        ``product_id``.
    """

    def text_or_unknown(column: str) -> pl.Expr:
        # Blank values become "Unknown"; everything else goes through udf.clean.
        return (
            pl.when(pl.col(column).is_null() | (pl.col(column) == ""))
            .then(pl.lit("Unknown"))
            .otherwise(udf.clean(pl.col(column)))
            .alias(column)
        )

    def non_negative(column: str, dtype, default) -> pl.Expr:
        # polars evaluates every branch of when/then/otherwise for all rows,
        # so a strict cast would raise on "" or other non-numeric text even
        # when a guard condition precedes it. strict=False turns such values
        # into null, which is then replaced by the default.
        as_float = pl.col(column).cast(pl.Float64, strict=False)
        value = as_float.cast(dtype, strict=False)
        return (
            pl.when(value.is_null() | (as_float < 0))
            .then(pl.lit(default))
            .otherwise(value)
            .alias(column)
        )

    product_key = pl.col("product_id")
    valid_rows = pl.read_parquet(raw_path).filter(
        product_key.is_not_null() & (product_key != "")
    )

    df_staging = valid_rows.with_columns(
        udf.clean(product_key).alias("product_id"),
        *(
            text_or_unknown(name)
            for name in (
                "product_name",
                "brand_tier",
                "brand_name",
                "brand_country",
                "main_category",
                "sub_category",
            )
        ),
        non_negative("discount_percent", pl.Float64, 0.0),
        non_negative("actual_price", pl.Float64, 0.0),
        non_negative("rating", pl.Float64, 0.0),
        # Parsed via Float64 first, then truncated to an integer count.
        non_negative("no of ratings", pl.Int64, 0),
        pl.lit(datetime.now()).alias("load_timestamp"),
    )

    # Deduplicate in the spirit of ROW_NUMBER() OVER (PARTITION BY product_id):
    # keep the first row of each product_id group.
    df_staging = df_staging.group_by("product_id").agg(pl.all().first())

    return df_staging.rename({"no of ratings": "no_of_ratings"}).sort("product_id")


In [124]:
def load_shipping_type_table(raw_path: str) -> pl.DataFrame:
    """Build the staging shipping-type table from the bronze parquet file.

    Rows whose ``shipping_type_id`` is null or empty are dropped. The
    descriptive columns become ``"Unknown"`` when blank and are otherwise
    passed through ``udf.clean``.

    Args:
        raw_path: Location of the bronze shipping-type parquet file.

    Returns:
        A frame with one row per ``shipping_type_id`` (the last row of each
        group), sorted by ``shipping_type_id``.
    """
    shipping_key = pl.col("shipping_type_id")
    valid_rows = pl.read_parquet(raw_path).filter(
        shipping_key.is_not_null() & (shipping_key != "")
    )

    descriptive_columns = [
        pl.when(pl.col(name).is_null() | (pl.col(name) == ""))
        .then(pl.lit("Unknown"))
        .otherwise(udf.clean(pl.col(name)))
        .alias(name)
        for name in ("shipping_type", "delivery_estimate")
    ]

    staged = valid_rows.with_columns(
        udf.clean(shipping_key).alias("shipping_type_id"),
        *descriptive_columns,
        pl.lit(datetime.now()).alias("load_timestamp"),
    )

    # Deduplicate in the spirit of ROW_NUMBER() OVER (PARTITION BY
    # shipping_type_id): keep the last row of each shipping_type_id group.
    return (
        staged.group_by("shipping_type_id")
        .agg(pl.all().last())
        .sort("shipping_type_id")
    )


In [125]:
def load_order_table(raw_path: str) -> pl.DataFrame:
    """Build the staging order table from the bronze order parquet file.

    Rows whose ``order_id`` is null or empty are dropped. Text attributes that
    are null or empty become ``"Unknown"``; date columns are parsed leniently;
    ``quantity`` and ``unit_price`` that are null, empty, non-numeric or
    negative become zero. A gift/coupon flag is kept only when it equals 1 and
    its companion text (``gift_message`` / ``coupon_code``) is non-empty;
    otherwise the flag is 0 and the text is ``""``.

    Args:
        raw_path: Location of the bronze order parquet file.

    Returns:
        A frame with one row per ``order_id`` (the first row of each group),
        ``payment_src`` renamed to ``payment_source``, sorted by ``order_id``.
    """

    def text_or_unknown(column: str) -> pl.Expr:
        # Blank values become "Unknown"; everything else goes through udf.clean.
        return (
            pl.when(pl.col(column).is_null() | (pl.col(column) == ""))
            .then(pl.lit("Unknown"))
            .otherwise(udf.clean(pl.col(column)))
            .alias(column)
        )

    def parsed_datetime(column: str, fallback=None) -> pl.Expr:
        # Lenient parse (format inferred); failures become null, optionally
        # replaced by a sentinel.
        parsed = pl.col(column).str.strptime(pl.Datetime, strict=False)
        if fallback is not None:
            parsed = parsed.fill_null(fallback)
        return parsed.alias(column)

    def non_negative(column: str, dtype, default) -> pl.Expr:
        # polars evaluates every branch of when/then/otherwise for all rows,
        # so a strict cast would raise on "" or other non-numeric text even
        # when a guard condition precedes it. strict=False turns such values
        # into null, which is then replaced by the default.
        value = pl.col(column).cast(dtype, strict=False)
        return (
            pl.when(value.is_null() | (value < 0))
            .then(pl.lit(default))
            .otherwise(value)
            .alias(column)
        )

    def flag_with_payload(flag_column: str, payload_column: str) -> list:
        # The flag and its payload share one condition so they cannot disagree.
        # An unparseable flag becomes null under strict=False and is treated
        # as "not set".
        payload = pl.col(payload_column)
        is_set = (
            (pl.col(flag_column).cast(pl.Int64, strict=False) == 1)
            & payload.is_not_null()
            & (payload != "")
        )
        return [
            pl.when(is_set).then(pl.lit(1)).otherwise(pl.lit(0)).alias(flag_column),
            pl.when(is_set).then(payload).otherwise(pl.lit("")).alias(payload_column),
        ]

    order_key = pl.col("order_id")
    valid_rows = pl.read_parquet(raw_path).filter(
        order_key.is_not_null() & (order_key != "")
    )

    missing_date = datetime(1900, 1, 1)

    df_staging = valid_rows.with_columns(
        # Key columns. The customer_id cast is left strict so that a malformed
        # key fails loudly instead of silently becoming null.
        udf.clean(order_key).alias("order_id"),
        udf.clean(pl.col("customer_id")).cast(pl.Int64).alias("customer_id"),
        udf.clean(pl.col("product_id")).alias("product_id"),
        *(
            text_or_unknown(name)
            for name in ("shipping_type", "payment_src", "lead_type", "order_status")
        ),
        # Only order_date and shipping_date get the 1900-01-01 sentinel; the
        # remaining dates stay null when missing or unparseable.
        parsed_datetime("order_date", missing_date),
        parsed_datetime("shipping_date", missing_date),
        parsed_datetime("expected_delivery_date"),
        parsed_datetime("delivery_date"),
        parsed_datetime("return_date"),
        parsed_datetime("refund_date"),
        non_negative("quantity", pl.Int64, 0),
        non_negative("unit_price", pl.Float64, 0.0),
        *flag_with_payload("is_gift", "gift_message"),
        *flag_with_payload("has_coupon", "coupon_code"),
        pl.lit(datetime.now()).alias("load_timestamp"),
    )

    # Deduplicate in the spirit of ROW_NUMBER() OVER (PARTITION BY order_id):
    # keep the first row of each order_id group.
    df_staging = df_staging.group_by("order_id").agg(pl.all().first())

    return df_staging.rename({"payment_src": "payment_source"}).sort("order_id")


In [126]:
# Build all four staging frames from their bronze parquet files.
df_customer, df_product, df_shipping_type, df_order = (
    load_customer_table(f"{bronze_dir}/bronze_customer.parquet"),
    load_product_table(f"{bronze_dir}/bronze_product.parquet"),
    load_shipping_type_table(f"{bronze_dir}/bronze_shipping_type.parquet"),
    load_order_table(f"{bronze_dir}/bronze_order.parquet"),
)

In [127]:
# Print the four staging frames one after another for a visual check.
print(df_customer, df_product, df_shipping_type, df_order, sep="\n")

shape: (499, 16)
┌────────────┬────────────┬────────┬────────────┬───┬───────────┬────────────┬────────┬────────────┐
│ customer_i ┆ signup_dat ┆ gender ┆ customer_d ┆ … ┆ city      ┆ postal_cod ┆ region ┆ load_times │
│ d          ┆ e          ┆ ---    ┆ ob         ┆   ┆ ---       ┆ e          ┆ ---    ┆ tamp       │
│ ---        ┆ ---        ┆ str    ┆ ---        ┆   ┆ str       ┆ ---        ┆ str    ┆ ---        │
│ i64        ┆ datetime[μ ┆        ┆ date       ┆   ┆           ┆ str        ┆        ┆ datetime[μ │
│            ┆ s]         ┆        ┆            ┆   ┆           ┆            ┆        ┆ s]         │
╞════════════╪════════════╪════════╪════════════╪═══╪═══════════╪════════════╪════════╪════════════╡
│ 1000       ┆ 2024-01-30 ┆ Male   ┆ 2003-04-02 ┆ … ┆ Bengaluru ┆ 560001     ┆ Asia   ┆ 2025-09-01 │
│            ┆ 00:00:00   ┆        ┆            ┆   ┆           ┆            ┆        ┆ 13:05:32.2 │
│            ┆            ┆        ┆            ┆   ┆           ┆         

In [128]:
# Print each staging schema to confirm the column dtypes.
print(
    df_customer.schema,
    df_product.schema,
    df_shipping_type.schema,
    df_order.schema,
    sep="\n",
)

Schema([('customer_id', Int64), ('signup_date', Datetime(time_unit='us', time_zone=None)), ('gender', String), ('customer_dob', Date), ('customer_name', String), ('marital_status', String), ('email', String), ('phone', String), ('customer_type', String), ('account_status', String), ('country', String), ('state', String), ('city', String), ('postal_code', String), ('region', String), ('load_timestamp', Datetime(time_unit='us', time_zone=None))])
Schema([('product_id', String), ('product_name', String), ('brand_tier', String), ('brand_name', String), ('brand_country', String), ('main_category', String), ('sub_category', String), ('discount_percent', Float64), ('actual_price', Float64), ('rating', Float64), ('no_of_ratings', Int64), ('load_timestamp', Datetime(time_unit='us', time_zone=None))])
Schema([('shipping_type_id', String), ('shipping_type', String), ('delivery_estimate', String), ('load_timestamp', Datetime(time_unit='us', time_zone=None))])
Schema([('order_id', String), ('custom

In [129]:
# Persist every staging frame as a parquet file.
# NOTE(review): these paths are relative to the current working directory,
# while staging_dir is defined earlier but never used - confirm that the
# notebook is meant to run from inside the staging layer folder.
for _frame, _target in (
    (df_customer, "staging_customer.parquet"),
    (df_product, "staging_product.parquet"),
    (df_shipping_type, "staging_shipping_type.parquet"),
    (df_order, "staging_order.parquet"),
):
    _frame.write_parquet(_target)

In [130]:
# Read the shipping-type staging file back from disk and display it as a
# round-trip check of the parquet write.
df_shipping_type_parquet = pl.read_parquet("staging_shipping_type.parquet")
df_shipping_type_parquet

shipping_type_id,shipping_type,delivery_estimate,load_timestamp
str,str,str,datetime[μs]
"""Ship_0001""","""Express""","""3-5 Days""",2025-09-01 13:05:32.391948
"""Ship_0002""","""Normal""","""3-7 Days""",2025-09-01 13:05:32.391948
"""Ship_0003""","""One-Day Delivery""","""1 Day""",2025-09-01 13:05:32.391948
"""Ship_0004""","""Fast Delivery""","""1-3 Days""",2025-09-01 13:05:32.391948
