# GOLD LAYER

In [5]:
import polars as pl
from datetime import date, timedelta, datetime
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

sys.path.append(os.path.abspath(".."))
import udfs as udf

In [14]:
load_dotenv(override=True) 

POLARS_DWH = Path(os.getenv("POLARS_DWH"))
print(POLARS_DWH)

staging_dir = POLARS_DWH/'staging_layer'
gold_dir = POLARS_DWH/'gold_layer'
print(staging_dir, gold_dir, sep="\n")

/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH
/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/staging_layer
/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/gold_layer


In [15]:
def load_dim_date(start_date: str = "1954-08-18", end_date: str = "2025-07-30") -> pl.DataFrame:
    # generate date range
    start = datetime.strptime(start_date, "%Y-%m-%d").date()
    end = datetime.strptime(end_date, "%Y-%m-%d").date()
    
    num_days = (end - start).days + 1
    date_list = [start + timedelta(days=i) for i in range(num_days)]
    
    df = pl.DataFrame({"date": date_list})

    df = (
        df.with_columns([
            pl.col("date").dt.year().alias("year"),
            pl.col("date").dt.month().alias("month"),
            pl.col("date").dt.day().alias("day"),
            pl.col("date").dt.weekday().alias("weekday"),   # 0=Mon, 6=Sun
            pl.col("date").dt.strftime("%A").alias("weekday_name"),
            pl.when(pl.col("date").dt.weekday().is_in([5, 6]))
              .then(pl.lit(1)).otherwise(pl.lit(0))
              .alias("is_weekend"),
            ("Q" + pl.col("date").dt.quarter().cast(pl.Utf8)).alias("fiscal_quarter")
        ])
    )

    # enforce types for base df
    df = df.cast({
        "date": pl.Date,
        "year": pl.Int32,
        "month": pl.Int32,
        "day": pl.Int32,
        "weekday": pl.Int32,
        "weekday_name": pl.Utf8,
        "is_weekend": pl.Int32,
        "fiscal_quarter": pl.Utf8,
    })

    # add special mapping row with same dtypes
    df_unknown = pl.DataFrame({
        "date": [date(1900,1,1)],
        "year": [0],
        "month": [0],
        "day": [0],
        "weekday": [0],
        "weekday_name": ["Unknown"],
        "is_weekend": [0],
        "fiscal_quarter": ["Q0"]
    }).cast(df.schema) 

    df_final = pl.concat([df, df_unknown], how="vertical")

    return df_final.sort("date")

In [16]:
df_dim_date = load_dim_date("1954-08-18", "2025-07-30")
df_dim_date

date,year,month,day,weekday,weekday_name,is_weekend,fiscal_quarter
date,i32,i32,i32,i32,str,i32,str
1900-01-01,0,0,0,0,"""Unknown""",0,"""Q0"""
1954-08-18,1954,8,18,3,"""Wednesday""",0,"""Q3"""
1954-08-19,1954,8,19,4,"""Thursday""",0,"""Q3"""
1954-08-20,1954,8,20,5,"""Friday""",1,"""Q3"""
1954-08-21,1954,8,21,6,"""Saturday""",1,"""Q3"""
…,…,…,…,…,…,…,…
2025-07-26,2025,7,26,6,"""Saturday""",1,"""Q3"""
2025-07-27,2025,7,27,7,"""Sunday""",0,"""Q3"""
2025-07-28,2025,7,28,1,"""Monday""",0,"""Q3"""
2025-07-29,2025,7,29,2,"""Tuesday""",0,"""Q3"""


In [54]:
def merge_upsert(df_src: pl.DataFrame, df_tgt: pl.DataFrame | None, key_col: str) -> pl.DataFrame:
    """
    Simulates SQL MERGE (UPSERT) in Polars:
      - If key exists in both → keep
      - If key exists only in source → insert
      - If key exists only in target → delete
    """
    df_src = df_src.select(key_col).drop_nulls().unique()

    if df_tgt is None or df_tgt.is_empty():
      print("Entered if")
      print(df_src)
      return df_src

    # 1. INSERT: new rows from source (in source only but not in target)
    new_rows = df_src.filter(~pl.col(key_col).is_in(df_tgt[key_col]))
    
    # 2. KEEP: rows that exist in both
    common = df_tgt.filter(pl.col(key_col).is_in(df_src[key_col]))

    # 3. DELETE: target-only rows are excluded
    # (automatically handled, since we don’t include target-only rows)
    
    # 4. Final merged result
    df_final = pl.concat([common, new_rows], how="vertical") \
                .unique() \
                .sort(key_col)
    return df_final


In [55]:
tables = ['customer']
        #   , 'order', 'product', 'shipping_type']
table_name = 'customer'
key_col = ['gender', 'marital_status', 'customer_type', 'account_status']

staging_paths = {}
for col in key_col:
    print(col)
    # staging_paths[f"df_{table_name}"] = staging_dir / f"{table_name}.parquet"
    src_file_name = staging_dir / f"staging_{table_name}.parquet"
    
    df_src = pl.read_parquet(src_file_name)

    # load gold table if exists
    tgt_file_name = gold_dir / f"gold_dim_{col}.parquet"
    
    if os.path.exists(tgt_file_name):
        df_tgt = pl.read_parquet(tgt_file_name)
    else:
        df_tgt = None

    # merge
    df_tgt = merge_upsert(df_src, df_tgt, col)
    print(df_tgt)    

    # save
    df_tgt.write_parquet(tgt_file_name)

gender
shape: (4, 1)
┌─────────┐
│ gender  │
│ ---     │
│ str     │
╞═════════╡
│ Female  │
│ Male    │
│ Other   │
│ Unknown │
└─────────┘
marital_status
shape: (3, 1)
┌────────────────┐
│ marital_status │
│ ---            │
│ str            │
╞════════════════╡
│ Married        │
│ Single         │
│ Unknown        │
└────────────────┘
customer_type
shape: (3, 1)
┌───────────────┐
│ customer_type │
│ ---           │
│ str           │
╞═══════════════╡
│ Non-prime     │
│ Prime         │
│ Unknown       │
└───────────────┘
account_status
shape: (4, 1)
┌────────────────┐
│ account_status │
│ ---            │
│ str            │
╞════════════════╡
│ Active         │
│ Inactive       │
│ Suspended      │
│ Unknown        │
└────────────────┘


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  new_rows = df_src.filter(~pl.col(key_col).is_in(df_tgt[key_col]))
Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  common = df_tgt.filter(pl.col(key_col).is_in(df_src[key_col]))


In [56]:
dff = pl.read_parquet(gold_dir / f"gold_dim_account_status.parquet")
dff

account_status
str
"""Active"""
"""Inactive"""
"""Suspended"""
"""Unknown"""
