# GOLD LAYER

### Necessary Imports

In [8]:
import os
import logging
import importlib
from pathlib import Path
from datetime import date, timedelta, datetime
from dotenv import load_dotenv

import polars as pl

import config


importlib.reload(config)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv(override=True) 

True

In [9]:
%run ./etl_runner.ipynb

In [10]:
PARQUET_FILES_DIR = Path(os.getenv("PARQUET_FILES_DIR"))

gold_parquet = PARQUET_FILES_DIR/'gold_layer'
gold_parquet.mkdir(parents=True, exist_ok=True)

### Delete all saved parquet files and cache files of the Gold layer

In [None]:
#### ------- Functionality to delete the parquet files - when needed -------

# for file in gold_parquet.glob("*.parquet"):
#     logger.info(f"Deleting: {file}")
#     os.remove(file)

# _parquet_cache.clear()

### Generate the 'Date' table

In [12]:
def load_dim_date(start_date: str = "1954-08-18", end_date: str = "2025-07-30") -> pl.DataFrame:
    # generate date range
    start = datetime.strptime(start_date, "%Y-%m-%d").date()
    end = datetime.strptime(end_date, "%Y-%m-%d").date()
    
    num_days = (end - start).days + 1
    date_list = [start + timedelta(days=i) for i in range(num_days)]
    
    df = pl.DataFrame({"date": date_list})
    
    df = (
        df.with_columns([
            pl.col("date").dt.year().alias("year"),
            pl.col("date").dt.month().alias("month"),
            pl.col("date").dt.day().alias("day"),
            pl.col("date").dt.weekday().alias("weekday"),   # 0=Mon, 6=Sun
            pl.col("date").dt.strftime("%a").alias("weekday_name"),
            pl.when(pl.col("date").dt.weekday().is_in([5, 6]))
              .then(pl.lit(1)).otherwise(pl.lit(0))
              .alias("is_weekend"),
            ("Q" + pl.col("date").dt.quarter().cast(pl.Utf8)).alias("fiscal_quarter")
        ])
    )
    
    # ensure data types
    df = df.cast({
        "date": pl.Date,
        "year": pl.Int32,
        "month": pl.Int32,
        "day": pl.Int32,
        "weekday": pl.Int32,
        "weekday_name": pl.Utf8,
        "is_weekend": pl.Int32,
        "fiscal_quarter": pl.Utf8,
    })
    
    # add row for 'unknown' date
    df_unknown = pl.DataFrame({
        "date": [date(1900,1,1)],
        "year": [0],
        "month": [0],
        "day": [0],
        "weekday": [0],
        "weekday_name": ["Unknown"],
        "is_weekend": [0],
        "fiscal_quarter": ["Q0"]
    }).cast(df.schema) 
    
    
    df_final = pl.concat([df, df_unknown], how="vertical")
    
    df_final.write_parquet(gold_parquet / "gold_dim_date.parquet")
    return df_final

load_dim_date(start_date="1954-08-18", end_date="2025-07-30")

date,year,month,day,weekday,weekday_name,is_weekend,fiscal_quarter
date,i32,i32,i32,i32,str,i32,str
1954-08-18,1954,8,18,3,"""Wed""",0,"""Q3"""
1954-08-19,1954,8,19,4,"""Thu""",0,"""Q3"""
1954-08-20,1954,8,20,5,"""Fri""",1,"""Q3"""
1954-08-21,1954,8,21,6,"""Sat""",1,"""Q3"""
1954-08-22,1954,8,22,7,"""Sun""",0,"""Q3"""
…,…,…,…,…,…,…,…
2025-07-27,2025,7,27,7,"""Sun""",0,"""Q3"""
2025-07-28,2025,7,28,1,"""Mon""",0,"""Q3"""
2025-07-29,2025,7,29,2,"""Tue""",0,"""Q3"""
2025-07-30,2025,7,30,3,"""Wed""",0,"""Q3"""


### Generate other tables

In [14]:
build_and_merge_all(config.TABLE_GROUPS)

INFO:__main__:WORKING FOR INITIAL TABLE: delivery_estimate
INFO:helper_utils:# --- INSIDE MAIN ---
INFO:__main__:attr_cols : None
INFO:__main__:extra_cols : None
INFO:__main__:df_src.columns : ['shipping_type_id', 'shipping_type', 'delivery_estimate', 'load_timestamp']
INFO:__main__:df_final.columns : ['delivery_estimate_skey', 'delivery_estimate', 'is_active', 'batch_id', 'load_timestamp']
INFO:__main__:Saved delivery_estimate into DB schema gold
INFO:__main__:WORKING FOR INITIAL TABLE: shipping_type
INFO:helper_utils:# --- INSIDE MAIN ---
INFO:__main__:attr_cols : ['delivery_estimate']
INFO:__main__:extra_cols : None
INFO:__main__:df_src.columns : ['shipping_type_id', 'shipping_type', 'delivery_estimate', 'load_timestamp']
INFO:__main__:Looking for parent dim: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/gold_layer/gold_dim_delivery_estimate.parquet
INFO:__main__:df_final.columns : ['shipping_type_skey', 'shipping_type', 'delivery_estimate_skey', 'is_activ

{'delivery_estimate': shape: (5, 5)
 ┌────────────────────────┬───────────────────┬───────────┬──────────┬─────────────────────┐
 │ delivery_estimate_skey ┆ delivery_estimate ┆ is_active ┆ batch_id ┆ load_timestamp      │
 │ ---                    ┆ ---               ┆ ---       ┆ ---      ┆ ---                 │
 │ i64                    ┆ str               ┆ i32       ┆ i32      ┆ datetime[μs]        │
 ╞════════════════════════╪═══════════════════╪═══════════╪══════════╪═════════════════════╡
 │ 1                      ┆ 1-3 Days          ┆ 1         ┆ 1        ┆ 2025-09-19 16:30:43 │
 │ 2                      ┆ 3-5 Days          ┆ 1         ┆ 1        ┆ 2025-09-19 16:30:43 │
 │ 3                      ┆ 1 Day             ┆ 1         ┆ 1        ┆ 2025-09-19 16:30:43 │
 │ 4                      ┆ 3-7 Days          ┆ 1         ┆ 1        ┆ 2025-09-19 16:30:43 │
 │ 5                      ┆ Unknown           ┆ 1         ┆ 1        ┆ 2025-09-19 16:30:43 │
 └────────────────────────┴───────