# GOLD LAYER

### Necessary Imports

In [75]:
import os
import importlib
from pathlib import Path
from datetime import date, timedelta, datetime
from dotenv import load_dotenv

import urllib
import polars as pl
from sqlalchemy import create_engine

import config
importlib.reload(config)

load_dotenv(override=True) 

True

In [76]:
# Execute the companion notebooks inside this kernel so that the names they
# define become available to the cells below.
# NOTE(review): `_parquet_cache` and `build_and_merge_all`, referenced later in
# this notebook, are not defined here and presumably come from these two
# notebooks - confirm.
%run ./helper_utils.ipynb
%run ./runner.ipynb

In [77]:
# Root directory for all parquet layers, taken from the environment
# (load_dotenv has already merged the .env file into os.environ).
_parquet_root = os.getenv("PARQUET_FILES_DIR")
if _parquet_root is None:
    # Fail with an actionable message instead of the opaque TypeError that
    # Path(None) would raise.
    raise RuntimeError(
        "Environment variable PARQUET_FILES_DIR is not set; "
        "define it in the environment or in the .env file."
    )
PARQUET_FILES_DIR = Path(_parquet_root)

# Gold-layer output directory; created on first run, untouched if it exists.
gold_parquet = PARQUET_FILES_DIR / "gold_layer"
gold_parquet.mkdir(parents=True, exist_ok=True)

### Set up the DB connection

In [78]:
# Credentials must not be stored in the notebook: they are read from the
# environment (populated from .env by load_dotenv). Non-secret settings keep
# their previous values as defaults so existing setups continue to work.
# NOTE(review): the password that used to be hardcoded here has been exposed
# in version history and should be rotated.
db_driver = os.getenv("DB_DRIVER", "ODBC Driver 17 for SQL Server")
db_server = os.getenv("DB_SERVER", "associatetraining.database.windows.net,1433")
db_name = os.getenv("DB_NAME", "associatetraining")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")

# Fail early with a clear message rather than attempting a login with "None".
_missing_db_vars = [
    var_name
    for var_name, value in (("DB_USER", db_user), ("DB_PASSWORD", db_password))
    if not value
]
if _missing_db_vars:
    raise RuntimeError(
        "Missing database credentials; set these environment variables "
        f"(e.g. in .env): {', '.join(_missing_db_vars)}"
    )

# The raw ODBC connection string is URL-quoted because it is passed to
# SQLAlchemy as the value of the `odbc_connect` query parameter.
params = urllib.parse.quote_plus(
    f"DRIVER={{{db_driver}}};"
    f"SERVER={db_server};"
    f"DATABASE={db_name};"
    f"UID={db_user};"
    f"PWD={db_password};"
)

engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")

### Delete all parquet files of the Gold layer and Cache

In [79]:
# for file in gold_parquet.glob("*.parquet"):
#     print(f"Deleting: {file}")
#     os.remove(file)

# _parquet_cache.clear()

### Generate the 'Date' table

In [None]:
def load_dim_date(start_date: str = "1954-08-18", end_date: str = "2025-07-30") -> pl.DataFrame:
    """Build the date dimension, write it to the gold layer and return it.

    One row is generated for every calendar day from ``start_date`` to
    ``end_date`` (both inclusive), followed by a single sentinel row dated
    1900-01-01 that represents an unknown date.

    Args:
        start_date: First day of the range, formatted ``YYYY-MM-DD``.
        end_date: Last day of the range, formatted ``YYYY-MM-DD``.

    Returns:
        The dimension frame with columns ``date``, ``year``, ``month``,
        ``day``, ``weekday`` (ISO numbering, 1=Mon ... 7=Sun),
        ``weekday_name``, ``is_weekend`` (1 for Saturday/Sunday, else 0) and
        ``fiscal_quarter`` ("Q" followed by the value of ``dt.quarter()``).

    Raises:
        ValueError: If a date string does not match ``YYYY-MM-DD`` or if
            ``end_date`` is earlier than ``start_date``.

    Side effects:
        Overwrites ``gold_dim_date.parquet`` inside ``gold_parquet``.
    """
    # generate date range
    start = datetime.strptime(start_date, "%Y-%m-%d").date()
    end = datetime.strptime(end_date, "%Y-%m-%d").date()

    if end < start:
        raise ValueError(
            f"end_date ({end_date}) must not be earlier than start_date ({start_date})"
        )

    num_days = (end - start).days + 1
    date_list = [start + timedelta(days=i) for i in range(num_days)]

    df = pl.DataFrame({"date": date_list})

    df = (
        df.with_columns([
            pl.col("date").dt.year().alias("year"),
            pl.col("date").dt.month().alias("month"),
            pl.col("date").dt.day().alias("day"),
            pl.col("date").dt.weekday().alias("weekday"),   # ISO: 1=Mon, 7=Sun
            pl.col("date").dt.strftime("%a").alias("weekday_name"),
            # Saturday (6) and Sunday (7) in ISO numbering form the weekend.
            pl.when(pl.col("date").dt.weekday().is_in([6, 7]))
              .then(pl.lit(1)).otherwise(pl.lit(0))
              .alias("is_weekend"),
            ("Q" + pl.col("date").dt.quarter().cast(pl.Utf8)).alias("fiscal_quarter")
        ])
    )

    # ensure data types
    df = df.cast({
        "date": pl.Date,
        "year": pl.Int32,
        "month": pl.Int32,
        "day": pl.Int32,
        "weekday": pl.Int32,
        "weekday_name": pl.Utf8,
        "is_weekend": pl.Int32,
        "fiscal_quarter": pl.Utf8,
    })

    # add row for 'unknown' date; weekday 0 cannot collide with a real
    # ISO weekday (1-7), so it stays distinguishable from genuine rows
    df_unknown = pl.DataFrame({
        "date": [date(1900,1,1)],
        "year": [0],
        "month": [0],
        "day": [0],
        "weekday": [0],
        "weekday_name": ["Unknown"],
        "is_weekend": [0],
        "fiscal_quarter": ["Q0"]
    }).cast(df.schema)

    df_final = pl.concat([df, df_unknown], how="vertical")

    df_final.write_parquet(gold_parquet / "gold_dim_date.parquet")
    return df_final

# Build the date dimension and persist it as gold_dim_date.parquet; the
# returned frame is the last expression, so the notebook displays it as the
# cell output. The range is spelled out here even though it matches the
# function defaults.
load_dim_date(start_date="1954-08-18", end_date="2025-07-30")

### Generate other tables

In [None]:
# build_and_merge_all(config.TABLE_GROUPS)



WORKING FOR FINAL TABLE: product
Parquet file not found: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/gold_layer/gold_dim_product.parquet
# --- INSIDE MAIN ---
attr_cols : ['brand', 'sub_category']
extra_cols : ['product_name', 'product_description', 'rating', 'no_of_ratings', 'discount_percent', 'actual_price']
df_src.columns : ['product_id', 'brand_skey', 'product_name', 'product_description', 'rating', 'no_of_ratings', 'sub_category_skey', 'discount_percent', 'actual_price']
Looking for parent dim: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/gold_layer/gold_dim_brand.parquet
Looking for parent dim: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/gold_layer/gold_dim_sub_category.parquet
df_final.columns : ['product_skey', 'product_id', 'brand_skey', 'sub_category_skey', 'product_name', 'product_description', 'rating', 'no_of_ratings', 'discount_percent', 'actual_price', 'is_active', 'batch_id', 'load_

Saved product into DB schema gold


WORKING FOR FINAL TABLE: orders
Parquet file not found: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/gold_layer/gold_dim_orders.parquet
# --- INSIDE MAIN ---
attr_cols : ['customer', 'shipping_type', 'payment_source', 'lead_type']
extra_cols : ['has_coupon', 'coupon_code', 'is_gift', 'gift_message']
df_src.columns : ['orders_id', 'customer_skey', 'shipping_type_skey', 'payment_source_skey', 'lead_type_skey', 'has_coupon', 'coupon_code', 'is_gift', 'gift_message']
Looking for parent dim: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/gold_layer/gold_dim_customer.parquet
Looking for parent dim: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/gold_layer/gold_dim_shipping_type.parquet
Looking for parent dim: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/gold_layer/gold_dim_payment_source.parquet
Looking for parent dim: /home/sapna.choudhary/Data-Engi

{'product': shape: (49_997, 13)
 ┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬──────────┬───────────┐
 │ product_s ┆ product_i ┆ brand_ske ┆ sub_categ ┆ … ┆ actual_pr ┆ is_active ┆ batch_id ┆ load_time │
 │ key       ┆ d         ┆ y         ┆ ory_skey  ┆   ┆ ice       ┆ ---       ┆ ---      ┆ stamp     │
 │ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ i32       ┆ i32      ┆ ---       │
 │ i64       ┆ str       ┆ i64       ┆ i64       ┆   ┆ f64       ┆           ┆          ┆ datetime[ │
 │           ┆           ┆           ┆           ┆   ┆           ┆           ┆          ┆ μs]       │
 ╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
 │ 1         ┆ Prd_00038 ┆ 121       ┆ 12        ┆ … ┆ 93598.88  ┆ 1         ┆ 1        ┆ 2025-09-1 │
 │           ┆ 686       ┆           ┆           ┆   ┆           ┆           ┆          ┆ 6         │
 │           ┆           ┆           ┆           ┆