## BRONZE & STAGING LAYER: HIST TABLES

### Necessary Imports

In [44]:
import os
import sys
import logging
from pathlib import Path
from dotenv import load_dotenv

import polars as pl
from sqlalchemy import inspect, text

import config

# Put the parent of the current working directory on the import path so the
# imports below can resolve (presumably gold_layer / db_utils live there —
# TODO confirm against the repo layout).
top_level = Path().resolve().parent
sys.path.append(str(top_level))
from gold_layer.helper_utils import get_batch_id
from db_utils import engine


# Configure root logging at INFO level and create this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load variables from a .env file; override=True lets .env values replace
# variables that are already set in the process environment.
load_dotenv(override=True)

True

In [45]:
# Fail fast with a readable message when a required environment variable is
# absent; Path(None) would otherwise raise an opaque TypeError.
_missing_env = [
    name for name in ("POLARS_DWH", "PARQUET_FILES_DIR") if os.getenv(name) is None
]
if _missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

POLARS_DWH = Path(os.environ["POLARS_DWH"])
PARQUET_FILES_DIR = Path(os.environ["PARQUET_FILES_DIR"])

# Per-layer parquet directories; created on first run if they do not exist.
bronze_parquet = PARQUET_FILES_DIR / "bronze_layer"
bronze_parquet.mkdir(parents=True, exist_ok=True)

staging_parquet = PARQUET_FILES_DIR / "staging_layer"
staging_parquet.mkdir(parents=True, exist_ok=True)

### Delete all "hist" parquet files of the layer

In [46]:
# for file in bronze_parquet.glob("*.parquet"):
#     if file.name.endswith("_hist.parquet"):
#         logger.info(f"Deleting: {file}")
#         os.remove(file)

# for file in staging_parquet.glob("*.parquet"):
#     if file.name.endswith("_hist.parquet"):
#         logger.info(f"Deleting: {file}")
#         os.remove(file)

In [47]:
def table_exists(engine, schema, table):
    """Return True if `table` exists in `schema` on the database behind `engine`."""
    return inspect(engine).has_table(table_name=table, schema=schema)

In [None]:
def process_layer(
    layer_name: str,
    parquet_dir: Path,
    table_names: list[str],
    cleanup_after: bool = False,
) -> dict[str, pl.DataFrame]:
    """
    Append the current snapshot of each table to its history ("_hist") copy.

    For every name in `table_names` the function looks for
    `<parquet_dir>/<layer_name>_<table>.parquet`. When present, the snapshot is
    tagged with a `batch_id` column, appended to
    `<parquet_dir>/<layer_name>_<table>_hist.parquet`, and written to the
    database table `<layer_name>.<table>_hist`.

    Args:
        layer_name: Layer / DB schema name (e.g. "bronze", "staging"); also the
            file-name prefix of the parquet files.
        parquet_dir: Directory holding the layer's parquet files.
        table_names: Base table names to process.
        cleanup_after: When True, delete every non-"_hist" parquet file in
            `parquet_dir` and drop the correspondingly named DB table.

    Returns:
        Mapping of table name to the newly tagged batch (snapshot plus
        `batch_id`), containing only the tables that were actually processed.
    """
    logger.info(f"\nProcessing {layer_name.upper()} layer: {parquet_dir}\n")
    dfs = {}

    for table_name in table_names:
        file_path = parquet_dir / f"{layer_name}_{table_name}.parquet"
        hist_file_path = parquet_dir / f"{layer_name}_{table_name}_hist.parquet"

        # Nothing to append: skip before touching the (possibly large) history file.
        if not file_path.exists():
            logger.info(f"Skipping {table_name} — no parquet found at {file_path}")
            continue

        # Batch id comes from the existing history, or starts at 1 on the first run.
        if hist_file_path.exists():
            df_hist = pl.read_parquet(hist_file_path)
            batch_id = get_batch_id(df_hist)
        else:
            df_hist = None
            batch_id = 1

        logger.info(f"Appending {table_name} to history...")
        df_batch = pl.read_parquet(file_path).with_columns(
            pl.lit(batch_id).alias("batch_id")
        )
        # Keep the tagged batch so callers can inspect what was just loaded.
        dfs[table_name] = df_batch

        # Full history = previous history (if any) followed by the new batch.
        if df_hist is None:
            df_final = df_batch
        else:
            df_final = pl.concat([df_hist, df_batch], how="vertical")

        # Save to parquet (local hist layer)
        df_final.write_parquet(hist_file_path)
        logger.info(f"Created {table_name}: {df_final.shape} at {hist_file_path}")

        # Save to the database. The existence/column checks must look at the
        # *_hist table, because that is the table being written.
        hist_table = f"{table_name}_hist"
        tbl = f"{layer_name}.{hist_table}"
        if table_exists(engine, layer_name, hist_table):
            existing_cols = [
                col["name"]
                for col in inspect(engine).get_columns(hist_table, schema=layer_name)
            ]
            if "batch_id" not in existing_cols:
                logger.warning(f"Table {tbl} missing 'batch_id'. Recreating table...")
                mode = "replace"
            else:
                mode = "append"
        else:
            mode = "replace"

        # When appending, send only the new batch: the DB table already holds
        # the earlier batches, so writing the full history would duplicate them.
        # When (re)creating the table, write the complete history.
        df_to_write = df_batch if mode == "append" else df_final
        df_to_write.write_database(
            table_name=tbl,
            connection=engine,
            if_table_exists=mode,
        )
        logger.info(f"Saved {table_name} into DB schema {layer_name}")

    # Cleanup if needed
    if cleanup_after:
        for parquet_file in parquet_dir.glob("*.parquet"):
            if parquet_file.name.endswith("_hist.parquet"):
                continue

            logger.info(f"Deleting: {parquet_file}")
            # delete parquet file
            parquet_file.unlink()

            # Drop the matching DB table, if it exists. Only the leading
            # "<layer>_" prefix is stripped, so a table whose own name contains
            # that text is not mangled.
            # NOTE(review): identifiers cannot be bound as SQL parameters; the
            # name is derived from local file names, not from user input.
            base_name = parquet_file.stem.removeprefix(f"{layer_name}_")
            table_to_delete = f"{layer_name}.{base_name}"
            try:
                with engine.begin() as conn:
                    conn.execute(text(f"DROP TABLE IF EXISTS {table_to_delete}"))
                logger.info(f"Dropped table {table_to_delete} in DB")
            except Exception as e:
                # Best-effort cleanup: a failed drop must not abort the run.
                logger.warning(f"Failed to drop table {table_to_delete}: {e}")

    return dfs


In [None]:
# Process bronze: append current bronze snapshots to their history copies.
# cleanup_after=True then deletes the non-"_hist" parquet files in the bronze
# directory and drops the matching DB tables.
dfs_bronze = process_layer("bronze", bronze_parquet, config.HIST_CONFIG, cleanup_after=True)

# Process staging: same steps for the staging layer and its directory.
dfs_staging = process_layer("staging", staging_parquet, config.HIST_CONFIG, cleanup_after=True)


INFO:__main__:
Processing STAGING layer: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/staging_layer

INFO:__main__:Skipping customer — no parquet found at /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/staging_layer/staging_customer.parquet
INFO:__main__:Appending shipping_type to history...
INFO:__main__:Created shipping_type: (12, 5) at /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/staging_layer/staging_shipping_type_hist.parquet
INFO:__main__:Saved shipping_type into DB schema staging
INFO:__main__:Skipping product — no parquet found at /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/staging_layer/staging_product.parquet
INFO:__main__:Skipping orders — no parquet found at /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/staging_layer/staging_orders.parquet
INFO:__main__:Deleting: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/sta

In [54]:
# Inspect the staging history for shipping_type. The path is built from the
# configured staging directory rather than a user-specific absolute path, so
# the cell works on any machine with PARQUET_FILES_DIR set.
df = pl.read_parquet(staging_parquet / "staging_shipping_type_hist.parquet")
df

shipping_type_id,shipping_type,delivery_estimate,load_timestamp,batch_id
str,str,str,datetime[μs],i32
"""Ship_0002""","""Normal""","""3-7 Days""",2025-09-19 15:21:34.047037,1
"""Ship_0003""","""One-Day Delivery""","""1 Day""",2025-09-19 15:21:34.047037,1
"""Ship_0001""","""Express""","""3-5 Days""",2025-09-19 15:21:34.047037,1
"""Ship_0004""","""Fast Delivery""","""1-3 Days""",2025-09-19 15:21:34.047037,1
"""Ship_0003""","""One-Day Delivery""","""1 Day""",2025-09-19 15:53:57.066318,2
"""Ship_0002""","""Normal""","""3-7 Days""",2025-09-19 15:53:57.066318,2
"""Ship_0004""","""Fast Delivery""","""1-3 Days""",2025-09-19 15:53:57.066318,2
"""Ship_0001""","""Express""","""3-5 Days""",2025-09-19 15:53:57.066318,2
