## BRONZE & STAGING LAYER: HISTORY TABLES

### Necessary Imports

In [1]:
import os
import polars as pl
from pathlib import Path
from dotenv import load_dotenv

import sys
sys.path.append(os.path.abspath("."))
from gold_layer.helper_utils import get_batch_id

In [2]:
# Load variables from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)


def _env_path(var_name: str) -> Path:
    """Return the path stored in environment variable ``var_name``.

    Raises:
        RuntimeError: if the variable is not set. Without this check
            ``Path(os.getenv(...))`` fails with an opaque ``TypeError`` on ``None``
            that does not say which variable is missing.
    """
    value = os.getenv(var_name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {var_name!r} is not set; define it in .env or the shell."
        )
    return Path(value)


# Root locations, both supplied through the environment.
POLARS_DWH = _env_path("POLARS_DWH")
PARQUET_FILES_DIR = _env_path("PARQUET_FILES_DIR")

# One sub-directory per layer; created on first run, left untouched afterwards.
bronze_parquet = PARQUET_FILES_DIR / "bronze_layer"
bronze_parquet.mkdir(parents=True, exist_ok=True)

staging_parquet = PARQUET_FILES_DIR / "staging_layer"
staging_parquet.mkdir(parents=True, exist_ok=True)

print(POLARS_DWH, PARQUET_FILES_DIR, bronze_parquet, staging_parquet, sep="\n")

/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH
/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files
/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/bronze_layer
/home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/staging_layer


### Delete all "hist" parquet files of both layers (optional reset)

In [3]:
# Manual reset, intentionally left commented out.
# Uncommenting the two loops below deletes every "*_hist.parquet" file in the
# bronze and staging directories, i.e. all history accumulated so far.
# for file in bronze_parquet.glob("*.parquet"):
#     if file.name.endswith("_hist.parquet"):
#         print(f"Deleting: {file}")
#         os.remove(file)

# for file in staging_parquet.glob("*.parquet"):
#     if file.name.endswith("_hist.parquet"):
#         print(f"Deleting: {file}")
#         os.remove(file)

In [None]:
def process_layer(layer_name: str, parquet_dir: Path, table_names: list[str], cleanup_after: bool = False) -> "dict[str, pl.DataFrame]":
    """
    Append each table's current parquet snapshot to its history parquet file.

    For every name in ``table_names`` the function looks for
    ``<layer_name>_<table_name>.parquet`` inside ``parquet_dir``. When that file
    exists, its rows get a ``batch_id`` column and are appended to
    ``<layer_name>_<table_name>_hist.parquet`` (created if it does not exist yet).
    Tables without a current snapshot are skipped with a message.

    Args:
        layer_name: Layer prefix used in the file names (e.g. "bronze").
        parquet_dir: Directory that holds the layer's parquet files.
        table_names: Names of the tables to process.
        cleanup_after: When True, delete every ``*.parquet`` file in
            ``parquet_dir`` whose name does not end with ``_hist.parquet`` once
            all tables have been processed.

    Returns:
        Dict mapping each processed table name to its batch-tagged DataFrame.
        Skipped tables are absent from the dict.
    """
    print(f"\nProcessing {layer_name.upper()} layer: {parquet_dir}\n")
    dfs = {}

    for table_name in table_names:
        # Current snapshot and its history file share the "<layer>_<table>" stem.
        file_path = parquet_dir / f"{layer_name}_{table_name}.parquet"
        hist_file_path = parquet_dir / f"{layer_name}_{table_name}_hist.parquet"

        # Nothing to append: leave the history file completely untouched (not even read).
        if not file_path.exists():
            print(f"Skipping {table_name} — no parquet found at {file_path}")
            continue

        print(f"Appending {table_name} to history...")

        # Read the existing history once; it is needed both for the batch id and
        # for the concatenation below.
        df_hist = pl.read_parquet(hist_file_path) if hist_file_path.exists() else None

        # First load starts at batch 1; otherwise the id comes from get_batch_id.
        # NOTE(review): presumably get_batch_id returns the next unused id — confirm in helper_utils.
        batch_id = 1 if df_hist is None else get_batch_id(df_hist)

        df_final = pl.read_parquet(file_path).with_columns(
            pl.lit(batch_id).alias("batch_id")
        )
        # Keep the tagged snapshot so the caller can inspect it after the run.
        dfs[table_name] = df_final

        if df_hist is None:
            df_out = df_final
        else:
            # how="vertical" requires both frames to have the same schema.
            df_out = pl.concat([df_hist, df_final], how="vertical")

        # Write to a sibling temp file and swap it in, so an interrupted write can
        # never leave a truncated history file behind. The ".tmp" suffix keeps the
        # temp file out of the "*.parquet" cleanup glob below.
        tmp_path = hist_file_path.with_name(hist_file_path.name + ".tmp")
        try:
            df_out.write_parquet(tmp_path)
            os.replace(tmp_path, hist_file_path)
        finally:
            # No-op after a successful replace; removes the partial file on failure.
            tmp_path.unlink(missing_ok=True)

    # Cleanup if needed.
    # NOTE: this removes EVERY non-history parquet in the directory, including
    # files of tables that were not listed in table_names and so were not archived.
    if cleanup_after:
        for f in parquet_dir.glob("*.parquet"):
            if not f.name.endswith("_hist.parquet"):
                print(f"Deleting: {f}")
                os.remove(f)

    return dfs


In [None]:
# Tables archived for each layer; every entry maps to "<layer>_<table>.parquet".
# NOTE(review): the saved output of this cell prints "order", not "orders" —
# confirm the name matches the parquet files on disk, otherwise the table is skipped.
table_names = ["customer", "orders", "product", "shipping_type"]

# Bronze snapshots are appended to history and then removed.
dfs_bronze = process_layer(
    layer_name="bronze",
    parquet_dir=bronze_parquet,
    table_names=table_names,
    cleanup_after=True,
)

# Same treatment for the staging snapshots.
dfs_staging = process_layer(
    layer_name="staging",
    parquet_dir=staging_parquet,
    table_names=table_names,
    cleanup_after=True,
)



Processing BRONZE layer: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/bronze_layer

Appending customer to history...
Appending order to history...
Appending product to history...
Appending shipping_type to history...

Processing STAGING layer: /home/sapna.choudhary/Data-Engineering-Training/Polars_DWH/parquet_files/staging_layer

Appending customer to history...
Appending order to history...
Appending product to history...
Appending shipping_type to history...
