In [0]:
def get_env_config(env: str):
    """Return the configuration dict for the requested environment.

    The name is lower-cased and stripped of surrounding whitespace. A falsy
    value (``None``, ``""``) or any name other than "dev"/"prod" resolves to
    "dev".

    Args:
        env: Requested environment name; may be falsy.

    Returns:
        dict with the keys ENV, CATALOG, SCHEMA, RAW_BUCKET and
        PROCESSED_BUCKET. ENV and SCHEMA both carry the resolved
        environment name; the remaining values are fixed strings.
    """
    requested = env if env else "dev"
    normalized = requested.lower().strip()
    # Unknown names quietly resolve to "dev".
    resolved = normalized if normalized in ("dev", "prod") else "dev"

    config = dict(
        ENV=resolved,
        CATALOG="finance_data",
        SCHEMA=resolved,
        RAW_BUCKET="gs://finance-datalake-raw",
        PROCESSED_BUCKET="gs://finance-datalake-processed",
    )
    return config

def tbl(cfg, name: str) -> str:
    """Return the dotted three-part identifier ``CATALOG.SCHEMA.name``.

    Args:
        cfg: Mapping providing the "CATALOG" and "SCHEMA" keys.
        name: Final component of the identifier.
    """
    return "{catalog}.{schema}.{table}".format(
        catalog=cfg["CATALOG"],
        schema=cfg["SCHEMA"],
        table=name,
    )

def path(cfg, layer: str, name: str) -> str:
    """Return ``<PROCESSED_BUCKET>/<ENV>/<layer>/<name>/`` (trailing slash included).

    Args:
        cfg: Mapping providing the "PROCESSED_BUCKET" and "ENV" keys.
        layer: Path segment placed directly under the environment segment.
        name: Path segment placed under ``layer``.
    """
    bucket = cfg["PROCESSED_BUCKET"]
    env_name = cfg["ENV"]
    # The environment segment keeps each env's history under its own prefix.
    return "{}/{}/{}/{}/".format(bucket, env_name, layer, name)

def checkpoint(cfg, name: str) -> str:
    """Return ``<PROCESSED_BUCKET>/<ENV>/checkpoints/<name>/`` (trailing slash included).

    Args:
        cfg: Mapping providing the "PROCESSED_BUCKET" and "ENV" keys.
        name: Final path segment under ``checkpoints/``.
    """
    env_root = "{}/{}".format(cfg["PROCESSED_BUCKET"], cfg["ENV"])
    return env_root + "/checkpoints/{}/".format(name)



In [0]:

# Hard-coded gs:// URI of the transactions/incoming/ prefix in the raw bucket.
# NOTE(review): the bucket name repeats the RAW_BUCKET value returned by
# get_env_config() and is not environment-specific -- confirm that is intended.
RAW_INCOMING = "gs://finance-datalake-raw/transactions/incoming/"


In [0]:

#RAW_BUCKET = "gs://finance-datalake-raw"
#PROCESSED_BUCKET = "gs://finance-datalake-processed"

#RAW_INCOMING = f"{RAW_BUCKET}/transactions/incoming/"
#BRONZE_PATH = f"{PROCESSED_BUCKET}/bronze/transactions/"
#SILVER_PATH = f"{PROCESSED_BUCKET}/silver/transactions/"
#GOLD_PATH   = f"{PROCESSED_BUCKET}/gold/"

#BRONZE_CHECKPOINT = f"{PROCESSED_BUCKET}/checkpoints/bronze_transactions/"
#SILVER_CHECKPOINT = f"{PROCESSED_BUCKET}/checkpoints/silver_transactions/"


In [0]:
def gcs_ls(path: str):
    """List the entries directly under ``path`` using the Hadoop FileSystem API.

    Uses the notebook-global ``spark`` session to reach the JVM and the
    active Hadoop configuration.

    Args:
        path: Path or URI string that Hadoop can resolve (e.g. a gs:// URI).
            Inside this function the parameter shadows the module-level
            ``path()`` helper.

    Returns:
        list[str]: the string form of each entry's path, one per status
        object returned by ``FileSystem.listStatus``.

    Raises:
        Errors raised on the JVM side (for example when the path cannot be
        listed) propagate to the caller through Py4J.
    """
    hadoop_conf = spark._jsc.hadoopConfiguration()
    hadoop_path = spark._jvm.org.apache.hadoop.fs.Path(path)
    # Resolve the filesystem from the Hadoop Path instead of java.net.URI(path):
    # the URI constructor rejects unescaped characters such as spaces, whereas
    # Hadoop's Path parsing accepts them.
    fs = hadoop_path.getFileSystem(hadoop_conf)
    return [status.getPath().toString() for status in fs.listStatus(hadoop_path)]


In [0]:
import random
from datetime import datetime, timezone
from pyspark.sql import Row
from pyspark.sql import functions as F

# Value pools that make_batch() samples from with random.choice().
merchant_categories = [
    "GROCERY",
    "FUEL",
    "RESTAURANT",
    "ONLINE_RETAIL",
    "UTILITIES",
    "TRAVEL",
    "PHARMACY",
    "TRANSFER",
]
channels = [
    "POS",
    "ONLINE",
    "ATM",
]
cities = [
    "Toronto",
    "Mississauga",
    "Brampton",
    "Ottawa",
    "Montreal",
    "Vancouver",
    "Calgary",
    "Edmonton",
]

def make_batch(n=1000, seed=None):
    """Build a Spark DataFrame holding ``n`` randomly generated transaction rows.

    Every row in a batch shares one ``transaction_timestamp``: the UTC time
    at which the function was called, as an ISO-8601 string.

    Args:
        n: Number of rows to generate; must be at least 1.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives all random fields so the batch is reproducible (the
            timestamp still reflects the call time). When ``None`` the
            global ``random`` module is used, exactly as before.

    Returns:
        DataFrame created by the notebook-global ``spark`` session with the
        columns transaction_id, account_id, transaction_timestamp,
        transaction_type, amount, merchant_category, channel and city.

    Raises:
        ValueError: if ``n`` is less than 1. An empty row list gives Spark
            nothing to infer a schema from, so fail early with a clear message.
    """
    if n < 1:
        raise ValueError(f"make_batch requires n >= 1, got {n!r}")

    # Fall back to the module-level generator so callers that seed the global
    # RNG themselves keep getting the same sequence as before.
    rng = random if seed is None else random.Random(seed)
    batch_ts = datetime.now(timezone.utc).isoformat()

    rows = []
    for _ in range(n):
        amount = round(rng.uniform(2, 5000), 2)
        # About 2% of rows get a much larger amount instead
        # (presumably to simulate outliers -- confirm intent).
        if rng.random() < 0.02:
            amount = round(rng.uniform(7000, 25000), 2)

        rows.append(Row(
            # 12-digit numeric suffix; uniqueness is not enforced.
            transaction_id=f"TXN{rng.randint(10**11, 10**12-1)}",
            account_id=f"ACCT{rng.randint(10000, 99999)}",
            transaction_timestamp=batch_ts,
            # Roughly 85% debit / 15% credit.
            transaction_type="debit" if rng.random() < 0.85 else "credit",
            amount=amount,
            merchant_category=rng.choice(merchant_categories),
            channel=rng.choice(channels),
            city=rng.choice(cities)
        ))
    return spark.createDataFrame(rows)
