In [6]:
#%pip install -r requirements.txt

In [7]:
import codecs
import csv
import os
import re
import tempfile
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
import pytz
from google.cloud import storage

SP = pytz.timezone("America/Sao_Paulo")

In [8]:
#from dotenv import load_dotenv
#from pathlib import Path
#load_dotenv("../../.env")

In [9]:
def snake(s: str) -> str:
    """Normalize a label into a lowercase snake_case identifier.

    Surrounding whitespace is stripped and the text is lower-cased. Every run
    of characters that is not a Unicode word character becomes one underscore,
    repeated underscores are merged, and leading/trailing underscores are
    removed. Accented letters count as word characters and are kept.

    Args:
        s: Raw label (for example a CSV header). Falsy values are treated as
            an empty string.

    Returns:
        The normalized name, or the placeholder ``"col"`` when nothing is left
        after cleaning.
    """
    lowered = ("" if not s else s).strip().lower()
    underscored = re.sub(r"[^\w]+", "_", lowered, flags=re.UNICODE)
    trimmed = re.sub(r"_+", "_", underscored).strip("_")
    if not trimmed:
        return "col"
    return trimmed

In [10]:
def detect_sep(sample_bytes: bytes) -> str:
    """Guess the field delimiter of a CSV from a leading byte sample.

    The candidates are comma, semicolon, tab and pipe. The function is
    best-effort and never raises: when nothing can be inferred it returns
    ``","``.

    Args:
        sample_bytes: The first bytes of the file. The sample may end in the
            middle of a row (e.g. a fixed-size read).

    Returns:
        The detected single-character delimiter, or ``","`` as a last resort.
    """
    candidates = ",;\t|"

    try:
        # Undecodable bytes are dropped: only the delimiter characters matter
        # here and they are all ASCII.
        txt = sample_bytes.decode("utf-8", errors="ignore")
    except (AttributeError, TypeError):
        # Not a bytes-like object; keep the historical "never raise" contract.
        return ","

    # A fixed-size sample usually stops mid-row. That truncated row has fewer
    # delimiters than the others, which can push the sniffer's consistency
    # score below its threshold when the sample holds only a few lines, so
    # drop it whenever at least one complete line precedes it.
    if txt and not txt.endswith(("\n", "\r")):
        head, newline, _partial = txt.rpartition("\n")
        if newline:
            txt = head + newline

    try:
        return csv.Sniffer().sniff(txt, delimiters=candidates).delimiter
    except csv.Error:
        pass

    # The sniffer gave up (inconsistent rows, empty sample, ...). Fall back to
    # the candidate that occurs most often in the first non-empty line, which
    # is normally the header; ties resolve to the earliest candidate.
    first_line = next((ln for ln in txt.splitlines() if ln.strip()), "")
    counts = {d: first_line.count(d) for d in candidates}
    best = max(candidates, key=counts.__getitem__)
    return best if counts[best] > 0 else ","

In [11]:
def _detect_encoding(path: str, block_size: int = 1 << 20) -> str:
    """Return ``"utf-8"`` when the whole file decodes as UTF-8, else ``"latin1"``.

    The file is streamed through an incremental decoder, so every byte is
    checked without loading the file into memory.
    """
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        with open(path, "rb") as fh:
            for block in iter(lambda: fh.read(block_size), b""):
                decoder.decode(block)
        # final=True reports a multi-byte sequence truncated at end of file.
        decoder.decode(b"", final=True)
    except UnicodeDecodeError:
        return "latin1"
    return "utf-8"


def main(years: list[int]):
    """Convert raw CSV objects in GCS into staged Parquet parts.

    Configuration is read from environment variables:

    * ``BUCKET`` (required): bucket holding both raw and staged objects.
    * ``RAW_PREFIX`` (default ``raw/bps/``): prefix listed for ``.csv`` objects.
    * ``STG_PREFIX`` (default ``stg/bps/``): prefix the Parquet parts go under.
    * ``INGEST_DATE`` (optional): when set, only objects whose path contains
      exactly this ``ingest_date=YYYY-MM-DD`` are processed.

    Only objects whose name contains both ``year=20xx`` and
    ``ingest_date=YYYY-MM-DD`` are considered. Each selected CSV is downloaded
    to a temporary directory, read as strings in chunks of 200,000 rows, given
    snake_case column names plus four metadata columns (``source_year``,
    ``ingest_date``, ``source_object``, ``load_ts_utc``) and uploaded as
    ``{STG_PREFIX}year=Y/ingest_date=D/part-NNNNN.parquet``.

    Args:
        years: Years to process; an empty/falsy value means every year found.

    Raises:
        KeyError: If ``BUCKET`` is not set.
        RuntimeError: If no ``.csv`` exists under the raw prefix, or none
            matches the year / ingest-date filters.
    """
    bucket_name = os.environ["BUCKET"]
    raw_prefix = os.environ.get("RAW_PREFIX", "raw/bps/")
    stg_prefix = os.environ.get("STG_PREFIX", "stg/bps/")

    ingest_date_filter = os.environ.get("INGEST_DATE")

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # List the CSV objects under the raw prefix.
    csv_blobs = [
        b
        for b in client.list_blobs(bucket, prefix=raw_prefix)
        if b.name.lower().endswith(".csv")
    ]

    if not csv_blobs:
        raise RuntimeError(f"Nenhum .csv encontrado em gs://{bucket_name}/{raw_prefix}")

    def parse_year_and_ingest(name: str):
        """Extract (year, ingest_date) from an object name; None for a missing piece."""
        my = re.search(r"year=(20\d{2})", name)
        mi = re.search(r"ingest_date=(\d{4}-\d{2}-\d{2})", name)
        y = int(my.group(1)) if my else None
        ingest = mi.group(1) if mi else None
        return y, ingest

    selected = []
    for b in csv_blobs:
        y, ingest = parse_year_and_ingest(b.name)
        if y is None or ingest is None:
            continue
        if years and y not in years:
            continue
        if ingest_date_filter and ingest != ingest_date_filter:
            continue
        selected.append((y, ingest, b))

    if not selected:
        raise RuntimeError(
            f"Nenhum CSV selecionado em gs://{bucket_name}/{raw_prefix} "
            f"para years={years} ingest_date={ingest_date_filter or '*'}"
        )

    # Next free part index per (year, ingest_date) partition. The staged key
    # has no source-file component, so numbering must continue across CSVs
    # that land in the same partition; restarting at 0 for each file would
    # overwrite the parts written for the previous file.
    next_part: dict[tuple[int, str], int] = {}

    for y, ingest, b in sorted(selected, key=lambda t: (t[0], t[1], t[2].name)):
        print(f"\n==> Processando RAW: gs://{bucket_name}/{b.name}")

        # A private temporary directory avoids name clashes between runs and
        # is removed even when processing fails part-way.
        with tempfile.TemporaryDirectory(prefix="bps_") as workdir:
            local_csv = os.path.join(workdir, "source.csv")
            b.download_to_filename(local_csv)

            # Detect the separator from the head of the file.
            with open(local_csv, "rb") as fh:
                sep = detect_sep(fh.read(4096))

            # Validate the encoding over the whole file up front. Probing only
            # the first chunk would let a non-UTF-8 byte further down abort
            # the run after some parts were already uploaded.
            encoding = _detect_encoding(local_csv)

            reader = pd.read_csv(
                local_csv,
                sep=sep,
                dtype=str,
                encoding=encoding,
                chunksize=200_000,
                low_memory=False,
            )

            first_part = next_part.get((y, ingest), 0)
            part = first_part
            # The context manager closes the underlying file handle before the
            # temporary directory is deleted.
            with reader:
                for chunk in reader:
                    chunk.columns = [snake(c) for c in chunk.columns]

                    # Metadata columns.
                    chunk["source_year"] = str(y)
                    chunk["ingest_date"] = ingest
                    chunk["source_object"] = f"gs://{bucket_name}/{b.name}"
                    # Same "YYYY-MM-DDTHH:MM:SSZ" text as before, without the
                    # deprecated datetime.utcnow().
                    chunk["load_ts_utc"] = datetime.now(timezone.utc).strftime(
                        "%Y-%m-%dT%H:%M:%SZ"
                    )

                    local_parquet = os.path.join(workdir, f"part-{part:05d}.parquet")

                    chunk.to_parquet(
                        local_parquet,
                        index=False,
                        engine="pyarrow",
                        compression="snappy",
                    )

                    # The partition path deliberately has no source_file level.
                    gcs_key = f"{stg_prefix}year={y}/ingest_date={ingest}/part-{part:05d}.parquet"

                    bucket.blob(gcs_key).upload_from_filename(local_parquet)

                    print(f"   - OK: gs://{bucket_name}/{gcs_key} (rows={len(chunk)})")

                    # Free disk space right away; only one part is kept locally.
                    os.remove(local_parquet)
                    part += 1

            next_part[(y, ingest)] = part

        print(f"==> Concluído: year={y} ingest_date={ingest} parts={part - first_part}")

In [12]:
if __name__ == "__main__":
    import sys

    # Keep only the command-line arguments that are exactly a 20xx year and
    # convert them to int; any other argument is silently skipped. An empty
    # list makes main() process every year it finds.
    years = list(map(int, filter(re.compile(r"20\d{2}").fullmatch, sys.argv[1:])))
    main(years)


==> Processando RAW: gs://rq-pharma-raw-rq-pharma-data-lab-26k9/raw/bps/year=2024/ingest_date=2026-02-13/2024.csv
   - OK: gs://rq-pharma-raw-rq-pharma-data-lab-26k9/stg/bps/year=2024/ingest_date=2026-02-13/part-00000.parquet (rows=20512)
==> Concluído: year=2024 ingest_date=2026-02-13 parts=1

==> Processando RAW: gs://rq-pharma-raw-rq-pharma-data-lab-26k9/raw/bps/year=2025/ingest_date=2026-02-13/2025.csv
   - OK: gs://rq-pharma-raw-rq-pharma-data-lab-26k9/stg/bps/year=2025/ingest_date=2026-02-13/part-00000.parquet (rows=2474)
==> Concluído: year=2025 ingest_date=2026-02-13 parts=1
