In [None]:
#%pip install -r requirements.txt

In [3]:
import io
import os
import re
import shutil
import tempfile
import zipfile
from datetime import datetime

import pytz
from google.cloud import storage

# Time zone used by main() to compute the default ingest date when INGEST_DATE is unset.
SP = pytz.timezone("America/Sao_Paulo")



In [None]:
#from dotenv import load_dotenv
#from pathlib import Path
#load_dotenv("../../.env")

In [4]:
def main(years: list[int]):
    bucket_name = os.environ["BUCKET"]
    landing_prefix = os.environ.get("LANDING_PREFIX", "landing/bps/")
    raw_prefix = os.environ.get("RAW_PREFIX", "raw/bps/")  # <-- raw em vez de stg

    ingest_date_default = os.environ.get("INGEST_DATE") or datetime.now(SP).strftime("%Y-%m-%d")

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Lista objetos zip no landing
    blobs = list(client.list_blobs(bucket, prefix=landing_prefix))
    zip_blobs = [b for b in blobs if b.name.endswith(".csv.zip")]

    if not zip_blobs:
        raise RuntimeError(f"Nenhum .csv.zip encontrado em gs://{bucket_name}/{landing_prefix}")

    # Filtra por ano (aceita tanto year=YYYY quanto YYYY.csv.zip)
    def blob_year(name: str) -> int | None:
        m = re.search(r"year=(20\d{2})", name)
        if m:
            return int(m.group(1))
        m = re.search(r"/(20\d{2})\.csv\.zip$", name)
        if m:
            return int(m.group(1))
        return None

    selected: list[tuple[int, storage.Blob]] = []
    for b in zip_blobs:
        y = blob_year(b.name)
        if y is None:
            continue
        if not years or y in years:
            selected.append((y, b))

    if not selected:
        raise RuntimeError(f"Nenhum zip para anos {years} em gs://{bucket_name}/{landing_prefix}")

    for y, b in sorted(selected, key=lambda t: (t[0], t[1].name)):
        # tenta capturar ingest_date do path, se existir
        m = re.search(r"ingest_date=(\d{4}-\d{2}-\d{2})", b.name)
        ingest = m.group(1) if m else ingest_date_default

        print(f"\n==> Processando: gs://{bucket_name}/{b.name}")
        data = b.download_as_bytes()

        zf = zipfile.ZipFile(io.BytesIO(data))

        # Lista arquivos dentro do zip (ignorando diretórios)
        members = [zi for zi in zf.infolist() if not zi.is_dir()]
        if not members:
            raise RuntimeError(f"Zip vazio (sem arquivos): {b.name}")

        for zi in members:
            # Se quiser enviar APENAS o primeiro CSV, descomente:
            # if not zi.filename.lower().endswith(".csv"):
            #     continue

            # Nome seguro (evita subpastas do zip virarem estrutura no GCS)
            base_name = os.path.basename(zi.filename)
            if not base_name:
                continue

            local_path = f"/tmp/{base_name}"

            # Extrai para /tmp (streaming)
            with zf.open(zi) as src, open(local_path, "wb") as dst:
                shutil.copyfileobj(src, dst, length=1024 * 1024)  # buffer 1MB

            # Sobe para RAW no GCS
            gcs_key = f"{raw_prefix}year={y}/ingest_date={ingest}/{base_name}"
            bucket.blob(gcs_key).upload_from_filename(local_path)

            print(f"   - OK: gs://{bucket_name}/{gcs_key}")

            # Limpa /tmp
            try:
                os.remove(local_path)
            except OSError:
                pass

        print(f"==> Concluído ano {y} (ingest_date={ingest})")

In [5]:
if __name__ == "__main__":
    import sys

    # Of the arguments after the program name, keep only those that are exactly
    # a 20xx year; everything else is dropped. An empty result makes main()
    # process every year it finds.
    years = list(
        map(int, filter(lambda arg: re.fullmatch(r"20\d{2}", arg), sys.argv[1:]))
    )
    main(years)


==> Processando: gs://rq-pharma-raw-rq-pharma-data-lab-26k9/landing/bps/year=2024/ingest_date=2026-02-13/2024.csv.zip
   - OK: gs://rq-pharma-raw-rq-pharma-data-lab-26k9/raw/bps/year=2024/ingest_date=2026-02-13/2024.csv
==> Concluído ano 2024 (ingest_date=2026-02-13)

==> Processando: gs://rq-pharma-raw-rq-pharma-data-lab-26k9/landing/bps/year=2025/ingest_date=2026-02-13/2025.csv.zip
   - OK: gs://rq-pharma-raw-rq-pharma-data-lab-26k9/raw/bps/year=2025/ingest_date=2026-02-13/2025.csv
==> Concluído ano 2025 (ingest_date=2026-02-13)
