Imports

In [0]:
pip install dotenv

In [0]:
from helpers import RAW, table_exists, CATALOG, BUCKET_BASE, TAG, _root, CREDS_NAME, TAXI_TYPES
import os, requests, tempfile, pathlib, shutil, sys
from pyspark.sql import SparkSession

### 🔧 Criação da estrutura base do Data Lakehouse

Este trecho cria a estrutura inicial no Unity Catalog:

- Define o **catálogo** e os **schemas** para as camadas `raw`, `bronze`, `quarentine`, `silver` e `gold`;
- Cria **volumes UC** e **volumes externos** para armazenar os dados em cada camada.

🎯 **Objetivo**: organizar os dados por estágio de processamento e garantir governança, separação lógica e controle de acesso adequado.


In [0]:
# Pick up the Spark session that is currently active.
spark = SparkSession.getActiveSession()

# Layers that each receive their own schema; every layer after "raw" also
# receives an external volume (see ddl_volumes below).
layers = ["raw", "bronze", "quarentine", "silver", "gold"]

# Step 1 - external location for the bucket root, bound to the storage credential.
ddl_root = f"""
CREATE EXTERNAL LOCATION IF NOT EXISTS {CATALOG}_root
  URL '{BUCKET_BASE}'
  WITH (STORAGE CREDENTIAL `{CREDS_NAME}`);
"""

# Step 2 - the catalog itself, then one schema per layer.
ddl_catalog = f"""
CREATE CATALOG IF NOT EXISTS {CATALOG}
  MANAGED LOCATION '{BUCKET_BASE}/uc_root';
"""

schema_statements = [
    f"""
    CREATE SCHEMA IF NOT EXISTS {CATALOG}.{layer}
      MANAGED LOCATION '{BUCKET_BASE}/{layer}/_tables/';
    """.strip()
    for layer in layers
]
ddl_schemas = "\n".join(schema_statements)

# Step 3 - a (non-external) volume for the raw landing zone, followed by one
# external volume per remaining layer. The external paths come from the
# `_root` helper.
raw_volume_ddl = f"""
-- RAW ‚îÄ‚îÄ Volume UC (DBFS)
CREATE VOLUME IF NOT EXISTS {CATALOG}.raw.{TAG}_ingest
COMMENT 'Landing zone RAW {TAG.upper()}';

"""
external_volume_statements = [
    f"""
    -- {layer.upper()} ‚îÄ‚îÄ external volume
    CREATE EXTERNAL VOLUME IF NOT EXISTS {CATALOG}.{layer}.{TAG}_{layer}
    LOCATION '{_root(layer, "volumes")}'
    COMMENT 'Camada {layer} ‚Äì dados {TAG.upper()}';
    """.strip()
    for layer in layers[1:]
]
ddl_volumes = raw_volume_ddl + "\n".join(external_volume_statements)

# Each script may hold several statements separated by ";". They are split
# apart and run one by one, skipping the empty pieces the split leaves behind.
for ddl_script in (ddl_root, ddl_catalog, ddl_schemas, ddl_volumes):
    for fragment in ddl_script.split(";"):
        statement = fragment.strip()
        if statement:
            spark.sql(statement)


### 📥 Download e armazenamento dos arquivos Parquet

Este trecho realiza o **download dos arquivos Parquet** diretamente da fonte oficial (NYC Taxi Data) e os **salva no volume UC da camada `raw`**, organizando por tipo de táxi, ano e mês.
Aqui são coletados os dados de 2023 em diante, pois registros podem ser enviados posteriormente; a limpeza e a verificação serão feitas nas camadas bronze/silver.

🎯 **Objetivo**: garantir a ingestão dos dados brutos no Data Lake, mantendo uma estrutura de particionamento que facilita o processamento posterior.


In [0]:
import sys
def download_one(url: str, dst: str) -> bool:
    """Stream ``url`` into a local temporary file and copy it to ``dst``.

    Args:
        url: HTTP(S) address of the file to fetch.
        dst: Destination path handed to ``dbutils.fs.cp``.

    Returns:
        True when the file was downloaded and copied. False when the server
        answered 403/404 (treated as "not available") or when
        ``raise_for_status`` raised an HTTP error.

    Raises:
        Anything other than ``requests.exceptions.HTTPError`` is not handled
        here and propagates to the caller.

    The temporary file is always removed before returning, including on the
    403/404 early return and when an exception interrupts the download.
    """
    # Bound before the try block so the cleanup in `finally` can always tell
    # whether a temporary file was actually created.
    local_tmp = None
    try:
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            # Record the path immediately: the file exists on disk from this
            # point on, so every exit path below must be able to delete it.
            local_tmp = tmp.name
            with requests.get(url, stream=True, timeout=180) as r:
                if r.status_code in (403, 404):
                    print(f"{r.status_code} ‚Äì n√£o dispon√≠vel")
                    return False
                r.raise_for_status()
                for blk in r.iter_content(RAW.CHUNK):
                    tmp.write(blk)

        # The temp file is closed (and flushed) here, so it is safe to copy.
        dbutils.fs.cp("file:" + local_tmp, dst)
        print("Arquivo salvo em:", dst)
        return True

    except requests.exceptions.HTTPError as e:
        print("erro HTTP:", e)
        return False
    finally:
        if local_tmp is not None:
            try:
                os.remove(local_tmp)
            except OSError:
                # Best-effort cleanup: a failure to delete the temp file must
                # not mask the download result or the original exception.
                pass


# Every (taxi type, year, month) combination, in the order nested loops over
# TAXI_TYPES -> RAW.YEARS -> RAW.MONTHS would visit them.
partitions = [
    (taxi_type, year, month)
    for taxi_type in TAXI_TYPES
    for year in RAW.YEARS
    for month in RAW.MONTHS
]

# Number of files for which download_one reported success.
total = 0
for taxi_type, year, month in partitions:
    parquet_name = f"{taxi_type}_tripdata_{year}-{month}.parquet"
    source_url = f"{RAW.BASE_URL}/{parquet_name}"
    target_path = (
        f"{RAW.DEST_ROOT}/"
        f"taxi_type={taxi_type}/year={year}/month={month}/{parquet_name}"
    )

    # Progress prefix without a newline, so the message printed by
    # download_one lands on the same line.
    print(f"{taxi_type}/{year}-{month}: ", end="", flush=True)

    succeeded = download_one(source_url, target_path)
    if succeeded:
        total += 1

print(f"Download conclu√≠do ‚Äì {total:,} arquivos novos")
