In [10]:
import os
import sys

# Make the PySpark workers and the driver use the interpreter running this kernel.
# This has to happen before the session below is created.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)
from open_finance_lakehouse.utils.spark_session import get_spark_session

# Clear the factory's cache so the call below does not return a previously cached result.
get_spark_session.cache_clear()
spark = get_spark_session()

In [11]:
# 2. Ingestão BACEN API
import io
from datetime import datetime

import pandas as pd
import requests

from open_finance_lakehouse.utils.spark_session import get_spark_session

# --- Parameters ---
def years_before(moment, years):
    """Return ``moment`` moved back by ``years`` calendar years.

    ``datetime.replace(year=...)`` raises ``ValueError`` when ``moment`` is
    29 February and the target year is not a leap year. In that single case the
    result rolls forward to 1 March, so the resulting window is never longer
    than ``years`` years.

    Args:
        moment: The reference ``datetime``.
        years: Number of calendar years to subtract.

    Returns:
        A ``datetime`` with the same time of day, ``years`` years earlier.
    """
    target_year = moment.year - years
    try:
        return moment.replace(year=target_year)
    except ValueError:
        # Only reachable for 29 February with a non-leap target year.
        return moment.replace(year=target_year, month=3, day=1)


bacen_series_id = 11  # Example: SELIC
# Take one snapshot of "today" so both ends of the window come from the same instant.
bacen_today = datetime.today()
bacen_start_date = years_before(bacen_today, 10).strftime("%d/%m/%Y")
bacen_end_date = bacen_today.strftime("%d/%m/%Y")
cvm_year = 2024
cvm_month = 4


# Build the SGS endpoint URL for the chosen series and date window.
bacen_url = (
    f"https://api.bcb.gov.br/dados/serie/bcdata.sgs.{bacen_series_id}/dados"
    f"?formato=json&dataInicial={bacen_start_date}&dataFinal={bacen_end_date}"
)

# Bound the wait so a stalled connection cannot hang the notebook, and stop on
# HTTP errors instead of feeding an error payload to the JSON parser below.
response = requests.get(bacen_url, timeout=30)
response.raise_for_status()

# Parse the JSON payload and normalise column names and dtypes.
df_bacen = pd.read_json(io.StringIO(response.text))
df_bacen.columns = ["data", "valor"]
df_bacen["data"] = pd.to_datetime(df_bacen["data"], format="%d/%m/%Y")
# Values that cannot be parsed become NaN rather than raising.
df_bacen["valor"] = pd.to_numeric(df_bacen["valor"], errors="coerce")
df_bacen.head()


Unnamed: 0,data,valor
0,2015-05-04,0.049037
1,2015-05-05,0.049037
2,2015-05-06,0.049037
3,2015-05-07,0.049037
4,2015-05-08,0.049037


In [20]:
import os

import boto3
import requests
from botocore.exceptions import ClientError
from dotenv import load_dotenv

load_dotenv()
# MinIO settings
MINIO_ENDPOINT = "http://localhost:9000"
ACCESS_KEY = os.getenv("MINIO_USER")
SECRET_KEY = os.getenv("MINIO_PASSWORD")
print(os.getenv("AIRFLOW_PROJ_DIR"))
BUCKET_NAME = "lakehouse"

# Connect to MinIO (S3-compatible)
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

# Check whether the bucket exists and create it when it is missing.
try:
    s3.head_bucket(Bucket=BUCKET_NAME)
    print(f"✅ Bucket '{BUCKET_NAME}' já existe.")
except ClientError as e:
    # Use the numeric HTTP status from the response metadata. Casting
    # Error.Code with int() raises ValueError for symbolic codes such as
    # "NoSuchBucket" or "AccessDenied", which would hide the original error.
    status_code = e.response.get("ResponseMetadata", {}).get("HTTPStatusCode")
    if status_code == requests.codes.not_found:
        print(f"🔧 Criando bucket '{BUCKET_NAME}'...")
        s3.create_bucket(Bucket=BUCKET_NAME)
        print("✅ Bucket criado com sucesso.")
    else:
        # Anything other than "not found" (e.g. permissions) is re-raised unchanged.
        raise


/airflow
✅ Bucket 'lakehouse' já existe.


In [13]:
import os


def find_in_path(filename):
    """Locate ``filename`` in the directories listed in the ``PATH`` variable.

    Args:
        filename: Bare file name to look for, e.g. ``"winutils.exe"``.

    Returns:
        The joined path of the first ``PATH`` entry that contains a regular
        file with that name, or ``None`` when no entry does.
    """
    candidates = (
        os.path.join(directory, filename)
        for directory in os.environ["PATH"].split(os.pathsep)
    )
    # PATH order is preserved, so the first hit wins.
    return next((candidate for candidate in candidates if os.path.isfile(candidate)), None)

# Report where (if anywhere) each of the two files is found on PATH.
for required_file in ("winutils.exe", "hadoop.dll"):
    print(f"{required_file}:", find_in_path(required_file))

winutils.exe: C:\Users\Rodrigo\.spark\hadoop\bin\winutils.exe
hadoop.dll: C:\Users\Rodrigo\.spark\hadoop\bin\hadoop.dll


In [14]:
# 3. Save as Delta (Bronze) on MinIO
# Convert the pandas frame into a Spark DataFrame so it can be written through Spark.
df_bacen_spark = spark.createDataFrame(data=df_bacen)

In [15]:
# Write the frame in Delta format to the bronze path, replacing what is already there.
(
    df_bacen_spark.write
    .format("delta")
    .mode("overwrite")
    .save("s3a://lakehouse/bronze/bacen_selic/")
)

In [16]:
# 4. Silver-layer transformations
from pyspark.sql import functions as F

# Add year / month / day-of-month columns derived from the "data" column.
silver_bacen = df_bacen_spark.withColumn("ano", F.year("data"))
silver_bacen = silver_bacen.withColumn("mes", F.month("data"))
silver_bacen = silver_bacen.withColumn("dia", F.dayofmonth("data"))

# Write the enriched frame in Delta format to the silver path, replacing what is already there.
(
    silver_bacen.write
    .format("delta")
    .mode("overwrite")
    .save("s3a://lakehouse/silver/bacen_selic/")
)


In [17]:
import great_expectations as gx
from great_expectations.core.expectation_configuration import ExpectationConfiguration

context = gx.data_context.DataContext("gx")

# Make sure the Spark datasource exists before looking it up, so this cell does
# not depend on an earlier session having registered it.
if "spark_datasource" not in context.datasources:
    context.sources.add_spark(name="spark_datasource")

# (Re-)register the asset with the current silver DataFrame. The asset and the
# suite are both named for the silver layer, so `silver_bacen` is registered
# here, not the bronze `df_bacen_spark`.
spark_ds = context.get_datasource("spark_datasource")
if "silver_bacen_selic" in [asset.name for asset in spark_ds.assets]:
    spark_ds.delete_asset("silver_bacen_selic")
spark_ds.add_dataframe_asset(name="silver_bacen_selic", dataframe=silver_bacen)

# Overwrite the expectation suite with only valid expectations (NO validation run)
suite_name = "silver_bacen_selic_suite"
if suite_name in [suite.expectation_suite_name for suite in context.list_expectation_suites()]:
    context.delete_expectation_suite(suite_name)
suite = context.add_expectation_suite(suite_name)

# Add expectations directly to the suite (no validator, no data access)
suite.expectations = [
    # Schema: both source columns must be present.
    ExpectationConfiguration(
        expectation_type="expect_column_to_exist",
        kwargs={"column": "data"}
    ),
    ExpectationConfiguration(
        expectation_type="expect_column_to_exist",
        kwargs={"column": "valor"}
    ),
    # Completeness: no nulls in either column.
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "data"}
    ),
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "valor"}
    ),
    # Types: Spark type names expected for each column.
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_of_type",
        kwargs={"column": "data", "type_": "TimestampType"}
    ),
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_of_type",
        kwargs={"column": "valor", "type_": "DoubleType"}
    ),
    # Range: values must be non-negative (no upper bound is set).
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={"column": "valor", "min_value": 0}
    ),
    # Volume: the table must contain at least one row.
    ExpectationConfiguration(
        expectation_type="expect_table_row_count_to_be_between",
        kwargs={"min_value": 1}
    ),
]

context.save_expectation_suite(suite)
print("✅ Clean expectation suite created and saved (no validation run, no validator used).")

✅ Clean expectation suite created and saved (no validation run, no validator used).


In [18]:
import great_expectations as gx
from great_expectations.checkpoint import SimpleCheckpoint

context = gx.data_context.DataContext("gx")

# 1. Ensure Spark datasource exists
if "spark_datasource" not in context.datasources:
    context.sources.add_spark(name="spark_datasource")

# 2. (Re-)register the asset with the current silver DataFrame.
#    The asset, suite and checkpoint are all named for the silver layer, so the
#    frame validated here is `silver_bacen`, not the bronze `df_bacen_spark`.
spark_ds = context.get_datasource("spark_datasource")
asset_names = [asset.name for asset in spark_ds.assets]
if "silver_bacen_selic" in asset_names:
    spark_ds.delete_asset("silver_bacen_selic")
spark_ds.add_dataframe_asset(name="silver_bacen_selic", dataframe=silver_bacen)

# 3. Ensure expectation suite exists
suite_names = [suite.expectation_suite_name for suite in context.list_expectation_suites()]
if "silver_bacen_selic_suite" not in suite_names:
    context.add_expectation_suite("silver_bacen_selic_suite")

# 4. Build the batch request (Fluent API: just use asset name)
batch_request = {
    "datasource_name": "spark_datasource",
    "data_asset_name": "silver_bacen_selic",
}

# 5. Run the checkpoint: validate the registered asset against the suite
checkpoint = SimpleCheckpoint(
    name="silver_bacen_checkpoint",
    data_context=context,
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": "silver_bacen_selic_suite",
        }
    ]
)

result = checkpoint.run()
# The overall outcome is printed only; a failed validation does not raise here.
print("✅ Validation Success:", result["success"])

Calculating Metrics:   0%|          | 0/22 [00:00<?, ?it/s]

✅ Validation Success: True
