In [2]:
import os


def find_in_path(filename):
    """Return the first regular file named ``filename`` found on ``PATH``.

    Every directory listed in the ``PATH`` environment variable is probed in
    order, and the joined path of the first match is returned.

    Args:
        filename: Name of the file to look for (e.g. ``"winutils.exe"``).

    Returns:
        The joined path (``PATH`` entry + ``filename``) of the first match, or
        ``None`` when no entry contains the file or ``PATH`` is not set.
    """
    raw_path = os.environ.get("PATH")
    if raw_path is None:
        # No PATH variable at all: nothing to search, and not an error.
        return None
    for directory in raw_path.split(os.pathsep):
        candidate = os.path.join(directory, filename)
        if os.path.isfile(candidate):
            return candidate
    return None

# Report where (if anywhere) each Hadoop native helper is found on PATH.
for helper_name in ("winutils.exe", "hadoop.dll"):
    print(f"{helper_name}:", find_in_path(helper_name))

winutils.exe: C:\Users\Rodrigo\.spark\hadoop\bin\winutils.exe
hadoop.dll: C:\Users\Rodrigo\.spark\hadoop\bin\hadoop.dll


In [3]:

import sys

# Point both PySpark interpreter variables at the Python running this kernel.
for interpreter_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[interpreter_var] = sys.executable
from open_finance_lakehouse.utils.spark_session import get_spark_session

# Clear the factory's cache first (presumably an lru_cache-style memo), so the
# call below is not answered from a previously cached session.
get_spark_session.cache_clear()
spark = get_spark_session()

In [4]:
# 2. Ingestão BACEN API
import io
from datetime import datetime

import pandas as pd
import requests

from open_finance_lakehouse.utils.spark_session import get_spark_session

# --- Parameters ---
bacen_series_id = 11  # Example: SELIC

# Take a single "now" snapshot so the start and end dates are consistent.
bacen_today = datetime.today()
try:
    bacen_window_start = bacen_today.replace(year=bacen_today.year - 10)
except ValueError:
    # Today is Feb 29 and the year ten years back has no Feb 29: use Feb 28.
    bacen_window_start = bacen_today.replace(year=bacen_today.year - 10, day=28)
bacen_start_date = bacen_window_start.strftime("%d/%m/%Y")
bacen_end_date = bacen_today.strftime("%d/%m/%Y")
cvm_year = 2024
cvm_month = 4


bacen_url = (
    f"https://api.bcb.gov.br/dados/serie/bcdata.sgs.{bacen_series_id}/dados"
    f"?formato=json&dataInicial={bacen_start_date}&dataFinal={bacen_end_date}"
)

# Fail fast on a stalled connection or an HTTP error instead of handing an
# error body to the JSON parser below.
response = requests.get(bacen_url, timeout=30)
response.raise_for_status()

# Parse the JSON payload and normalise it to a (data, valor) frame with a
# datetime column and a numeric column; unparsable values become NaN.
df_bacen = pd.read_json(io.StringIO(response.text))
df_bacen.columns = ["data", "valor"]
df_bacen["data"] = pd.to_datetime(df_bacen["data"], format="%d/%m/%Y")
df_bacen["valor"] = pd.to_numeric(df_bacen["valor"], errors="coerce")
df_bacen.head()


Unnamed: 0,data,valor
0,2015-05-04,0.049037
1,2015-05-05,0.049037
2,2015-05-06,0.049037
3,2015-05-07,0.049037
4,2015-05-08,0.049037


In [5]:
import tempfile
import zipfile

# --- CVM Fetch (Spark) ---
cvm_base_url = "https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/"
cvm_file_name = f"inf_diario_fi_{cvm_year}{str(cvm_month).zfill(2)}.zip"
cvm_url = f"{cvm_base_url}{cvm_file_name}"
print("CVM URL:", cvm_url)

# Download first and check the status, so an HTTP error page is never written
# to disk and later mistaken for a zip archive.
cvm_response = requests.get(cvm_url, timeout=120)
cvm_response.raise_for_status()
content = cvm_response.content

with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
    tmp.write(content)
    tmp_path = tmp.name

csv_path = None
try:
    with zipfile.ZipFile(tmp_path, 'r') as zip_ref:
        csv_name = zip_ref.namelist()[0]  # Assume only one CSV in the zip
        # extract() returns the path it actually wrote to.
        csv_path = zip_ref.extract(csv_name, os.path.dirname(tmp_path))

    cvm_spark_df = spark.read.csv(
        csv_path,
        header=True,
        sep=";",
        inferSchema=True,
        encoding="ISO-8859-1"
    )

    # Spark reads lazily: cache and run an action so the rows are loaded
    # before the source file is removed below.
    cvm_spark_df.cache()
    cvm_spark_df.count()
finally:
    # Always remove the temporary files, including when extraction or the
    # Spark read fails part-way through.
    os.unlink(tmp_path)
    if csv_path is not None and os.path.exists(csv_path):
        os.unlink(csv_path)


print("CVM Spark DataFrame Schema:")
cvm_spark_df.printSchema()
print("CVM Spark DataFrame Sample:")
cvm_spark_df.show(5)

CVM URL: https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/inf_diario_fi_202404.zip
CVM Spark DataFrame Schema:
root
 |-- TP_FUNDO_CLASSE: string (nullable = true)
 |-- CNPJ_FUNDO_CLASSE: string (nullable = true)
 |-- ID_SUBCLASSE: string (nullable = true)
 |-- DT_COMPTC: date (nullable = true)
 |-- VL_TOTAL: double (nullable = true)
 |-- VL_QUOTA: double (nullable = true)
 |-- VL_PATRIM_LIQ: double (nullable = true)
 |-- CAPTC_DIA: double (nullable = true)
 |-- RESG_DIA: double (nullable = true)
 |-- NR_COTST: integer (nullable = true)

CVM Spark DataFrame Sample:
+---------------+------------------+------------+----------+----------+----------+-------------+---------+--------+--------+
|TP_FUNDO_CLASSE| CNPJ_FUNDO_CLASSE|ID_SUBCLASSE| DT_COMPTC|  VL_TOTAL|  VL_QUOTA|VL_PATRIM_LIQ|CAPTC_DIA|RESG_DIA|NR_COTST|
+---------------+------------------+------------+----------+----------+----------+-------------+---------+--------+--------+
|             FI|00.017.024/0001-53|        NULL

In [6]:
import os

import boto3
import requests
from botocore.exceptions import ClientError
from dotenv import load_dotenv

load_dotenv()
# MinIO settings; credentials come from the environment populated by load_dotenv().
MINIO_ENDPOINT = "http://localhost:9000"
ACCESS_KEY = os.getenv("MINIO_USER")
SECRET_KEY = os.getenv("MINIO_PASSWORD")
print(os.getenv("AIRFLOW_PROJ_DIR"))
BUCKET_NAME = "lakehouse"

# Connect to MinIO (S3-compatible)
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

# Check for the bucket and create it when it does not exist.
try:
    s3.head_bucket(Bucket=BUCKET_NAME)
    print(f"✅ Bucket '{BUCKET_NAME}' já existe.")
except ClientError as e:
    # Error codes are strings and are not always numeric (e.g. "NoSuchBucket",
    # "AccessDenied"). Compare as text: casting to int would raise ValueError
    # and hide the original ClientError.
    error_code = str(e.response.get("Error", {}).get("Code", ""))
    if error_code in ("404", "NoSuchBucket"):
        print(f"🔧 Criando bucket '{BUCKET_NAME}'...")
        s3.create_bucket(Bucket=BUCKET_NAME)
        print("✅ Bucket criado com sucesso.")
    else:
        raise


./airflow
✅ Bucket 'lakehouse' já existe.


In [7]:
# 3. Save as Delta (Bronze) in MinIO
# Convert the pandas BACEN frame into a Spark DataFrame.
df_bacen_spark = spark.createDataFrame(df_bacen)

In [8]:
# Persist each raw frame as a Delta table under the bronze prefix,
# replacing whatever a previous run left there.
bronze_targets = (
    (df_bacen_spark, "s3a://lakehouse/bronze/bacen_selic/"),
    (cvm_spark_df, "s3a://lakehouse/bronze/cvm_if_di/"),
)
for bronze_frame, bronze_path in bronze_targets:
    bronze_frame.write.format("delta").mode("overwrite").save(bronze_path)

In [9]:
# 4. Transformações da camada Silver
from pyspark.sql import functions as F

def _add_date_parts(frame, date_col):
    """Append ``ano``, ``mes`` and ``dia`` columns derived from ``date_col``."""
    return (
        frame
        .withColumn("ano", F.year(date_col))
        .withColumn("mes", F.month(date_col))
        .withColumn("dia", F.dayofmonth(date_col))
    )


# BACEN: only the calendar breakdown of the reference date is added.
silver_bacen = _add_date_parts(df_bacen_spark, "data")

# CVM: net daily inflow (subscriptions minus redemptions), the calendar
# breakdown, and only rows with a positive quota value and net worth.
cvm_with_net_flow = cvm_spark_df.withColumn(
    "cap_liquida_dia", F.col("CAPTC_DIA") - F.col("RESG_DIA")
)
silver_cvm = (
    _add_date_parts(cvm_with_net_flow, "DT_COMPTC")
    .filter(F.col("VL_QUOTA") > 0)
    .filter(F.col("VL_PATRIM_LIQ") > 0)
)

# Persist both silver frames as Delta tables, overwriting previous runs.
silver_targets = (
    (silver_bacen, "s3a://lakehouse/silver/bacen_selic/"),
    (silver_cvm, "s3a://lakehouse/silver/cvm_if_di/"),
)
for silver_frame, silver_path in silver_targets:
    silver_frame.write.format("delta").mode("overwrite").save(silver_path)


In [14]:
# Preview the first 20 silver CVM rows without truncating column values.
silver_cvm.show(20, truncate=False)

+---------------+------------------+------------+----------+----------+----------+-------------+---------+--------+--------+---------------+----+---+---+
|TP_FUNDO_CLASSE|CNPJ_FUNDO_CLASSE |ID_SUBCLASSE|DT_COMPTC |VL_TOTAL  |VL_QUOTA  |VL_PATRIM_LIQ|CAPTC_DIA|RESG_DIA|NR_COTST|cap_liquida_dia|ano |mes|dia|
+---------------+------------------+------------+----------+----------+----------+-------------+---------+--------+--------+---------------+----+---+---+
|FI             |00.017.024/0001-53|NULL        |2024-04-01|1110061.31|35.0215439|1110855.35   |0.0      |0.0     |1       |0.0            |2024|4  |1  |
|FI             |00.017.024/0001-53|NULL        |2024-04-02|1110510.5 |35.0310826|1111157.91   |0.0      |0.0     |1       |0.0            |2024|4  |2  |
|FI             |00.017.024/0001-53|NULL        |2024-04-03|1110959.34|35.042477 |1111519.33   |0.0      |0.0     |1       |0.0            |2024|4  |3  |
|FI             |00.017.024/0001-53|NULL        |2024-04-04|1111404.79|35.05

In [11]:
import great_expectations as gx
from great_expectations.core.expectation_configuration import ExpectationConfiguration

context = gx.data_context.DataContext("gx")

# (Re-)register the asset with the current silver DataFrame. The asset is the
# silver BACEN table, so it must wrap `silver_bacen` (the frame written to
# s3a://lakehouse/silver/bacen_selic/), not the bronze `df_bacen_spark`.
spark_ds = context.get_datasource("spark_datasource")
if "silver_bacen_selic" in [asset.name for asset in spark_ds.assets]:
    spark_ds.delete_asset("silver_bacen_selic")
spark_ds.add_dataframe_asset(name="silver_bacen_selic", dataframe=silver_bacen)

# Overwrite the expectation suite with only valid expectations (NO validation run)
suite_name = "silver_bacen_selic_suite"
if suite_name in [suite.expectation_suite_name for suite in context.list_expectation_suites()]:
    context.delete_expectation_suite(suite_name)
suite = context.add_expectation_suite(suite_name)

# (expectation type, kwargs) pairs, in the order they are stored in the suite.
bacen_expectation_specs = [
    ("expect_column_to_exist", {"column": "data"}),
    ("expect_column_to_exist", {"column": "valor"}),
    ("expect_column_values_to_not_be_null", {"column": "data"}),
    ("expect_column_values_to_not_be_null", {"column": "valor"}),
    ("expect_column_values_to_be_of_type", {"column": "data", "type_": "TimestampType"}),
    ("expect_column_values_to_be_of_type", {"column": "valor", "type_": "DoubleType"}),
    ("expect_column_values_to_be_between", {"column": "valor", "min_value": 0}),
    ("expect_table_row_count_to_be_between", {"min_value": 1}),
]

# Add expectations directly to the suite (no validator, no data access)
suite.expectations = [
    ExpectationConfiguration(expectation_type=expectation_type, kwargs=kwargs)
    for expectation_type, kwargs in bacen_expectation_specs
]

context.save_expectation_suite(suite)
print("✅ Clean expectation suite created and saved (no validation run, no validator used).")

✅ Clean expectation suite created and saved (no validation run, no validator used).


In [25]:
import great_expectations as gx
from great_expectations.core.expectation_configuration import ExpectationConfiguration

context = gx.data_context.DataContext("gx")

# Replace any previously registered asset so it wraps the current DataFrame.
spark_ds = context.get_datasource("spark_datasource")
if any(asset.name == "silver_cvm_if_di" for asset in spark_ds.assets):
    spark_ds.delete_asset("silver_cvm_if_di")
spark_ds.add_dataframe_asset(name="silver_cvm_if_di", dataframe=silver_cvm)

# Start from an empty suite: drop the stored one (if any) and recreate it.
suite_name = "silver_cvm_if_di_suite"
if any(
    stored.expectation_suite_name == suite_name
    for stored in context.list_expectation_suites()
):
    context.delete_expectation_suite(suite_name)
suite = context.add_expectation_suite(suite_name)

# Expectations are declared as data and attached to the suite directly, so no
# validator is created and no data is read here.
cvm_existing_columns = ["DT_COMPTC", "CNPJ_FUNDO_CLASSE", "VL_QUOTA", "VL_PATRIM_LIQ", "cap_liquida_dia"]
cvm_non_null_columns = ["DT_COMPTC", "CNPJ_FUNDO_CLASSE", "VL_QUOTA", "VL_PATRIM_LIQ"]
cvm_column_types = [("DT_COMPTC", "DateType"), ("VL_QUOTA", "DoubleType"), ("VL_PATRIM_LIQ", "DoubleType")]
cvm_non_negative_columns = ["VL_QUOTA", "VL_PATRIM_LIQ"]

cvm_expectation_specs = (
    [("expect_column_to_exist", {"column": name}) for name in cvm_existing_columns]
    + [("expect_column_values_to_not_be_null", {"column": name}) for name in cvm_non_null_columns]
    + [
        ("expect_column_values_to_be_of_type", {"column": name, "type_": spark_type})
        for name, spark_type in cvm_column_types
    ]
    + [
        ("expect_column_values_to_be_between", {"column": name, "min_value": 0})
        for name in cvm_non_negative_columns
    ]
    + [("expect_table_row_count_to_be_between", {"min_value": 1})]
)

suite.expectations = [
    ExpectationConfiguration(expectation_type=expectation_type, kwargs=kwargs)
    for expectation_type, kwargs in cvm_expectation_specs
]

context.save_expectation_suite(suite)
print("✅ Clean expectation suite for silver_cvm created and saved (no validation run, no validator used).")

✅ Clean expectation suite for silver_cvm created and saved (no validation run, no validator used).


In [26]:
import great_expectations as gx
from great_expectations.checkpoint import SimpleCheckpoint

context = gx.data_context.DataContext("gx")

# Ensure Spark datasource exists
if "spark_datasource" not in context.datasources:
    context.sources.add_spark(name="spark_datasource")

# (Re-)register the assets with the current silver DataFrames. The BACEN asset
# wraps `silver_bacen` (the frame written to the silver layer), not the bronze
# `df_bacen_spark`, so the checkpoint validates what the asset name says.
spark_ds = context.get_datasource("spark_datasource")
asset_names = [asset.name for asset in spark_ds.assets]
if "silver_bacen_selic" in asset_names:
    spark_ds.delete_asset("silver_bacen_selic")
if "silver_cvm_if_di" in asset_names:
    spark_ds.delete_asset("silver_cvm_if_di")
spark_ds.add_dataframe_asset(name="silver_bacen_selic", dataframe=silver_bacen)
spark_ds.add_dataframe_asset(name="silver_cvm_if_di", dataframe=silver_cvm)

# Ensure expectation suites exist.
# NOTE(review): a suite created here is empty, so its validation cannot fail;
# confirm the suite-building cells ran first.
suite_names = [suite.expectation_suite_name for suite in context.list_expectation_suites()]
if "silver_bacen_selic_suite" not in suite_names:
    context.add_expectation_suite("silver_bacen_selic_suite")
if "silver_cvm_if_di_suite" not in suite_names:
    context.add_expectation_suite("silver_cvm_if_di_suite")

# Build batch requests
batch_request_bacen = {
    "datasource_name": "spark_datasource",
    "data_asset_name": "silver_bacen_selic",
}
batch_request_cvm = {
    "datasource_name": "spark_datasource",
    "data_asset_name": "silver_cvm_if_di",
}

# Run the checkpoint for both assets
checkpoint = SimpleCheckpoint(
    name="silver_combined_checkpoint",
    data_context=context,
    validations=[
        {
            "batch_request": batch_request_bacen,
            "expectation_suite_name": "silver_bacen_selic_suite",
        },
        {
            "batch_request": batch_request_cvm,
            "expectation_suite_name": "silver_cvm_if_di_suite",
        }
    ]
)

result = checkpoint.run()
print("✅ Validation Success (bacen & cvm):", result["success"])

Calculating Metrics:   0%|          | 0/22 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/40 [00:00<?, ?it/s]

✅ Validation Success (bacen & cvm): True


In [None]:
# Print every expectation that failed, across all validations of the run.
for run_result in result["run_results"].values():
    failed_checks = (
        check
        for check in run_result["validation_result"]["results"]
        if not check["success"]
    )
    for check in failed_checks:
        check_config = check["expectation_config"]
        print(f"❌ Failed: {check_config['expectation_type']} on column {check_config['kwargs'].get('column')}")
        print(f"    Details: {check['result']}")


Asset: unknown

Asset: unknown


In [30]:
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def run_gold_pipeline():
    """Build and persist the gold-layer KPI tables from the silver Delta tables.

    Reads both silver tables from ``s3a://lakehouse/silver/``, computes one KPI
    row per fund (CVM) and one summary row for the BACEN series, and overwrites
    the Delta tables under ``s3a://lakehouse/gold/``.

    Returns:
        tuple: ``(gold_bacen, gold_cvm)`` Spark DataFrames.
    """
    # Load both inputs from the silver layer so the function does not depend
    # on notebook globals left behind by earlier cells.
    silver_cvm = spark.read.format("delta").load("s3a://lakehouse/silver/cvm_if_di/")
    silver_bacen = spark.read.format("delta").load("s3a://lakehouse/silver/bacen_selic/")

    def _value_at_date_extreme(extreme, value_col):
        """Pick ``value_col`` on the earliest (F.min) or latest (F.max) DT_COMPTC.

        F.first/F.last depend on row order, which is not guaranteed after a
        groupBy shuffle. Taking min/max of a (date, value) struct orders by
        date first, so the result is deterministic. Rows with a null date or
        value map to a null struct, which min/max skip (same intent as
        ``ignorenulls=True``).
        """
        dated_value = F.when(
            F.col("DT_COMPTC").isNotNull() & F.col(value_col).isNotNull(),
            F.struct(F.col("DT_COMPTC"), F.col(value_col)),
        )
        return extreme(dated_value).getField(value_col)

    # CVM fund KPIs
    gold_cvm = (
        silver_cvm.groupBy("CNPJ_FUNDO_CLASSE")
        .agg(
            F.min("DT_COMPTC").alias("data_inicio"),
            F.max("DT_COMPTC").alias("data_fim"),
            _value_at_date_extreme(F.min, "VL_QUOTA").alias("vl_quota_inicio"),
            _value_at_date_extreme(F.max, "VL_QUOTA").alias("vl_quota_fim"),
            F.sum("cap_liquida_dia").alias("cap_liquida_total"),
            _value_at_date_extreme(F.min, "NR_COTST").alias("cotistas_inicio"),
            _value_at_date_extreme(F.max, "NR_COTST").alias("cotistas_fim")
        )
        # Percentage change of the quota value between first and last date.
        .withColumn("rentabilidade_pct", ((F.col("vl_quota_fim") - F.col("vl_quota_inicio")) / F.col("vl_quota_inicio")) * 100)
        .withColumn("crescimento_cotistas", F.col("cotistas_fim") - F.col("cotistas_inicio"))
    )

    # BACEN indicator KPIs
    gold_bacen = (
        silver_bacen.agg(
            F.min("data").alias("inicio"),
            F.max("data").alias("fim"),
            F.min("valor").alias("min"),
            F.max("valor").alias("max"),
            F.mean("valor").alias("media")
        )
    )

    # Save gold outputs
    gold_bacen.write.format("delta").mode("overwrite").save("s3a://lakehouse/gold/bacen_kpis/")
    gold_cvm.write.format("delta").mode("overwrite").save("s3a://lakehouse/gold/cvm_kpis/")
    return gold_bacen, gold_cvm
# Run the gold pipeline, then preview up to 20 untruncated rows of each result.
gold_bacen, gold_cvm = run_gold_pipeline()
for gold_frame in (gold_bacen, gold_cvm):
    gold_frame.show(20, truncate=False)

+-------------------+-------------------+--------+--------+-------------------+
|inicio             |fim                |min     |max     |media              |
+-------------------+-------------------+--------+--------+-------------------+
|2015-05-04 00:00:00|2025-04-30 00:00:00|0.007469|0.052531|0.03535479306220097|
+-------------------+-------------------+--------+--------+-------------------+

+------------------+-----------+----------+---------------+-------------+--------------------+---------------+------------+--------------------+--------------------+
|CNPJ_FUNDO_CLASSE |data_inicio|data_fim  |vl_quota_inicio|vl_quota_fim |cap_liquida_total   |cotistas_inicio|cotistas_fim|rentabilidade_pct   |crescimento_cotistas|
+------------------+-----------+----------+---------------+-------------+--------------------+---------------+------------+--------------------+--------------------+
|00.222.725/0001-24|2024-04-01 |2024-04-30|4898.6985647   |4933.7912316 |-894263.5           |941    