In [16]:
import os
import sys

# Configure the Spark-related environment in one step, before pyspark is
# imported below: both the worker and the driver interpreter are pinned to the
# Python running this kernel, and the memory variables are set to 8g each.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
        "SPARK_DRIVER_MEMORY": "8g",
        "SPARK_EXECUTOR_MEMORY": "8g",
    }
)
import pandas as pd
import pyspark

# Report which interpreter and library versions this kernel is using.
print(sys.executable)
for label, version in (("PySpark:", pyspark.__version__), ("pandas:", pd.__version__)):
    print(label, version)

c:\Users\Mo.DESKTOP-3491UQD\Documents\Open-Finance-LakeHouse\.venv\Scripts\python.exe
PySpark: 3.5.5
pandas: 2.2.3


In [17]:
import io
import tempfile
from datetime import datetime

import requests
from pyspark.sql.types import DateType, DoubleType, StructField, StructType

from open_finance_lakehouse.utils.spark_session import get_spark_session

# --- Parameters ---
def shift_years_back(moment, years):
    """Return ``moment`` moved back by ``years`` calendar years.

    ``moment.replace(year=...)`` raises ``ValueError`` when ``moment`` falls on
    29 February and the target year is not a leap year. In that case the result
    falls back to 28 February of the target year instead of failing.

    Args:
        moment: A ``datetime`` (or ``date``) to shift.
        years: Number of calendar years to subtract.

    Returns:
        An object of the same type as ``moment``, ``years`` years earlier.
    """
    target_year = moment.year - years
    try:
        return moment.replace(year=target_year)
    except ValueError:
        # Only reachable for 29 February -> non-leap target year.
        return moment.replace(year=target_year, day=28)


bacen_series_id = 11  # Example: SELIC

# Take "today" once so the start and end of the window are derived from the
# same instant (avoids an inconsistent window if the cell runs across midnight).
reference_date = datetime.today()
bacen_start_date = shift_years_back(reference_date, 10).strftime("%d/%m/%Y")
bacen_end_date = reference_date.strftime("%d/%m/%Y")

cvm_year = 2024
cvm_month = 4

# --- Spark Session ---
# Clear the helper's cache before calling it so the call below does not hand
# back a previously cached session.
# NOTE(review): presumably get_spark_session is memoized (e.g. functools.lru_cache);
# confirm in open_finance_lakehouse.utils.spark_session.
get_spark_session.cache_clear()
spark = get_spark_session()

In [18]:


# --- BACEN Fetch (pandas) ---
# Build the SGS request URL for the configured series and date window.
bacen_url = (
    f"https://api.bcb.gov.br/dados/serie/bcdata.sgs.{bacen_series_id}/dados"
    f"?formato=json&dataInicial={bacen_start_date}&dataFinal={bacen_end_date}"
)
# Without a timeout requests waits indefinitely, which would hang the cell if
# the API stops responding.
bacen_response = requests.get(bacen_url, timeout=30)
if bacen_response.status_code != 200:
    raise ValueError(f"Erro ao buscar dados do BACEN: {bacen_response.text}")

bacen_df = pd.read_json(io.StringIO(bacen_response.text))
if bacen_df.empty:
    # An empty payload would otherwise fail below with an opaque
    # "Length mismatch" error when the column names are assigned.
    raise ValueError(f"Nenhum dado retornado pelo BACEN: {bacen_url}")

# Normalise column names and parse both columns into typed values;
# unparseable numbers become NaN instead of raising.
bacen_df.columns = ["data", "valor"]
bacen_df["data"] = pd.to_datetime(bacen_df["data"], format="%d/%m/%Y")
bacen_df["valor"] = pd.to_numeric(bacen_df["valor"], errors="coerce")
print(len(bacen_df), '\n',bacen_df)

# Explicit schema kept for reference only; it is not passed to
# createDataFrame below, so Spark infers the column types itself.
# schema = StructType([
#     StructField("data", DateType(), True),
#     StructField("valor", DoubleType(), True)
# ])
# Convert BACEN pandas DataFrame to Spark DataFrame.
# Arrow-based conversion is switched off for this session before converting.
# NOTE(review): the reason for disabling Arrow is not recorded here - confirm.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
bacen_spark_df = spark.createDataFrame(bacen_df)



2509 
            data     valor
0    2015-04-27  0.047314
1    2015-04-28  0.047350
2    2015-04-29  0.047350
3    2015-04-30  0.049037
4    2015-05-04  0.049037
...         ...       ...
2504 2025-04-17  0.052531
2505 2025-04-22  0.052531
2506 2025-04-23  0.052531
2507 2025-04-24  0.052531
2508 2025-04-25  0.052531

[2509 rows x 2 columns]


In [19]:
# Minimal one-row frame to check that pandas -> Spark conversion works at all.
test_df = pd.DataFrame([{"data": pd.Timestamp("2024-01-01"), "valor": 1.23}])
test_spark_df = spark.createDataFrame(test_df)
test_spark_df.show()

+-------------------+-----+
|               data|valor|
+-------------------+-----+
|2024-01-01 00:00:00| 1.23|
+-------------------+-----+



In [20]:
# Preview the first ten rows of the BACEN series as loaded into Spark.
bacen_spark_df.show(n=10)

+-------------------+--------+
|               data|   valor|
+-------------------+--------+
|2015-04-27 00:00:00|0.047314|
|2015-04-28 00:00:00| 0.04735|
|2015-04-29 00:00:00| 0.04735|
|2015-04-30 00:00:00|0.049037|
|2015-05-04 00:00:00|0.049037|
|2015-05-05 00:00:00|0.049037|
|2015-05-06 00:00:00|0.049037|
|2015-05-07 00:00:00|0.049037|
|2015-05-08 00:00:00|0.049037|
|2015-05-11 00:00:00|0.049037|
+-------------------+--------+
only showing top 10 rows



In [21]:
import zipfile

# --- CVM Fetch (Spark) ---
# Build the URL of the monthly "informe diario" archive for the configured
# year and month (month is zero-padded to two digits).
cvm_base_url = "https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/"
cvm_file_name = f"inf_diario_fi_{cvm_year}{str(cvm_month).zfill(2)}.zip"
cvm_url = f"{cvm_base_url}{cvm_file_name}"
print("CVM URL:", cvm_url)

# A timeout prevents the cell from hanging forever, and the status check stops
# an HTTP error page from being written to disk and later failing as a
# confusing BadZipFile error.
cvm_response = requests.get(cvm_url, timeout=120)
if cvm_response.status_code != 200:
    raise ValueError(
        f"Erro ao buscar dados da CVM: HTTP {cvm_response.status_code} ({cvm_url})"
    )

# Work inside a private temporary directory: the archive and the extracted CSV
# cannot collide with other files in the shared temp dir, and both are removed
# when the block exits - including when an exception is raised part-way.
with tempfile.TemporaryDirectory() as tmp_dir:
    tmp_path = os.path.join(tmp_dir, cvm_file_name)
    with open(tmp_path, "wb") as zip_file:
        zip_file.write(cvm_response.content)

    with zipfile.ZipFile(tmp_path, "r") as zip_ref:
        zip_members = zip_ref.namelist()
        if not zip_members:
            raise ValueError(f"Arquivo ZIP da CVM vazio: {cvm_url}")
        csv_name = zip_members[0]  # Assume only one CSV in the zip
        # extract() returns the path of the file it actually created.
        csv_path = zip_ref.extract(csv_name, tmp_dir)

    cvm_spark_df = spark.read.csv(
        csv_path,
        header=True,
        sep=";",
        inferSchema=True,
        encoding="ISO-8859-1"
    )

    # Spark reads lazily, so cache and run an action to force the file to be
    # read while it still exists on disk.
    # NOTE(review): if cached partitions are later evicted or lost, Spark would
    # need to re-read the (deleted) file - persist to a durable location if
    # that matters.
    cvm_spark_df.cache()
    cvm_spark_df.count()

# --- EDA Examples ---
# Show the inferred schema and a five-row sample for each dataset in turn.
for schema_title, sample_title, frame in (
    ("BACEN Spark DataFrame Schema:", "BACEN Spark DataFrame Sample:", bacen_spark_df),
    ("CVM Spark DataFrame Schema:", "CVM Spark DataFrame Sample:", cvm_spark_df),
):
    print(schema_title)
    frame.printSchema()
    print(sample_title)
    frame.show(5)

# Both frames remain available for further EDA in Spark; to continue in pandas
# instead, convert them:
# bacen_pd = bacen_spark_df.toPandas()
# cvm_pd = cvm_spark_df.toPandas()

CVM URL: https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/inf_diario_fi_202404.zip
BACEN Spark DataFrame Schema:
root
 |-- data: timestamp (nullable = true)
 |-- valor: double (nullable = true)

BACEN Spark DataFrame Sample:
+-------------------+--------+
|               data|   valor|
+-------------------+--------+
|2015-04-27 00:00:00|0.047314|
|2015-04-28 00:00:00| 0.04735|
|2015-04-29 00:00:00| 0.04735|
|2015-04-30 00:00:00|0.049037|
|2015-05-04 00:00:00|0.049037|
+-------------------+--------+
only showing top 5 rows

CVM Spark DataFrame Schema:
root
 |-- TP_FUNDO_CLASSE: string (nullable = true)
 |-- CNPJ_FUNDO_CLASSE: string (nullable = true)
 |-- ID_SUBCLASSE: string (nullable = true)
 |-- DT_COMPTC: date (nullable = true)
 |-- VL_TOTAL: double (nullable = true)
 |-- VL_QUOTA: double (nullable = true)
 |-- VL_PATRIM_LIQ: double (nullable = true)
 |-- CAPTC_DIA: double (nullable = true)
 |-- RESG_DIA: double (nullable = true)
 |-- NR_COTST: integer (nullable = true)

CVM S