# CVM Daily Funds Data

In [1]:
from catalog_api.infrastructure.CkanApi import CkanApi
import pandas as pd
from datetime import datetime

from catalog_api.services.CatalogDataService import CatalogDataService

# Service used below to read/write the parquet data of the catalog.
catalog_data = CatalogDataService()
# CKAN client pointed at the CVM open-data portal.
portal = CkanApi("https://dados.cvm.gov.br")
# Identifiers passed to every catalog_data call in the following cells.
organization = "br_gov_cvm"
dataset = "fi-doc-inf_diario"
table_name = "inf_diario_fi"
# Package metadata for the dataset; its `.resources` are iterated below.
fi_doc_inf_diario = portal.get_package(dataset)

In [2]:
from datetime import timezone


def _to_utc(moment):
    """Return *moment* as a timezone-aware UTC datetime.

    Naive values are taken to already be in UTC. Aware values are converted,
    so the instant they denote is preserved instead of having the offset
    overwritten.
    """
    if moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


def _resource_last_updated(resource):
    """Return the most recent of the resource's timestamps in UTC, or None.

    Both ``created`` and ``last_modified`` are considered, so a resource
    that was republished after its creation is still detected as newer.
    """
    stamps = [
        _to_utc(datetime.fromisoformat(raw))
        for raw in (resource.created, resource.last_modified)
        if raw
    ]
    return max(stamps, default=None)


def _parse_reference_date(value):
    """Parse a ``YYYY-MM-DD`` string into a date; blank or NaN gives None."""
    # pandas represents empty CSV cells as float NaN, which is truthy, so a
    # plain truthiness test would hand NaN to strptime and raise TypeError.
    if isinstance(value, str) and value:
        return datetime.strptime(value, "%Y-%m-%d").date()
    return None


def process_resource(resource):
    """Ingest one portal resource into the parquet table if it is stale.

    The partition (year, month) is taken from the trailing ``_YYYYMM`` part
    of the file name in ``resource.url``. Resources whose name does not carry
    a numeric year and month are reported and skipped. The CSV is downloaded
    and stored only when the resource is newer than the parquet data already
    held for that partition.

    Args:
        resource: Object exposing ``url``, ``created`` and ``last_modified``
            (the last two as ISO-8601 strings or None).

    Returns:
        None. The result of the store call is printed.
    """
    url = resource.url
    filename_prefix = url.split("/")[-1].split(".")[0]
    partition_full = filename_prefix.split("_")[-1]
    year, month = partition_full[:4], partition_full[4:6]
    if not year.isnumeric() or not month.isnumeric():
        print(f"Invalid partition: {filename_prefix}")
        return

    partitions = {
        "year": year,
        "month": month,
    }
    # A partition that was never written compares as very old.
    bucket_last_updated = catalog_data.get_parquet_last_updated(
        organization, dataset, table_name, partitions
    ) or datetime(2000, 1, 1)
    resource_last_updated = _resource_last_updated(resource)
    # With no timestamp on the resource, staleness cannot be ruled out, so
    # the resource is ingested rather than silently skipped.
    if (
        resource_last_updated is not None
        and _to_utc(bucket_last_updated) >= resource_last_updated
    ):
        return

    df = pd.read_csv(url, sep=";", encoding="latin1", low_memory=False)
    df["DT_COMPTC"] = df["DT_COMPTC"].apply(_parse_reference_date)
    res = catalog_data.put_parquet_data(
        df, organization, dataset, table_name, partitions
    )
    print(res)


# Run the ingestion for every resource listed in the dataset package.
for resource in fi_doc_inf_diario.resources:  # type: ignore
    process_resource(resource)

Invalid partition: meta_inf_diario_fi


## Query the data

In [3]:
# Obtain a DuckDB helper for the table and preview its first rows.
db = catalog_data.get_duckdb(organization, dataset, table_name)
# NOTE(review): the recorded output of this cell shows this call raising
# AttributeError ('DuckDbParquet' object has no attribute
# 'register_parquet_table'). Confirm the current registration method name
# on the DuckDbParquet class.
db.register_parquet_table("inf_diario_fi")
db.fetch_df("SELECT * FROM inf_diario_fi LIMIT 10")

AttributeError: 'DuckDbParquet' object has no attribute 'register_parquet_table'