# Fundos de Investimento: Documentos: Informe Diário

source: https://dados.cvm.gov.br/dataset/fi-doc-inf_diario

O INFORME DIÁRIO é um demonstrativo que contém as seguintes informações do fundo, relativas à data de competência:

- Valor total da carteira do fundo;
- Patrimônio líquido;
- Valor da cota;
- Captações realizadas no dia;
- Resgates pagos no dia;
- Número de cotistas.

**Importante**: A partir de maio/2022, os arquivos de dados de Informe Diário de Fundos passarão a ser disponibilizados no formato csv compactado (zip).



### Import relevant packages

In [1]:
from datetime import datetime
from io import BytesIO
from typing import Optional, Union, List

import pandas as pd
import requests
from dateutil.relativedelta import relativedelta

from pyportela.models.DataResource import DataResource
from pyportela.services.CachedDownload import CachedDownload
from pyportela.utils import unzip_csv_to_df

## Recursos de dados

Aqui vamos montar todas as URLs relevantes para a nossa base. Cada URL terá uma data de expiração, que indica se o arquivo deve ser baixado novamente ou se a nossa base já está atualizada em relação a ele.

In [2]:
# Catalogue of every file to fetch, oldest first.
resources: List[DataResource] = []

# 2004-2020: one zip per year, served from the HIST/ folder.
for year in range(2004, 2021):
    fileName = f"inf_diario_fi_{year}.zip"
    url = f"https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/HIST/{fileName}"
    resource = DataResource(dataset_id="br_gov_cvm", url=url, tags=fileName)
    resources.append(resource)

# January 2021 up to the current month: one zip per month.
dt, end = datetime(2021, 1, 1), datetime.now()
while dt < end:
    year, month = dt.year, dt.month
    fileName = f"inf_diario_fi_{year}{month:02d}.zip"
    url = f"https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/{fileName}"
    resource = DataResource(dataset_id="br_gov_cvm", url=url, tags=fileName)
    resources.append(resource)
    dt += relativedelta(months=1)

# Only the newest entry is flagged as expiring, 12 hours from now.
newest = resources[-1]
newest.expires = True
newest.expires_at = datetime.now() + relativedelta(hours=12)

Unnamed: 0,dataset_id,url,expires,expires_at,downloaded_at,created_at,tags
55,br_gov_cvm,https://dados.cvm.gov.br/dados/FI/DOC/INF_DIAR...,False,NaT,,2024-07-29 23:53:24.769197,inf_diario_fi_202403.zip
56,br_gov_cvm,https://dados.cvm.gov.br/dados/FI/DOC/INF_DIAR...,False,NaT,,2024-07-29 23:53:24.769197,inf_diario_fi_202404.zip
57,br_gov_cvm,https://dados.cvm.gov.br/dados/FI/DOC/INF_DIAR...,False,NaT,,2024-07-29 23:53:24.769197,inf_diario_fi_202405.zip
58,br_gov_cvm,https://dados.cvm.gov.br/dados/FI/DOC/INF_DIAR...,False,NaT,,2024-07-29 23:53:24.769197,inf_diario_fi_202406.zip
59,br_gov_cvm,https://dados.cvm.gov.br/dados/FI/DOC/INF_DIAR...,True,2024-07-30 11:53:24.786863,,2024-07-29 23:53:24.769197,inf_diario_fi_202407.zip


In [5]:
def download_history(urls: list):
    """
    Download one file per year for 2004 through 2020.

    For each year, the URL returned by ``get_year_url`` is appended to
    ``urls`` (the list is modified in place) and then passed to
    ``downloads.download`` together with the notebook-level ``expiry``.
    """
    # map() is lazy, so each URL is still built right before it is used.
    for yearly_url in map(get_year_url, range(2004, 2021)):
        urls.append(yearly_url)
        downloads.download(yearly_url, expiry)


download_history(urls)

In [7]:
def download_recent_history(urls: list):
    """
    Download one file per month, starting at January 2021.

    The walk stops once the month start is no longer earlier than
    "now minus one month". For each month visited, the URL returned by
    ``get_year_month_url`` is appended to ``urls`` (modified in place) and
    passed to ``downloads.download`` with the notebook-level ``expiry``.
    """
    one_month = relativedelta(months=1)
    cursor = datetime(2021, 1, 1)
    cutoff = datetime.now() - one_month
    while cursor < cutoff:
        monthly_url = get_year_month_url(cursor.year, cursor.month)
        urls.append(monthly_url)
        downloads.download(monthly_url, expiry)
        cursor += one_month


download_recent_history(urls)

In [1]:
# The current month's file is downloaded with its own 12-hour expiry
# rather than the shared `expiry` used for the other files.
current_month_date = datetime.now()
current_month_url = get_year_month_url(current_month_date.year, current_month_date.month)
twelve_hours = relativedelta(hours=12)
downloads.download(current_month_url, twelve_hours)

NameError: name 'datetime' is not defined

In [4]:
def to_df(zip_file: Union[str, BytesIO]) -> pd.DataFrame:
    """
    Load a zipped Informe Diário CSV into a typed DataFrame.

    The archive is read through ``unzip_csv_to_df`` with ``;`` as separator
    and every column as ``str``; selected columns are then converted.

    Args:
        zip_file: Path to the zip archive, or its content as ``BytesIO``.

    Returns:
        A DataFrame with lower-cased column names in which ``dt_comptc``
        holds ``datetime.date`` objects, the value/count columns
        (``vl_total``, ``vl_quota``, ``vl_patrim_liq``, ``captc_dia``,
        ``resg_dia``, ``nr_cotst``) are floats, and ``tp_fundo`` is always
        present (filled with ``None`` when the file does not provide it).

    Raises:
        KeyError: If ``DT_COMPTC`` or one of the numeric columns is missing.
        ValueError: If a date does not match ``%Y-%m-%d`` or a numeric
            column holds a value that cannot be converted to float.
    """
    numeric_cols = [
        "VL_TOTAL",
        "VL_QUOTA",
        "VL_PATRIM_LIQ",
        "CAPTC_DIA",
        "RESG_DIA",
        "NR_COTST",
    ]

    df = unzip_csv_to_df(zip_file, sep=";", dtype=str)

    # Parse the whole column at once instead of calling strptime per row;
    # the files hold millions of rows. Missing dates become NaT instead of
    # raising, valid dates are still plain `datetime.date` objects.
    df["DT_COMPTC"] = pd.to_datetime(df["DT_COMPTC"], format="%Y-%m-%d").dt.date

    for col in numeric_cols:
        df[col] = df[col].astype(float)

    # Not every file carries TP_FUNDO; add it so all frames share one schema.
    if "TP_FUNDO" not in df.columns:
        df["TP_FUNDO"] = None

    df.rename(columns=str.lower, inplace=True)

    # NOTE(review): this is a plain instance attribute; pandas is not expected
    # to carry it over to frames derived from this one (e.g. via pd.concat).
    df.Name = "fi_doc_inf_diario"
    return df


# df_2006 = to_df(df_2006_bytes)
# df_2006

In [5]:
import os

from pyportela.repositories.PostgresWarehouse import PostgresWarehouse

# The connection URL can be supplied through BR_GOV_CVM_DATABASE_URL so that
# credentials do not have to live in the notebook.
# NOTE(review): the fallback below still embeds a plaintext password; it is
# kept only so existing local setups keep working. Rotate that password and
# drop the literal once the environment variable is in use.
warehouse = PostgresWarehouse(
    os.environ.get(
        "BR_GOV_CVM_DATABASE_URL",
        "postgresql://postgres:popo8160@localhost:5432/br_gov_cvm",
    )
)

: 

In [6]:
def load_all_data(urls: list):
    """
    Build a single DataFrame from every URL in ``urls``.

    Each URL is fetched through ``downloads.download`` (with the
    notebook-level ``expiry``) and parsed by ``to_df``; the per-file frames
    are concatenated in the order the URLs were given.
    """
    frames = [to_df(downloads.download(url, expiry)) for url in urls]
    return pd.concat(frames)


df_all = load_all_data(urls)
len(df_all)

In [None]:
import sqlite3
from typing import List, Optional


def save_df_to_sqlite(
    df: pd.DataFrame,
    table_name: str,
    db_path: str,
    replace_col: Optional[str] = None,
    overwrite: bool = False,
):
    """
    Write ``df`` into a SQLite table, optionally replacing existing rows.

    Modes:
        * ``overwrite=True``: the table is dropped and recreated from ``df``.
        * ``replace_col`` given: rows already in the table whose
          ``replace_col`` value also occurs in ``df`` are deleted first, then
          ``df`` is appended. If the table does not exist yet, the delete is
          skipped and the table is created by the append.
        * otherwise: ``df`` is appended as is.

    Args:
        df: Data to store.
        table_name: Target table. Interpolated into the DELETE statement
            unquoted, so it must be a trusted, valid SQL identifier.
        db_path: Path of the SQLite database file.
        replace_col: Column of ``df`` (and of the table) used as replace key.
        overwrite: Replace the whole table instead of appending.

    Returns:
        Whatever ``DataFrame.to_sql`` returns.

    Raises:
        KeyError: If ``replace_col`` is not a column of ``df``.
        sqlite3.OperationalError: If the DELETE fails, e.g. because the
            existing table has no column named ``replace_col``.
    """
    # Keep each DELETE well below SQLite's bound-parameter limit
    # (999 in older builds), whatever the number of distinct keys.
    max_params_per_delete = 500

    # Resolved before connecting so a bad column name fails without
    # touching the database file.
    stale_values = df[replace_col].unique().tolist() if replace_col else []

    con = sqlite3.connect(db_path)
    try:
        if overwrite:
            return df.to_sql(table_name, con, if_exists="replace", index=False)

        if stale_values:
            # On the very first load the table does not exist yet; there is
            # nothing to delete and to_sql below will create it.
            table_exists = (
                con.execute(
                    "SELECT 1 FROM sqlite_master "
                    "WHERE type = 'table' AND name = ? COLLATE NOCASE",
                    (table_name,),
                ).fetchone()
                is not None
            )
            if table_exists:
                # The deletes stay in the connection's open transaction; they
                # are committed together with the append by to_sql.
                for start in range(0, len(stale_values), max_params_per_delete):
                    batch = stale_values[start : start + max_params_per_delete]
                    placeholders = ", ".join("?" * len(batch))
                    con.execute(
                        f"DELETE FROM {table_name} "
                        f"WHERE {replace_col} IN ({placeholders})",
                        batch,
                    ).close()

        return df.to_sql(table_name, con, if_exists="append", index=False)
    finally:
        con.close()

In [None]:
# save_df_to_sqlite(df_2006, "fi_doc_inf_diario", "cvm.db", replace_col="DT_COMPTC")

In [None]:
# Sanity check: total number of rows and number of distinct funds stored.
con = sqlite3.connect("cvm.db")
try:
    cur = con.cursor()
    try:
        cur.execute("SELECT COUNT(*) FROM fi_doc_inf_diario")
        print(cur.fetchone())

        # Count the distinct rows inside SQLite instead of fetching them all
        # into Python only to take len(); the subquery keeps NULL counted as
        # one distinct value, exactly like SELECT DISTINCT did.
        cur.execute(
            "SELECT COUNT(*) FROM (SELECT DISTINCT CNPJ_FUNDO FROM fi_doc_inf_diario)"
        )
        print(cur.fetchone()[0])
    finally:
        cur.close()
finally:
    # Always release the database file, even if a query fails.
    con.close()
    

(66951678,)
50905
