In [2]:
import os
from zipfile import ZipFile
import pandas as pd
import requests


# Download CVM data

In [3]:
# Download one `dfp_cia_aberta_<year>.zip` archive per year from `base_url`
# into ./cvm_data and record the local path of each saved file in `file_paths`.
base_path = os.getcwd()
download_path = os.path.join(base_path, 'cvm_data')
base_url = 'https://dados.cvm.gov.br/dados/CIA_ABERTA/DOC/DFP/DADOS/'

# open(..., 'wb') does not create missing parent directories, so make sure the
# target folder exists before the first download.
os.makedirs(download_path, exist_ok=True)

file_paths = []
for year in range(2010, 2022):  # 2010 through 2021 inclusive
    file_name = f'dfp_cia_aberta_{year}.zip'

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(f'{base_url}{file_name}', timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an error page as a ".zip".
    response.raise_for_status()

    file_path = os.path.join(download_path, file_name)

    with open(file_path, 'wb') as file:
        file.write(response.content)

    file_paths.append(file_path)

# Build a DataFrame from the downloaded zip files

In [4]:


# Read every member of every downloaded archive as a ';'-separated, latin-1
# encoded CSV and stack all of them into a single DataFrame.
financial_statements = []
for file_path in file_paths:
    # The context manager closes the archive handle once its members are read.
    with ZipFile(file_path) as zipped_file:
        for member_name in zipped_file.namelist():
            with zipped_file.open(member_name, 'r') as unzipped_file:
                # Keep the member name and the parsed frame in separate
                # variables instead of rebinding the loop variable.
                statement_df = pd.read_csv(unzipped_file, delimiter=';', encoding='latin-1')
            financial_statements.append(statement_df)

# Each CSV is parsed with its own default 0..n-1 index; ignore_index=True gives
# the combined frame unique row labels, so index-aligned operations such as
# boolean-mask filtering are unambiguous.
untreated_df = pd.concat(financial_statements, ignore_index=True)

# Separate GRUPO_DFP column into TIPO_DF and NOME_DF

In [5]:
# Split GRUPO_DFP at its first '-' into TIPO_DF (left part) and NOME_DF (right
# part), trimming surrounding whitespace from both.
# n=1 caps the split at two pieces, so a value containing a further '-' no
# longer produces extra columns that would make the two-column assignment fail.
# NOTE(review): presumably values look like 'DF Consolidado - Demonstração do
# Resultado' -- confirm against the data.
split_columns = untreated_df['GRUPO_DFP'].str.split('-', n=1, expand=True)

# The new columns are written onto untreated_df itself, as before, so that any
# code reading TIPO_DF / NOME_DF from untreated_df keeps working.
untreated_df['TIPO_DF'] = split_columns[0].str.strip()
untreated_df['NOME_DF'] = split_columns[1].str.strip()

# treated_df is a copy of the frame without the now-redundant combined column.
treated_df = untreated_df.drop('GRUPO_DFP', axis=1)

# Filter DataFrame

In [6]:
# Keep only the rows that satisfy every condition below.
# The mask is built from treated_df's own columns, so the mask and the frame
# it filters always share the same index. Building the masks from the
# unfiltered untreated_df and applying them to an already-filtered treated_df
# forces pandas to re-align the mask by index label, which emits a
# "Boolean Series key will be reindexed" warning and can fail or mis-select
# when labels are not unique.
# Combining the conditions into one mask also makes the cell idempotent:
# re-running it on an already-filtered treated_df yields the same rows.
row_filter = (
    (treated_df['ORDEM_EXERC'] == 'ÚLTIMO')
    & (treated_df['TIPO_DF'] == 'DF Consolidado')
    & (treated_df['NOME_DF'] == 'Demonstração do Resultado')
    & (treated_df['DENOM_CIA'] == 'WEG S.A.')
    & (treated_df['DS_CONTA'] == 'Lucro/Prejuízo Consolidado do Período')
)
treated_df = treated_df[row_filter]


# Show DataFrame

In [None]:
# Bare last expression of the cell: the notebook renders treated_df as the cell output.
treated_df