# Notebook Report Extraction

**Objective**: extract the financial report 'Formulário de Referência' from the public repository managed by the Comissão de Valores Mobiliários (CVM), the Brazilian equivalent of the U.S. Securities and Exchange Commission (SEC).

In [1]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

import os
import pandas as pd
from time import sleep

In [2]:
# Show the version string exposed by the imported `selenium.webdriver` package.
# NOTE(review): this looks like the Selenium bindings version rather than the
# ChromeDriver binary version, so the driver/Chrome match still has to be
# checked separately — confirm.
print(webdriver.__version__)

4.10.0


In [3]:
# Initializing the browser: opens the Chrome session that the following cells drive.
driver = webdriver.Chrome()

# CVM url (public document search page).
link = 'https://www.rad.cvm.gov.br/ENET/frmConsultaExternaCVM.aspx'
driver.get(link)
# Sanity check: stop here if the page title is not the one this notebook expects.
assert driver.title == 'Consulta de Documentos de Companhias Abertas'

In [4]:
# Filter by report type.
# `implicitly_wait` does not pause the script: it configures how long element
# lookups may keep polling for a match. It therefore has to be set BEFORE the
# lookup it is meant to protect (it stays in effect for the whole session).
driver.implicitly_wait(5)
text_box_documento = driver.find_element(By.ID, 'cboCategorias_chosen_input')

# Type the category name, then confirm the selection with ENTER
# (Keys.ENTER is the named constant for the raw '\ue007' key code).
text_box_documento.send_keys('FRE - Formulário de Referência')
text_box_documento.send_keys(Keys.ENTER)

In [5]:
# Enable the date selection by pressing the radio button 'No período'.
# The lookup timeout is configured first: `implicitly_wait` only affects element
# lookups that happen after it is set, it does not pause the script.
driver.implicitly_wait(5)
radio_button_periodo = driver.find_element(By.ID, 'rdPeriodo')
radio_button_periodo.click()

# Input the starting date in string (DDMMYYYY).
text_box_dt_inicio = driver.find_element(By.ID, 'txtDataIni')
text_box_dt_inicio.send_keys('01012021')

# Input the ending date in string (DDMMYYYY).
# Stored under its own name so the start-date element is not overwritten.
text_box_dt_fim = driver.find_element(By.ID, 'txtDataFim')
text_box_dt_fim.send_keys('31012021')

In [6]:
# After all filters, execute the query.
# Configure the lookup timeout before searching for the button; calling
# `implicitly_wait` after the lookup has no effect on that lookup.
driver.implicitly_wait(5)
button_consulta = driver.find_element(By.ID, 'btnConsulta')
button_consulta.click()

In [7]:
# The query results in a table with registry data and the report. For now we will focus on identifying the registry data's position.
# The lookup timeout is set before the search so the results table gets time to
# render; `implicitly_wait` only affects lookups issued after it is called.
driver.implicitly_wait(5)
registry_elements = driver.find_elements(By.CSS_SELECTOR, "tr[role = 'row'] > td")

# The cells come back as one flat list. Per the schema below, the fields of
# result row x repeat with a stride of 11 cells; the index lists cover the
# first 100 rows.
# name: 1 + 11 * x
list_id_name = [1 + 11 * x for x in range(100)]

# report type: 2 + 11 * x
list_id_type = [2 + 11 * x for x in range(100)]

# year: 5 + 11 * x
list_id_year = [5 + 11 * x for x in range(100)]

# publication date: 6 + 11 * x
list_id_publ = [6 + 11 * x for x in range(100)]

# Show the first result row as a visual check of the positions above.
print(registry_elements[list_id_name[0]].text)
print(registry_elements[list_id_type[0]].text)
print(registry_elements[list_id_year[0]].text)
print(registry_elements[list_id_publ[0]].text)

PRESTIGE CONSUMER HEALTHCARE INC
Comunicado ao Mercado
03/08/2023
03/08/2023 22:31


In [8]:
#!!! For some reason the filters are not being applied: the report type, date and company shown above have nothing to do with the query.

In [10]:
# Extract registry data.
# df_main starts empty so it is always defined, even if no page is found.
df_main = pd.DataFrame()

# One DataFrame per page is collected here and concatenated once after the
# loop; concatenating onto df_main inside the loop re-copies every previous row
# on each page.
page_frames = []

# Estimate the amount of pages found.
# NOTE(review): the "- 2" presumably discounts the 'previous'/'next' buttons — confirm.
n_pages = len(driver.find_elements(By.CSS_SELECTOR, '.paginate_button')) - 2

for i in range(n_pages):
    print(f'Scrapping in page {i + 1}')

    # Step 1: get name, report type, year and publication date.
    registry_elements = driver.find_elements(By.CSS_SELECTOR, "tr[role = 'row'] > td")
    n_cells = len(registry_elements)

    # Pick each field directly through its position list (defined in the cell
    # that located the registry data) instead of testing every cell against
    # every list. Positions beyond the cells of the current page are skipped.
    list_name = [registry_elements[j].text for j in list_id_name if j < n_cells]
    list_type = [registry_elements[j].text for j in list_id_type if j < n_cells]
    list_year = [registry_elements[j].text for j in list_id_year if j < n_cells]
    list_publ = [registry_elements[j].text for j in list_id_publ if j < n_cells]

    # Step 2: get report's url available in current page.
    url_elements = driver.find_elements(By.ID, 'VisualizarDocumento')

    # Step 3: clean the url string.
    # The relative url is the text after the first single quote of the
    # 'onclick' attribute, minus its last two characters.
    list_url = []
    for k in url_elements:
        url = k.get_attribute('onclick')
        url = url[url.find("'") + 1: -2]
        clean_url = f"https://www.rad.cvm.gov.br/ENET/{url}"
        list_url.append(clean_url)

    # Step 4: organize all data in dataframe.
    # pandas raises ValueError here if the lists have different lengths, which
    # flags a page whose layout does not match the expected schema.
    page_frames.append(pd.DataFrame({
        'company': list_name,
        'report_type': list_type,
        'year': list_year,
        'publication_date': list_publ,
        'url': list_url
        }))

    # Last step: click on next page
    driver.find_element(By.CSS_SELECTOR, '#grdDocumentos_next').click()

# pd.concat rejects an empty list, so only concatenate when pages were scraped.
if page_frames:
    df_main = pd.concat(page_frames, axis=0, ignore_index=True)

Scrapping in page 1
Scrapping in page 2
Scrapping in page 3


In [None]:
# Establish the folder './temp' as the new destination for downloads.
# os.path.join keeps the path portable across operating systems, and
# makedirs(exist_ok=True) makes the cell safe to re-run when the folder exists.
folder_path = os.path.join(os.getcwd(), 'temp')
os.makedirs(folder_path, exist_ok=True)
print('Using the current folder /temp as destination for the downloaded files.')

chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": folder_path
})


def wait_for_download(folder, known_files, timeout_s=60):
    """Poll `folder` once per second until a finished new file shows up.

    Parameters
    ----------
    folder : str
        Directory that receives the download.
    known_files : iterable of str
        File names that were already in `folder` before the download started.
    timeout_s : int
        Maximum number of one-second polls before giving up.

    Returns
    -------
    str or None
        Name of the new file, or None if nothing finished within `timeout_s`.
    """
    known = set(known_files)
    for _ in range(timeout_s):
        sleep(1)
        # Chrome typically keeps in-progress downloads under temporary names
        # ('*.crdownload', '*.tmp'); those are not the finished report.
        finished = [
            name for name in os.listdir(folder)
            if name not in known and not name.endswith(('.crdownload', '.tmp'))
        ]
        if finished:
            return finished[0]
    return None


# New column 'file' containing (1) filename or (2) a message with error.
df_main['file'] = None
# NOTE(review): only the first row is processed (`head(1)`), presumably as a trial run.
for index, row in df_main.head(1).iterrows():

    # Folder before the download.
    list_file_before = os.listdir(folder_path)

    single_driver = webdriver.Chrome(options=chrome_options)
    try:
        single_driver.get(row['url'])

        # Press the button 'Salvar em PDF'.
        wait = WebDriverWait(single_driver, timeout=5)
        wait.until(expected_conditions.element_to_be_clickable((By.ID, "btnGeraRelatorioPDF"))).click()

        # The next button lives inside the modal's iframe, so the driver has to
        # switch into that frame before it can find it.
        iframe = single_driver.find_element(By.CSS_SELECTOR, '#iFrameModal')
        single_driver.switch_to.frame(iframe)

        # Click the 'Gerar PDF' button.
        # NOTE(review): a fixed pause is kept because the original author's
        # explicit wait on this button was left disabled (reason not recorded).
        sleep(5)
        # find_element (singular) returns the first matching element; the
        # plural form returns a list, which has no click().
        single_driver.find_element(By.TAG_NAME, 'button').click()

        # Give the download a head start, then poll the folder for the new file.
        sleep(5)
        file = wait_for_download(folder_path, list_file_before, timeout_s=60)
        df_main.at[index, 'file'] = file if file is not None else 'timeout_error'
    finally:
        # Always close the per-report browser, even when a step above fails.
        single_driver.quit()