# Selenium

Objetivo: explorar a biblioteca Selenium para acessar o site da CVM e extrair o relatório financeiro Formulário de Referência.

In [162]:
from random import randint
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

import os
import pandas as pd
from time import sleep

In [46]:
# Print the version of the installed Selenium Python bindings.
print(webdriver.__version__)

4.10.0


In [173]:
# Start the browser (a Chrome session driven by Selenium).
driver = webdriver.Chrome()

# Let element lookups poll for up to 10 seconds before giving up.
# NOTE: implicitly_wait() does not pause the script; it only sets the
# session-wide timeout used by find_element()/find_elements(), so it is
# configured once, right after the session is created.
driver.implicitly_wait(10)

# Open the CVM document search page.
link = 'https://www.rad.cvm.gov.br/ENET/frmConsultaExternaCVM.aspx'
driver.get(link)

# Stop early if the expected page was not loaded. An explicit raise is used
# instead of `assert` so the check still runs when Python is started with -O.
expected_title = 'Consulta de Documentos de Companhias Abertas'
if driver.title != expected_title:
    raise RuntimeError(
        f'Unexpected page title {driver.title!r}; expected {expected_title!r}'
    )

In [174]:
# Choose the "Formulário de Referência" (FRE) report category.
text_box_documento = driver.find_element(By.ID, 'cboCategorias_chosen_input')

# Type the category name and confirm the selection with Enter.
# NOTE: the former implicitly_wait(5) calls between these steps were removed:
# implicitly_wait() never pauses execution, it only changes the session-wide
# element lookup timeout, so they added no delay here.
text_box_documento.send_keys('FRE - Formulário de Referência')
text_box_documento.send_keys(Keys.ENTER)  # same key code as the literal '\ue007'

In [175]:
# Click the 'No período' radio button to enable the date-range selection.
radio_button_periodo = driver.find_element(By.ID, 'rdPeriodo')
radio_button_periodo.click()

# NOTE: no implicitly_wait() calls between the steps below. That method does
# not pause the script; it only sets the session-wide element lookup timeout.

# Fill in the start date (digits only, day-month-year order: 01/01/2021).
text_box_dt_inicio = driver.find_element(By.ID, 'txtDataIni')
text_box_dt_inicio.send_keys('01012021')

# Fill in the end date (31/01/2021). It gets its own variable so the
# start-date element is no longer overwritten by the end-date element.
text_box_dt_fim = driver.find_element(By.ID, 'txtDataFim')
text_box_dt_fim.send_keys('31012021')

In [50]:
# Selecionar a empresa (opcional)
#text_box_empresa = driver.find_element(By.ID, 'cboEmpresa')
#text_box_empresa.send_keys('Porto Seguro S.A.')
#driver.implicitly_wait(3)
#text_box_empresa.send_keys(Keys.DOWN)
#driver.implicitly_wait(5)
#text_box_empresa.send_keys('\ue007')
#text_box_empresa.send_keys(Keys.RETURN)

# não funcionou.

In [176]:
# Click the button that runs the query.
button_consulta = driver.find_element(By.ID, 'btnConsulta')
button_consulta.click()

In [177]:
# Collect every data cell (<td>) of the result rows currently displayed.
registry_elements = driver.find_elements(By.CSS_SELECTOR, "tr[role = 'row'] > td")

# The cells come back as one flat list. Following the layout noted by the
# author, each table row spans 11 consecutive cells, so a column lives at
# `offset + 11 * row`. Positions are precomputed for the first 100 rows.

# company name: offset 1
list_id_name = list(range(1, 1 + 11 * 100, 11))

# report type: offset 2
list_id_type = list(range(2, 2 + 11 * 100, 11))

# year: offset 5
list_id_year = list(range(5, 5 + 11 * 100, 11))

# publication date: offset 6
list_id_publ = list(range(6, 6 + 11 * 100, 11))

# Preview the first result row: name, report type, year, publication date.
for _column_ids in (list_id_name, list_id_type, list_id_year, list_id_publ):
    print(registry_elements[_column_ids[0]].text)

LOCAWEB SERVIÇOS DE INTERNET S.A.
FRE - Formulário de Referência
2020
30/01/2021 22:12


In [178]:
# Extract the registry data of every result page into `df_main`.

# Layout of the flat <td> list returned for the result table: each table row
# spans N_COLUMNS consecutive cells, and the columns of interest sit at the
# offsets below inside a row. Strided slicing (cells[offset::N_COLUMNS])
# replaces the per-cell membership tests against precomputed index lists, so
# this cell no longer depends on lists built in another cell.
N_COLUMNS = 11
COL_NAME, COL_TYPE, COL_YEAR, COL_PUBL = 1, 2, 5, 6


def _onclick_to_url(onclick):
    """Return the absolute report URL encoded in an `onclick` attribute.

    Keeps the text after the first single quote and drops the last two
    characters (presumably the closing quote and parenthesis of a JavaScript
    call -- TODO confirm against the page), then prefixes the ENET base URL.
    """
    relative_url = onclick[onclick.find("'") + 1: -2]
    return f"https://www.rad.cvm.gov.br/ENET/{relative_url}"


# Estimate the number of result pages: every pagination button except two
# (presumably "previous" and "next" -- TODO confirm) counts as one page.
n_pages = len(driver.find_elements(By.CSS_SELECTOR, '.paginate_button')) - 2

# One DataFrame per page, concatenated once at the end instead of growing
# `df_main` inside the loop.
page_frames = []

for i in range(n_pages):
    print(f'Scrapping in page {i + 1}')

    # Step 1: get name, report type, year and publication date.
    registry_elements = driver.find_elements(By.CSS_SELECTOR, "tr[role = 'row'] > td")
    list_name = [cell.text for cell in registry_elements[COL_NAME::N_COLUMNS]]
    list_type = [cell.text for cell in registry_elements[COL_TYPE::N_COLUMNS]]
    list_year = [cell.text for cell in registry_elements[COL_YEAR::N_COLUMNS]]
    list_publ = [cell.text for cell in registry_elements[COL_PUBL::N_COLUMNS]]

    # Step 2: get the report links available in the current page.
    url_elements = driver.find_elements(By.ID, 'VisualizarDocumento')

    # Step 3: turn each element's `onclick` attribute into a clean URL.
    list_url = [_onclick_to_url(k.get_attribute('onclick')) for k in url_elements]

    # Step 4: organize the page data in a DataFrame. pandas raises ValueError
    # here if the lists have different lengths.
    df = pd.DataFrame({
        'company': list_name,
        'report_type': list_type,
        'year': list_year,
        'publication_date': list_publ,
        'url': list_url
        })
    page_frames.append(df)

    # Last step: move to the next page, unless this was the last one.
    # NOTE(review): nothing waits for the table to refresh after the click;
    # confirm the pagination updates the rows synchronously.
    if i < n_pages - 1:
        driver.find_element(By.CSS_SELECTOR, '#grdDocumentos_next').click()

# pd.concat() raises on an empty list, so keep the empty-frame result the
# loop used to produce when no page was found.
if page_frames:
    df_main = pd.concat(page_frames, axis=0, ignore_index=True)
else:
    df_main = pd.DataFrame()


# Listar todos os downloads possíveis.
#list_docs = driver.find_elements(By.ID, 'VisualizarDocumento')
#print(list_docs[:3])
#print(list_docs[0].get_attribute('onclick'))

Scrapping in page 1
Scrapping in page 2
Scrapping in page 3


In [264]:
# Count how many rows share each download result: a saved file name or the
# 'timeout_error' marker.
# NOTE: the 'file' column is created by the download cell further down
# (`df_main['file'] = None`), so that cell has to be executed before this one.
df_main['file'].value_counts()

timeout_error                      8
99816_024910_02082023200843.pdf    1
99707_023493_02082023200837.pdf    1
99691_024716_02082023200825.pdf    1
99692_020036_02082023200807.pdf    1
                                  ..
99750_020362_02082023200819.pdf    1
99752_024716_02082023200859.pdf    1
99755_021440_02082023200841.pdf    1
99756_020648_02082023200813.pdf    1
99645_019593_02082023200833.pdf    1
Name: file, Length: 93, dtype: int64

In [261]:
# Create a './temp' folder to store the downloaded reports.
# os.path.join() builds the path with the separator of the current OS, and
# exist_ok=True lets this cell be re-run without failing on an existing folder.
folder_path = os.path.join(os.getcwd(), 'temp')
os.makedirs(folder_path, exist_ok=True)

# Make Chrome save downloads into that folder.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": folder_path
})

# Names with these suffixes are ignored while polling for the download.
# NOTE(review): these are the temporary names Chrome is expected to use for
# unfinished downloads -- confirm for the Chrome version in use.
PARTIAL_DOWNLOAD_SUFFIXES = ('.crdownload', '.tmp')

# Number of one-second polls before a download is recorded as timed out.
DOWNLOAD_TIMEOUT_SECONDS = 60

# Download each report with a fresh browser and record the saved file name
# (or 'timeout_error') in the 'file' column. Only the first 100 rows are
# processed.
df_main['file'] = None
for index, row in df_main.head(100).iterrows():

    # Snapshot of the folder, used to spot the file created for this row.
    list_file_before = set(os.listdir(folder_path))

    single_driver = webdriver.Chrome(options=chrome_options)
    try:
        single_driver.get(row['url'])

        # Click the 'Salvar em PDF' button once it is clickable.
        wait = WebDriverWait(single_driver, timeout=5)
        wait.until(expected_conditions.element_to_be_clickable((By.ID, "btnGeraRelatorioPDF"))).click()

        # Before generating the PDF, enter the iframe (an external HTML
        # document embedded in the main page) that holds the next button.
        iframe = single_driver.find_element(By.CSS_SELECTOR, '#iFrameModal')
        single_driver.switch_to.frame(iframe)

        # Fixed pause kept from the original flow before clicking 'Gerar PDF'.
        sleep(5)
        wait.until(expected_conditions.element_to_be_clickable((By.TAG_NAME, 'button'))).click()

        # Give the download some time to start, then poll the folder once per
        # second until a finished new file shows up.
        sleep(5)
        for _ in range(DOWNLOAD_TIMEOUT_SECONDS):
            sleep(1)
            new_files = sorted(
                name
                for name in set(os.listdir(folder_path)) - list_file_before
                if not name.lower().endswith(PARTIAL_DOWNLOAD_SUFFIXES)
            )
            if new_files:
                df_main.at[index, 'file'] = new_files[0]
                break
        else:
            # Reached only when no poll found a file, so a download that
            # completes on the very last poll is no longer overwritten.
            df_main.at[index, 'file'] = 'timeout_error'
    finally:
        # Always close the browser, even when a step above raises.
        single_driver.quit()