In [13]:
import pandas as pd
import time
import os
import boto3

In [14]:
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [15]:
# Chrome options, left at their defaults (nothing is customised here).
options = webdriver.ChromeOptions()

# Driver service constructed with no arguments.
service = Service()

# Start the Chrome session that the following cells drive.
driver = webdriver.Chrome(options=options, service=service)

In [16]:
# Address of the B3 "indexPage/day" view for IBOV, requested with language=pt-br.
url = 'https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBOV?language=pt-br'

# Point the Selenium-controlled browser at that address.
driver.get(url)


In [17]:
# Wait (up to 20 s) for the link whose text contains 'Download' to become
# clickable instead of looking it up immediately: when this cell runs right
# after driver.get() (e.g. Restart & Run All), the link may not be in the page
# yet and a bare find_element would raise NoSuchElementException.
# On timeout WebDriverWait raises selenium's TimeoutException.
download_link = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Download')]"))
)

# Clicking the link triggers the file download in the browser.
download_link.click()

In [18]:
# Pause for 2 seconds before the following cells look for the downloaded file.
# NOTE(review): presumably this is meant to let the browser finish the download;
# a fixed delay may be too short on a slow connection — confirm.
time.sleep(2)

In [19]:
# Folder in which the downloaded file is looked up by the following cells.
download_folder = 'C:/Users/rerys/Downloads'

# Today's date rendered as DD-MM-YY (two-digit year); used to build the
# expected file name.
current_date = f"{datetime.now():%d-%m-%y}"

In [20]:
# Names of the entries currently present in the download folder.
# NOTE(review): this list is not read anywhere in the visible part of the
# notebook — confirm whether it is still needed.
files_in_download_folder = os.listdir(download_folder)

In [21]:
# Expected name of the downloaded file: "IBOVDia_" + current_date + ".csv".
# NOTE(review): assumes the site names the export after today's date in the
# same format as current_date — confirm.
download_filename = f'IBOVDia_{current_date}.csv'
# Full path at which that file is expected inside the download folder.
download_path = os.path.join(download_folder, download_filename)

In [22]:
# Abort here when the expected download is missing. Every later cell depends on
# names assigned below (parquet_output_path, year, month, day, ...), so merely
# printing and carrying on would fail further down with a NameError -- or, on a
# kernel that still holds values from a previous run, upload that older file.
if not os.path.exists(download_path):
    raise FileNotFoundError(
        f"Erro: o arquivo {download_filename} não foi encontrado na pasta de downloads."
    )

# Read the CSV: latin1-encoded, ';'-separated, no header row, first two lines
# skipped, malformed lines dropped.
df = pd.read_csv(download_path, encoding='latin1', sep=';', header=None, skiprows=2, on_bad_lines='skip')
# Drop columns that are entirely empty.
df = df.dropna(axis=1, how='all')

# Assign the column names (pandas raises if the frame does not have exactly
# five columns at this point).
df.columns = ['cod', 'acao', 'tipo', 'qtd', 'part.']

# Add a date column in YYYY-MM-DD format. The clock is read once and the
# year/month/day parts are derived from that same string, so the partition
# directory and the file name can never disagree (e.g. across midnight).
current_date_american = datetime.now().strftime('%Y-%m-%d')
df['data_pregao'] = current_date_american

year, month, day = current_date_american.split('-')

# Local output directory, partitioned as <year>/<month>/<day>.
parquet_dir = f'D:/Dev/Scraping_b3/Parquet/raw/{year}/{month}/{day}'
os.makedirs(parquet_dir, exist_ok=True)

# Path of the Parquet file, named after the run date.
parquet_output_path = os.path.join(parquet_dir, f'ibovespa_{current_date_american}.parquet')

# Save the DataFrame in Parquet format (without the index).
df.to_parquet(parquet_output_path, index=False)

# Read the saved Parquet file back to verify it, and print it in full.
df_loaded = pd.read_parquet(parquet_output_path)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print(df_loaded)

                      Codigo          Acao        Tipo        Qtde. Teorica  \
0                      RRRP3  3R PETROLEUM  ON      NM          238.441.689   
1                      ALOS3         ALLOS  ON      NM          532.616.595   
2                      ALPA4    ALPARGATAS  PN      N1          166.362.038   
3                      ABEV3     AMBEV S/A     ON  ATZ        4.394.245.879   
4                      ARZZ3     AREZZO CO  ON  ED  NM           62.305.891   
5                      ASAI3         ASSAI  ON      NM        1.349.217.892   
6                      AZUL4          AZUL  PN      N2          332.825.777   
7                      B3SA3            B3  ON      NM        5.602.790.110   
8                      BBSE3  BBSEGURIDADE  ON      NM          671.750.768   
9                      BBDC3      BRADESCO  ON      N1        1.489.259.656   
10                     BBDC4      BRADESCO  PN      N1        5.135.772.281   
11                     BRAP4     BRADESPAR  PN      

In [23]:
# Destination bucket for the upload.
bucket_name = 'fiap2024-mlet-reryson'

# S3 client bound to the us-east-1 region.
s3_client = boto3.client(service_name='s3', region_name='us-east-1')

In [24]:
# Object key for the upload: Parquet/raw/<year>/<month>/<day>/ibovespa_<date>.parquet,
# built from the year/month/day and current_date_american values set in the
# processing cell.
s3_parquet_path = f'Parquet/raw/{year}/{month}/{day}/ibovespa_{current_date_american}.parquet'

In [25]:
# Upload the local Parquet file to the bucket under the computed key, then
# report the key that was written.
s3_client.upload_file(
    Filename=parquet_output_path,
    Bucket=bucket_name,
    Key=s3_parquet_path,
)
print(f"Arquivo Parquet enviado para o S3: {s3_parquet_path}")

Arquivo Parquet enviado para o S3: Parquet/raw/2024/07/15/ibovespa_2024-07-15.parquet


In [26]:
# Shut down the WebDriver session started earlier in the notebook.
# NOTE(review): this is the last cell, so it only runs if every cell above
# succeeded; an earlier failure leaves the browser open — confirm whether the
# driver should be closed right after the download instead.
driver.quit()