Commit
Showing 7 changed files with 4,499 additions and 133 deletions.
.gitignore

@@ -16,6 +16,9 @@ build/
 develop-eggs/
 dist/
 downloads/
+downloads/*
+bases/
+bases/*
 eggs/
 .eggs/
 lib/
importa_arquivos.py (new file)

@@ -0,0 +1,89 @@

#!/usr/bin/env python
# coding: utf-8
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
import scraperwiki


def main():
    # Report the most recent reference date already captured, if any.
    file_path = os.path.join('bases', 'debentures.csv')
    if os.path.exists(file_path):
        df_base = pd.read_csv(file_path)
        df_base['data_referencia'] = pd.to_datetime(df_base['data_referencia'])
        print('Máxima data de referência', df_base['data_referencia'].max())

    # Read every downloaded file lazily with dask: the CSVs are
    # tab-separated, latin1-encoded and start with a two-line preamble.
    df = dd.read_csv(
        'downloads/*.csv',
        encoding='latin1',
        skiprows=2,
        sep='\t'
    )

    # Drop rows without an asset code.
    df = df.dropna(subset=['Ativo'])

    # The trailing separator on each line yields an empty extra column.
    del df['Unnamed: 8']

    # Convert the dask dataframe to pandas.
    df = df.compute()

    # errors='ignore' leaves unparseable values as they are.
    df['Data do PU'] = pd.to_datetime(
        df['Data do PU'], format='%d/%m/%Y', errors='ignore')

    df.rename(columns={
        'Data do PU': 'data_referencia',
        'Ativo': 'ativo',
        'Valor Nominal': 'vr_nominal',
        'Juros': 'vr_juros',
        'Prêmio': 'vr_premio',
        'Preço Unitário': 'vr_preco_unitario',
        'Critério de Cálculo': 'criterio_calculo',
        'Situação': 'situacao'},
        inplace=True
    )

    # The numeric columns use Brazilian formatting ('1.234,56') with '-'
    # as an empty placeholder: strip dashes, drop thousands separators,
    # swap the decimal comma for a point, trim whitespace, then coerce.
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace('-', ''))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace('-', ''))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace('-', ''))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace('-', ''))

    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace('.', ''))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace('.', ''))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace('.', ''))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace('.', ''))

    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace(',', '.'))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace(',', '.'))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace(',', '.'))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace(',', '.'))

    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.strip())
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.strip())
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.strip())
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.strip())

    df['vr_nominal'] = pd.to_numeric(df['vr_nominal'], errors='coerce')
    df['vr_juros'] = pd.to_numeric(df['vr_juros'], errors='coerce')
    df['vr_premio'] = pd.to_numeric(df['vr_premio'], errors='coerce')
    df['vr_preco_unitario'] = pd.to_numeric(
        df['vr_preco_unitario'], errors='coerce')

    # Append to the consolidated CSV, writing the header only when the
    # file is first created so repeated runs do not duplicate it.
    print('Salvando resultado capturado no arquivo', file_path)
    df.to_csv(file_path, mode='a', index=False,
              header=not os.path.exists(file_path))

    print('Iniciando importação para base de dados')
    for row in df.to_dict('records'):
        try:
            scraperwiki.sqlite.save(
                unique_keys=['data_referencia', 'ativo'], data=row)
        except Exception as e:
            print("Error occurred:", e)
            continue
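The sixteen apply calls above repeat the same Brazilian-number cleanup column by column. They could be collapsed into one vectorized helper; a minimal sketch, where the name to_numeric_br is hypothetical and not part of this commit:

import pandas as pd


def to_numeric_br(series: pd.Series) -> pd.Series:
    # Convert Brazilian-formatted number strings such as '1.234,56' to
    # floats; '-' placeholders and unparseable values become NaN.
    cleaned = (
        series.astype(str)
        .str.strip()
        .str.replace('-', '', regex=False)   # drop placeholder dashes
        .str.replace('.', '', regex=False)   # drop thousands separators
        .str.replace(',', '.', regex=False)  # decimal comma to point
    )
    return pd.to_numeric(cleaned, errors='coerce')


# Usage, replacing the per-column passes above:
# for col in ['vr_nominal', 'vr_juros', 'vr_premio', 'vr_preco_unitario']:
#     df[col] = to_numeric_br(df[col])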
requirements.txt

@@ -1,7 +1,8 @@
-scraperwiki
-lxml
-cssselect
-pandas
-wget
-BeautifulSoup4
-tqdm
+tqdm==4.47.0
+pandas==1.0.5
+requests==2.24.0
+bizdays==0.3.0
+numpy==1.18.5
+dask==2.20.0
+beautifulsoup4==4.9.2
+scraperwiki==0.5.1
runtime.txt

@@ -1 +1 @@
-python-3.6.2
+python-3.7.7
Main scraper script

@@ -1,151 +1,89 @@

Removed: the old implementation, which re-downloaded the full price history and imported it row by row on every run.

# -*- coding: utf-8 -*-
from __future__ import print_function
import requests
from bs4 import BeautifulSoup
import csv
import os
import pandas as pd
import scraperwiki


def prepare_download_folder(folder_name):
    folder_path = os.path.join('downloads', folder_name)
    return prepare_folder(folder_path)


def prepare_folder(folder_path):
    # Note: Path is never imported in this file, so this branch would
    # raise a NameError if the folder did not already exist.
    if not os.path.exists(folder_path):
        Path(folder_path).mkdir(parents=True, exist_ok=True)

    return folder_path


def download_file(url, file_path):
    file_path_csv = file_path.replace('.ZIP', '.CSV')
    if os.path.exists(file_path) or os.path.exists(file_path_csv):
        print('Arquivo já baixado anteriormente', file_path)
        return False

    response = requests.get(url, stream=True)

    if response.status_code != 200:
        print('Arquivo não encontrado', url, response.status_code)
        return False

    with open(file_path, "wb") as handle:
        print('Downloading', url)
        for data in response.iter_content():
            handle.write(data)
    return True


def create_download_folder():
    # Create the target directory
    dirName = os.path.join('downloads')

    try:
        os.mkdir(dirName)
        print("Directory", dirName, "Created ")
    except FileExistsError:
        print("Directory", dirName, "already exists")


def get_urls():
    url_base = 'http://www.debentures.com.br/exploreosnd/consultaadados/emissoesdedebentures/'
    url = url_base + 'puhistorico_f.asp'
    res = requests.get(url)

    while res.status_code != 200:
        res = requests.get(url)

    soup = BeautifulSoup(res.text, "html.parser")
    select = soup.find("select", {"name": "ativo"})

    urls = []
    for option in select.find_all('option'):
        ativo = option['value'].strip()

        if len(ativo) < 6:
            continue

        url_download = url_base + 'puhistorico_e.asp?op_exc=False&dt_ini=&dt_fim=&Submit.x=34&Submit.y=13&ativo=' + ativo + '++++'
        urls.append({'ativo': ativo, 'url': url_download})

    return urls


def download_files_debentures(urls):
    for url in urls:
        try:
            print('Baixando arquivo do ativo', url['ativo'])
            name_file = url['ativo'] + '.csv'
            path_file = os.path.join('downloads', name_file)
            # download file
            download_file(url['url'], path_file)
        except:
            print('Erro', url)


def process_files_debentures():
    download_path = os.path.join('downloads')
    for file_name in os.listdir(download_path):
        path_file = os.path.join(download_path, file_name)
        print('Processando arquivo', path_file)
        process_file(path_file)
        # remove processed file
        os.remove(path_file)


def process_file(file_path):
    df = pd.read_csv(
        file_path,
        skiprows=2,
        encoding='iso-8859-1',
        sep='\t'
    )

    print('Importing {} items'.format(len(df)))

    # drop rows with problems
    df = df[df['Ativo'].notnull()]

    # remove unnamed columns
    df.drop('Unnamed: 8', axis=1, inplace=True)

    for index, row in df.iterrows():
        try:
            data = {
                'data': row['Data do PU'],
                'ativo': row['Ativo'],
                'valor_nominal': row['Valor Nominal'],
                'valor_juros': row['Juros'],
                'valor_premio': row['Prêmio'],
                'preco_unitario': row['Preço Unitário'],
                'criterio_calculo': row['Critério de Cálculo'],
                'situacao': row['Situação']
            }
            scraperwiki.sqlite.save(unique_keys=['data', 'ativo'], data=data)
        except Exception as e:
            print("Error occurred:", e)
            return False
    return True


def main():
    # create download folder
    create_download_folder()

    urls = get_urls()
    download_files_debentures(urls)
    process_files_debentures()

    # rename file
    os.rename('scraperwiki.sqlite', 'data.sqlite')


Added: the new implementation, which resumes from the day after the last captured reference date, throttles its requests, and delegates consolidation to importa_arquivos.

import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup

import importa_arquivos
import utils


def get_links(data_inicial):
    url_base = 'http://www.debentures.com.br'
    url_pu = f'{url_base}/exploreosnd/consultaadados/emissoesdedebentures/'
    url = url_pu + 'puhistorico_f.asp'
    res = requests.get(url)

    # Retry until the form page responds with HTTP 200.
    while res.status_code != 200:
        res = requests.get(url)

    soup = BeautifulSoup(res.text, "html.parser")
    select = soup.find("select", {"name": "ativo"})

    urls = []
    today = datetime.today()
    for option in select.find_all('option'):
        ativo = option['value'].strip()

        # Skip placeholder options; real asset codes are longer.
        if len(ativo) < 6:
            continue

        ativo = ativo.replace(' ', '+')

        url_compl = '/exploreosnd/consultaadados/emissoesdedebentures/'
        url = f'{url_base}{url_compl}'
        url = f'{url}puhistorico_e.asp?'
        url = f'{url}op_exc=False&dt_ini={data_inicial}&Submit.x=34&Submit.y=13'
        url = f"{url}&dt_fim={today.strftime('%d/%m/%Y')}&ativo={ativo}++++"
        urls.append({'ativo': ativo, 'url': url})

    return urls


def main():
    utils.prepare_download_folder('downloads')

    # Resume from the day after the most recent reference date captured.
    file_path = os.path.join('bases', 'debentures.csv')
    data_inicial = ''
    if os.path.exists(file_path):
        df_base = pd.read_csv(file_path)
        df_base['data_referencia'] = pd.to_datetime(df_base['data_referencia'])
        print('Máxima data de referência', df_base['data_referencia'].max())
        data_inicial = df_base['data_referencia'].max()
        data_inicial = data_inicial + timedelta(days=1)
        data_inicial = data_inicial.strftime('%d/%m/%Y')

    urls = get_links(data_inicial)

    tamanho = len(urls)
    for index, url in enumerate(urls):
        name_file = url['ativo'] + '.csv'
        path_file = os.path.join('downloads', name_file)
        print(
            f'{index+1} de {tamanho}',
            ' Baixando arquivo do ativo',
            url['ativo'], name_file
        )

        utils.download(url['url'], None, path_file)
        time.sleep(1)

        # Pause every 50 downloads to avoid server timeouts.
        if index > 0 and index % 50 == 0:
            print('Aguardando 30 segundos, para evitar timeout')
            time.sleep(30)

    print('Consolidando arquivos baixados')
    importa_arquivos.main()


if __name__ == '__main__':
    main()
    time.sleep(60)
    # rename file
    os.rename('scraperwiki.sqlite', 'data.sqlite')
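The new script also relies on a utils module whose diff is not visible in this excerpt. Inferring from the calls utils.prepare_download_folder('downloads') and utils.download(url['url'], None, path_file), together with the helpers removed above, a minimal sketch of what it might contain follows; every detail below is an assumption, not the committed file:

# Hypothetical utils.py, reconstructed from the call sites above.
import os
from pathlib import Path

import requests


def prepare_download_folder(folder_path):
    # Create the folder (and any missing parents) if needed.
    Path(folder_path).mkdir(parents=True, exist_ok=True)
    return folder_path


def download(url, params, file_path):
    # Skip files already fetched on a previous run.
    if os.path.exists(file_path):
        print('File already downloaded:', file_path)
        return False

    response = requests.get(url, params=params, stream=True)
    if response.status_code != 200:
        print('File not found:', url, response.status_code)
        return False

    # Stream the response body to disk in chunks.
    with open(file_path, 'wb') as handle:
        for chunk in response.iter_content(chunk_size=8192):
            handle.write(chunk)
    return True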