
Commit

adjusted execution time
royopa committed Sep 29, 2020
1 parent 67b6dcc commit cec94b9
Showing 7 changed files with 4,499 additions and 133 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -16,6 +16,9 @@ build/
 develop-eggs/
 dist/
 downloads/
+downloads/*
+bases/
+bases/*
 eggs/
 .eggs/
 lib/
89 changes: 89 additions & 0 deletions importa_arquivos.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python
# coding: utf-8
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
import scraperwiki


def main():
    # report the most recent reference date already stored, if any
    file_path = os.path.join('bases', 'debentures.csv')
    if os.path.exists(file_path):
        df_base = pd.read_csv(file_path)
        df_base['data_referencia'] = pd.to_datetime(df_base['data_referencia'])
        print('Máxima data de referência', df_base['data_referencia'].max())

    # read all downloaded CSVs at once with dask
    df = dd.read_csv(
        'downloads/*.csv',
        encoding='latin1',
        skiprows=2,
        sep='\t'
    )

    # drop rows without an asset code and the trailing empty column
    df = df.dropna(subset=['Ativo'])

    del df['Unnamed: 8']

    # convert the dask dataframe to pandas
    df = df.compute()

    df['Data do PU'] = pd.to_datetime(
        df['Data do PU'], format='%d/%m/%Y', errors='ignore')

    df.rename(columns={
        'Data do PU': 'data_referencia',
        'Ativo': 'ativo',
        'Valor Nominal': 'vr_nominal',
        'Juros': 'vr_juros',
        'Prêmio': 'vr_premio',
        'Preço Unitário': 'vr_preco_unitario',
        'Critério de Cálculo': 'criterio_calculo',
        'Situação': 'situacao'},
        inplace=True
    )

    # clean the value columns: drop placeholder dashes...
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace('-', ''))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace('-', ''))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace('-', ''))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace('-', ''))

    # ...drop thousands separators...
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace('.', ''))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace('.', ''))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace('.', ''))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace('.', ''))

    # ...turn the decimal comma into a decimal point...
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace(',', '.'))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace(',', '.'))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace(',', '.'))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace(',', '.'))

    # ...strip whitespace and cast to numeric (invalid values become NaN)
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.strip())
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.strip())
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.strip())
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.strip())

    df['vr_nominal'] = pd.to_numeric(df['vr_nominal'], errors='coerce')
    df['vr_juros'] = pd.to_numeric(df['vr_juros'], errors='coerce')
    df['vr_premio'] = pd.to_numeric(df['vr_premio'], errors='coerce')
    df['vr_preco_unitario'] = pd.to_numeric(
        df['vr_preco_unitario'], errors='coerce')

    # append the consolidated result to the output file
    print('Salvando resultado capturado no arquivo', file_path)
    df.to_csv(file_path, mode='a', index=False)

    print('Iniciando importação para base de dados')
    for row in df.to_dict('records'):
        try:
            scraperwiki.sqlite.save(
                unique_keys=['data_referencia', 'ativo'], data=row)
        except Exception as e:
            print("Error occurred:", e)
            continue
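
Note: the value columns above are cleaned with four element-wise .apply(lambda ...) passes per column. A vectorized equivalent using pandas string methods could look like the sketch below; it is not part of this commit, and the helper name clean_numeric_columns is made up for illustration.

import pandas as pd


def clean_numeric_columns(df, columns):
    # hypothetical helper, not in the commit: converts Brazilian-formatted
    # number strings such as '1.234,56' into floats
    for col in columns:
        df[col] = pd.to_numeric(
            df[col]
            .astype(str)
            .str.replace('-', '', regex=False)   # drop placeholder dashes
            .str.replace('.', '', regex=False)   # drop thousands separators
            .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
            .str.strip(),
            errors='coerce'
        )
    return df


# possible usage with the columns produced by main():
# df = clean_numeric_columns(
#     df, ['vr_nominal', 'vr_juros', 'vr_premio', 'vr_preco_unitario'])

The replacement order mirrors the original code: dashes and thousands separators are removed first, the decimal comma becomes a point, and pd.to_numeric coerces anything left over to NaN.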
15 changes: 8 additions & 7 deletions requirements.txt
@@ -1,7 +1,8 @@
-scraperwiki
-lxml
-cssselect
-pandas
-wget
-BeautifulSoup4
-tqdm
+tqdm==4.47.0
+pandas==1.0.5
+requests==2.24.0
+bizdays==0.3.0
+numpy==1.18.5
+dask==2.20.0
+beautifulsoup4==4.9.2
+scraperwiki==0.5.1
2 changes: 1 addition & 1 deletion runtime.txt
@@ -1 +1 @@
-python-3.6.2
+python-3.7.7
188 changes: 63 additions & 125 deletions scraper.py
@@ -1,151 +1,89 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
-import requests
-from bs4 import BeautifulSoup
-import csv
 import os
-import pandas as pd
-import scraperwiki

-def prepare_download_folder(folder_name):
-    folder_path = os.path.join('downloads', folder_name)
-    return prepare_folder(folder_path)


-def prepare_folder(folder_path):
-    if not os.path.exists(folder_path):
-        Path(folder_path).mkdir(parents=True, exist_ok=True)

-    return folder_path


-def download_file(url, file_path):
-    file_path_csv = file_path.replace('.ZIP', '.CSV')
-    if os.path.exists(file_path) or os.path.exists(file_path_csv):
-        print('Arquivo já baixado anteriormente', file_path)
-        return False

-    response = requests.get(url, stream=True)

-    if response.status_code != 200:
-        print('Arquivo não encontrado', url, response.status_code)
-        return False

-    with open(file_path, "wb") as handle:
-        print('Downloading', url)
-        for data in response.iter_content():
-            handle.write(data)
-        handle.close()
-    return True

+import time
+from datetime import datetime, timedelta

+import pandas as pd
+import requests
+from bs4 import BeautifulSoup

-def create_download_folder():
-    # Create directory
-    dirName = os.path.join('downloads')

-    try:
-        # Create target Directory
-        os.mkdir(dirName)
-        print("Directory", dirName, "Created ")
-    except FileExistsError:
-        print("Directory", dirName, "already exists")
+import importa_arquivos
+import utils


-def get_urls():
-    url_base = 'http://www.debentures.com.br/exploreosnd/consultaadados/emissoesdedebentures/'
-    url = url_base+'puhistorico_f.asp'
+def get_links(data_inicial):
+    url_base = 'http://www.debentures.com.br'
+    url_pu = f'{url_base}/exploreosnd/consultaadados/emissoesdedebentures/'
+    url = url_pu+'puhistorico_f.asp'
     res = requests.get(url)

     while res.status_code != 200:
-        res = requests.get(url)
+        res = requests.get(url)

-    soup = BeautifulSoup(res.text,"html.parser")
-    select = soup.find("select", {"name":"ativo"})
+    soup = BeautifulSoup(res.text, "html.parser")
+    select = soup.find("select", {"name": "ativo"})

     urls = []
     for option in select.find_all('option'):
         ativo = option['value'].strip()

-        if len(ativo) < 6:
-            continue

-        url_download = url_base + 'puhistorico_e.asp?op_exc=False&dt_ini=&dt_fim=&Submit.x=34&Submit.y=13&ativo='+ativo+'++++'
-        urls.append({'ativo': ativo,'url':url_download})

-    return urls


-def download_files_debentures(urls):
-    for url in urls:
-        try:
-            print('Baixando arquivo do ativo', url['ativo'])
-            name_file = url['ativo']+'.csv'
-            path_file = os.path.join('downloads', name_file)
-            # download file
-            download_file(url['url'], path_file)
-        except:
-            print('Erro', url)
+        if len(ativo) < 6:
+            continue

+        ativo = ativo.replace(' ', '+')


-def process_files_debentures():
-    download_path = os.path.join('downloads')
-    for file_name in os.listdir(download_path):
-        path_file = os.path.join(download_path, file_name)
-        print('Processando arquivo', path_file)
-        process_file(path_file)
-        # remove processed file
-        os.remove(path_file)


-def process_file(file_path):
-    df = pd.read_csv(
-        file_path,
-        skiprows=2,
-        encoding='iso-8859-1',
-        sep='\t'
-    )

-    print('Importing {} items'.format(len(df)))

-    # remove rows with problems
-    df = df[df['Ativo'].notnull()]

-    # remove unnamed columns
-    df.drop('Unnamed: 8', axis=1, inplace=True)

-    # print(df.tail())

-    for index, row in df.iterrows():
-        try:
-            data = {
-                'data': row['Data do PU'],
-                'ativo': row['Ativo'],
-                'valor_nominal': row['Valor Nominal'],
-                'valor_juros': row['Juros'],
-                'valor_premio': row['Prêmio'],
-                'preco_unitario': row['Preço Unitário'],
-                'criterio_calculo': row['Critério de Cálculo'],
-                'situacao': row['Situação']
-            }
-            scraperwiki.sqlite.save(unique_keys=['data', 'ativo'], data=data)
-        except Exception as e:
-            print("Error occurred:", e)
-            return False
-    return True
+        today = datetime.today()

+        url_compl = '/exploreosnd/consultaadados/emissoesdedebentures/'
+        url = f'{url_base}{url_compl}'
+        url = f'{url}puhistorico_e.asp?'
+        url = f'{url}op_exc=False&dt_ini={data_inicial}&Submit.x=34&Submit.y=13'
+        url = f"{url}&dt_fim={today.strftime('%d/%m/%Y')}&ativo={ativo}++++"
+        urls.append({'ativo': ativo, 'url': url})

-def main():
-    # create download folder
-    create_download_folder()
+    return urls

-    urls = get_urls()
-    download_files_debentures(urls)
-    process_files_debentures()

-    # rename file
-    os.rename('scraperwiki.sqlite', 'data.sqlite')
+def main():
+    utils.prepare_download_folder('downloads')

+    # get the most recent reference date already collected
+    file_path = os.path.join('bases', 'debentures.csv')
+    data_inicial = ''
+    if os.path.exists(file_path):
+        df_base = pd.read_csv(file_path)
+        df_base['data_referencia'] = pd.to_datetime(df_base['data_referencia'])
+        print('Máxima data de referência', df_base['data_referencia'].max())
+        data_inicial = df_base['data_referencia'].max()
+        data_inicial = data_inicial + timedelta(days=1)
+        data_inicial = data_inicial.strftime('%d/%m/%Y')

+    urls = get_links(data_inicial)

+    tamanho = len(urls)
+    for index, url in enumerate(urls):
+        name_file = url['ativo']+'.csv'
+        path_file = os.path.join('downloads', name_file)
+        print(
+            f'{index+1} de {tamanho}',
+            ' Baixando arquivo do ativo',
+            url['ativo'], name_file
+        )

+        utils.download(url['url'], None, path_file)
+        time.sleep(1)

+        if index > 0 and index % 50 == 0:
+            print('Aguardando 30 segundos, para evitar timeout')
+            time.sleep(30)

+    print('Consolidando arquivos baixados')
+    importa_arquivos.main()


 if __name__ == '__main__':
     main()
+    time.sleep(60)
+    # rename file
+    os.rename('scraperwiki.sqlite', 'data.sqlite')
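
The new scraper.py delegates folder creation and downloading to a utils module that is not shown in this view (presumably one of the remaining changed files). Below is a minimal sketch of what utils.prepare_download_folder and utils.download could look like, inferred only from how they are called above; the actual module in the commit may differ.

# utils.py -- hypothetical sketch; the real module in this commit may differ
import os
from pathlib import Path

import requests


def prepare_download_folder(folder_path):
    # create the folder (and any parents) if it does not exist yet
    Path(folder_path).mkdir(parents=True, exist_ok=True)
    return folder_path


def download(url, session, file_path):
    # scraper.py passes None as the second argument; treating it as an
    # optional requests session is an assumption made for this sketch
    http = session or requests
    if os.path.exists(file_path):
        return False
    response = http.get(url, stream=True)
    if response.status_code != 200:
        print('Download failed', url, response.status_code)
        return False
    with open(file_path, 'wb') as handle:
        for chunk in response.iter_content(chunk_size=8192):
            handle.write(chunk)
    return True

The sleep calls in main() (one second per file, thirty seconds every fifty files) do the rate limiting, so the download helper itself only needs to fetch and write the file.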
