Commit

adjusted import
royopa committed Sep 29, 2020
1 parent 510f24e commit 85e1503
Showing 9 changed files with 4,454 additions and 95 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -1,2 +1,10 @@
# Ignore output of scraper
data.sqlite
scraperwiki.sqlite
scraperwiki.sqlite-journal
data.sqlite-journal
downloads/*
dados.csv
downloads/
bases/
bases/*
6 changes: 6 additions & 0 deletions .ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
6 changes: 6 additions & 0 deletions Untitled.ipynb
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
Binary file added __pycache__/utils.cpython-37.pyc
Binary file not shown.
85 changes: 85 additions & 0 deletions importa_arquivo.py
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, with_statement

import os
import time
from datetime import datetime

import pandas as pd
import scraperwiki
from chardet.universaldetector import UniversalDetector

import utils


def main():
# morph.io requires this db filename, but scraperwiki doesn't nicely
# expose a way to alter this. So we'll fiddle our environment ourselves
# before our pipeline modules load.
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'

file_path = os.path.join("dados.csv")
df = pd.read_csv(
file_path,
skiprows=2,
encoding='iso-8859-1',
sep='\t'
)

df = df.rename(columns={
u'Data': "dt_referencia",
u'Emissor': "emissor",
u'C\xf3digo do Ativo': "co_ativo",
u'ISIN': "isin",
u'Quantidade': "quantidade",
u'N\xfamero de Neg\xf3cios': "nu_negocios",
u'PU M\xednimo': "pu_minimo",
u'PU M\xe9dio': "pu_medio",
u'PU M\xe1ximo': "pu_maximo",
u'% PU da Curva': "pu_curva"
})

    # convert the reference date to datetime
    df['dt_referencia'] = pd.to_datetime(
        df['dt_referencia'], format='%d/%m/%Y').dt.date

    # format the field as float
    df['pu_medio'] = df['pu_medio'].str.replace('.', '')
    df['pu_medio'] = df['pu_medio'].str.replace(',', '.')
    df['pu_medio'] = pd.to_numeric(df['pu_medio'], errors='coerce')

    # format the field as float
    df['pu_minimo'] = df['pu_minimo'].str.replace('.', '')
    df['pu_minimo'] = df['pu_minimo'].str.replace(',', '.')
    df['pu_minimo'] = pd.to_numeric(df['pu_minimo'], errors='coerce')

    # format the field as float
    df['pu_maximo'] = df['pu_maximo'].str.replace('.', '')
    df['pu_maximo'] = df['pu_maximo'].str.replace(',', '.')
    df['pu_maximo'] = pd.to_numeric(df['pu_maximo'], errors='coerce')

    # format the field as float
    df['pu_curva'] = df['pu_curva'].str.replace('.', '')
    df['pu_curva'] = df['pu_curva'].str.replace(',', '.')
    df['pu_curva'] = pd.to_numeric(df['pu_curva'], errors='coerce')

    # save the output file
file_path = os.path.join('bases', 'debentures_negociacao.csv')
print('Salvando resultado capturado no arquivo', file_path)
df.to_csv(file_path, mode='a', index=False)

print('Importing {} items'.format(len(df)))

for index, row in enumerate(df.to_dict('records')):
print(f'{index+1} de {len(df)}')
scraperwiki.sqlite.save(
unique_keys=['dt_referencia', 'co_ativo'], data=row)

print('{} Registros importados com sucesso'.format(len(df)))


if __name__ == '__main__':
main()
time.sleep(60)
# rename file
os.rename('scraperwiki.sqlite', 'data.sqlite')
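
Note on the parsing in importa_arquivo.py above: the repeated str.replace / pd.to_numeric steps convert Brazilian-formatted values ('.' as thousands separator, ',' as decimal separator) by hand. A minimal alternative sketch, not part of this commit, lets pandas do the conversion at read time; decimal and thousands are standard pandas.read_csv options, while how each individual column is inferred is an assumption:

import pandas as pd

# Sketch only (not in this commit): parse Brazilian-formatted numbers while reading.
df = pd.read_csv(
    'dados.csv',
    skiprows=2,
    encoding='iso-8859-1',
    sep='\t',
    decimal=',',    # ',' is the decimal separator in the source file
    thousands='.',  # '.' is the thousands separator in the source file
)
# Columns that pandas infers as numeric then arrive as floats directly,
# so the str.replace round-trips above would not be needed for them.
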
2 changes: 1 addition & 1 deletion runtime.txt
@@ -1 +1 @@
python-3.6.2
python-3.6.7
107 changes: 13 additions & 94 deletions scraper.py
@@ -1,106 +1,27 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
from __future__ import print_function
import requests
from __future__ import print_function, with_statement

import os
import pandas as pd
import scraperwiki
from tqdm import tqdm
from datetime import datetime
import sys
import codecs
from chardet.universaldetector import UniversalDetector
import os


def download_file(url, file_path):
response = requests.get(url, stream=True)

if response.status_code != 200:
print('Arquivo não encontrado', url, response.status_code)
return False

with open(file_path, "wb") as handle:
print('Downloading', url)
for data in response.iter_content():
handle.write(data)
handle.close()
return True


def create_download_folder():
# Create directory
dirName = os.path.join('downloads')

try:
# Create target Directory
os.mkdir(dirName)
print("Directory", dirName, "Created ")
except Exception:
print("Directory", dirName, "already exists")
import importa_arquivo
import utils


def process_file(url):
file_path = os.path.join("dados.csv")
if download_file(url, file_path) is False:

if utils.download(url, None, file_path) is False:
print("Erro ao baixar arquivo")
return False

# morph.io requires this db filename, but scraperwiki doesn't nicely
# expose a way to alter this. So we'll fiddle our environment ourselves
# before our pipeline modules load.
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'

df = pd.read_csv(
file_path,
skiprows=2,
encoding='iso-8859-1',
sep='\t'
)

df = df.rename(columns={
u'Data':"data",
u'Emissor':"emissor",
u'C\xf3digo do Ativo':"co_ativo",
u'ISIN':"isin",
u'Quantidade':"quantidade",
u'N\xfamero de Neg\xf3cios':"nu_negocios",
u'PU M\xednimo':"pu_minimo",
u'PU M\xe9dio':"pu_medio",
u'PU M\xe1ximo':"pu_maximo",
u'% PU da Curva':"pu_curva"
})

print('Importing {} items'.format(len(df)))
print('Iniciando importação')

    # convert to datetime
    df['data'] = pd.to_datetime(df['data'], format='%d/%m/%Y').dt.date
    # format the field as float
    df['pu_medio'] = df['pu_medio'].str.replace('.', '')
    df['pu_medio'] = df['pu_medio'].str.replace(',', '.')
    df['pu_medio'] = pd.to_numeric(df['pu_medio'], errors='coerce')
    # format the field as float
    df['pu_minimo'] = df['pu_minimo'].str.replace('.', '')
    df['pu_minimo'] = df['pu_minimo'].str.replace(',', '.')
    df['pu_minimo'] = pd.to_numeric(df['pu_minimo'], errors='coerce')
    # format the field as float
    df['pu_maximo'] = df['pu_maximo'].str.replace('.', '')
    df['pu_maximo'] = df['pu_maximo'].str.replace(',', '.')
    df['pu_maximo'] = pd.to_numeric(df['pu_maximo'], errors='coerce')
    # format the field as float
    df['pu_curva'] = df['pu_curva'].str.replace('.', '')
    df['pu_curva'] = df['pu_curva'].str.replace(',', '.')
    df['pu_curva'] = pd.to_numeric(df['pu_curva'], errors='coerce')

for row in df.to_dict('records'):
scraperwiki.sqlite.save(unique_keys=df.columns.values.tolist(), data=row)

print('{} Registros importados com sucesso'.format(len(df)))
importa_arquivo.main()


def main():
# create download folder
create_download_folder()
utils.prepare_download_folder('bases')
utils.prepare_download_folder('downloads')

dt_ini = datetime(1990, 1, 1)
dt_ini = dt_ini.strftime("%Y%m%d")
@@ -109,12 +30,10 @@ def main():
dt_fim = dt_fim.strftime("%Y%m%d")

url_base = 'http://www.debentures.com.br/exploreosnd/consultaadados/mercadosecundario/precosdenegociacao_e.asp'
url = '{}?op_exc=Nada&emissor=&isin=&ativo=&dt_ini={}&dt_fim={}'.format(url_base, dt_ini, dt_fim)

process_file(url)
url = '{}?op_exc=Nada&emissor=&isin=&ativo=&dt_ini={}&dt_fim={}'.format(
url_base, dt_ini, dt_fim)

# rename file
os.rename('scraperwiki.sqlite', 'data.sqlite')
process_file(url)


if __name__ == '__main__':
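
utils.py itself is not part of the diffs shown here (only its compiled __pycache__/utils.cpython-37.pyc appears above), so the helpers the new scraper.py relies on are not visible. Below is a minimal sketch of what utils.download and utils.prepare_download_folder might look like, inferred from the call sites in this commit; the names match those call sites, but the signatures and behaviour are assumptions:

# Hypothetical utils.py sketch; the real module is not shown in this commit.
import os

import requests


def prepare_download_folder(dir_name):
    # Assumed behaviour: make sure the folder exists before files are written into it.
    os.makedirs(dir_name, exist_ok=True)


def download(url, headers, file_path):
    # Assumed behaviour: stream the URL to file_path and return False on failure,
    # mirroring the download_file() removed from scraper.py in this commit.
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code != 200:
        return False
    with open(file_path, 'wb') as handle:
        for chunk in response.iter_content(chunk_size=8192):
            handle.write(chunk)
    return True
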
