
Commit

adjusted execution time
royopa committed Sep 29, 2020
1 parent 67b6dcc commit cec94b9
Showing 7 changed files with 4,499 additions and 133 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -16,6 +16,9 @@ build/
 develop-eggs/
 dist/
 downloads/
+downloads/*
+bases/
+bases/*
 eggs/
 .eggs/
 lib/
89 changes: 89 additions & 0 deletions importa_arquivos.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python
# coding: utf-8
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
import scraperwiki


def main():
    # report the most recent reference date already stored, if any
    file_path = os.path.join('bases', 'debentures.csv')
    if os.path.exists(file_path):
        df_base = pd.read_csv(file_path)
        df_base['data_referencia'] = pd.to_datetime(df_base['data_referencia'])
        print('Máxima data de referência', df_base['data_referencia'].max())

    # read all downloaded CSVs at once with dask
    df = dd.read_csv(
        'downloads/*.csv',
        encoding='latin1',
        skiprows=2,
        sep='\t'
    )

    # drop rows without an asset code and the trailing empty column
    df = df.dropna(subset=['Ativo'])

    del df['Unnamed: 8']

    # convert the dask dataframe to pandas
    df = df.compute()

    df['Data do PU'] = pd.to_datetime(
        df['Data do PU'], format='%d/%m/%Y', errors='ignore')

    df.rename(columns={
        'Data do PU': 'data_referencia',
        'Ativo': 'ativo',
        'Valor Nominal': 'vr_nominal',
        'Juros': 'vr_juros',
        'Prêmio': 'vr_premio',
        'Preço Unitário': 'vr_preco_unitario',
        'Critério de Cálculo': 'criterio_calculo',
        'Situação': 'situacao'},
        inplace=True
    )

    # clean the value columns: drop placeholder dashes...
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace('-', ''))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace('-', ''))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace('-', ''))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace('-', ''))

    # ...drop thousands separators...
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace('.', ''))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace('.', ''))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace('.', ''))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace('.', ''))

    # ...turn the decimal comma into a decimal point...
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.replace(',', '.'))
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.replace(',', '.'))
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.replace(',', '.'))
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.replace(',', '.'))

    # ...strip whitespace and cast to numeric (invalid values become NaN)
    df['vr_nominal'] = df['vr_nominal'].apply(lambda x: x.strip())
    df['vr_juros'] = df['vr_juros'].apply(lambda x: x.strip())
    df['vr_premio'] = df['vr_premio'].apply(lambda x: x.strip())
    df['vr_preco_unitario'] = df['vr_preco_unitario'].apply(
        lambda x: x.strip())

    df['vr_nominal'] = pd.to_numeric(df['vr_nominal'], errors='coerce')
    df['vr_juros'] = pd.to_numeric(df['vr_juros'], errors='coerce')
    df['vr_premio'] = pd.to_numeric(df['vr_premio'], errors='coerce')
    df['vr_preco_unitario'] = pd.to_numeric(
        df['vr_preco_unitario'], errors='coerce')

    # append the consolidated result to the output file
    print('Salvando resultado capturado no arquivo', file_path)
    df.to_csv(file_path, mode='a', index=False)

    print('Iniciando importação para base de dados')
    for row in df.to_dict('records'):
        try:
            scraperwiki.sqlite.save(
                unique_keys=['data_referencia', 'ativo'], data=row)
        except Exception as e:
            print("Error occurred:", e)
            continue
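
Note: the value columns above are cleaned with four element-wise .apply(lambda ...) passes per column. A vectorized equivalent using pandas string methods could look like the sketch below; it is not part of this commit, and the helper name clean_numeric_columns is made up for illustration.

import pandas as pd


def clean_numeric_columns(df, columns):
    # hypothetical helper, not in the commit: converts Brazilian-formatted
    # number strings such as '1.234,56' into floats
    for col in columns:
        df[col] = pd.to_numeric(
            df[col]
            .astype(str)
            .str.replace('-', '', regex=False)   # drop placeholder dashes
            .str.replace('.', '', regex=False)   # drop thousands separators
            .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
            .str.strip(),
            errors='coerce'
        )
    return df


# possible usage with the columns produced by main():
# df = clean_numeric_columns(
#     df, ['vr_nominal', 'vr_juros', 'vr_premio', 'vr_preco_unitario'])

The replacement order mirrors the original code: dashes and thousands separators are removed first, the decimal comma becomes a point, and pd.to_numeric coerces anything left over to NaN.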
15 changes: 8 additions & 7 deletions requirements.txt
@@ -1,7 +1,8 @@
-scraperwiki
-lxml
-cssselect
-pandas
-wget
-BeautifulSoup4
-tqdm
+tqdm==4.47.0
+pandas==1.0.5
+requests==2.24.0
+bizdays==0.3.0
+numpy==1.18.5
+dask==2.20.0
+beautifulsoup4==4.9.2
+scraperwiki==0.5.1
2 changes: 1 addition & 1 deletion runtime.txt
@@ -1 +1 @@
-python-3.6.2
+python-3.7.7
188 changes: 63 additions & 125 deletions scraper.py
@@ -1,151 +1,89 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
-import requests
-from bs4 import BeautifulSoup
-import csv
 import os
-import pandas as pd
-import scraperwiki

-def prepare_download_folder(folder_name):
-    folder_path = os.path.join('downloads', folder_name)
-    return prepare_folder(folder_path)


-def prepare_folder(folder_path):
-    if not os.path.exists(folder_path):
-        Path(folder_path).mkdir(parents=True, exist_ok=True)

-    return folder_path


-def download_file(url, file_path):
-    file_path_csv = file_path.replace('.ZIP', '.CSV')
-    if os.path.exists(file_path) or os.path.exists(file_path_csv):
-        print('Arquivo já baixado anteriormente', file_path)
-        return False

-    response = requests.get(url, stream=True)

-    if response.status_code != 200:
-        print('Arquivo não encontrado', url, response.status_code)
-        return False

-    with open(file_path, "wb") as handle:
-        print('Downloading', url)
-        for data in response.iter_content():
-            handle.write(data)
-        handle.close()
-    return True

+import time
+from datetime import datetime, timedelta

+import pandas as pd
+import requests
+from bs4 import BeautifulSoup

-def create_download_folder():
-    # Create directory
-    dirName = os.path.join('downloads')

-    try:
-        # Create target Directory
-        os.mkdir(dirName)
-        print("Directory", dirName, "Created ")
-    except FileExistsError:
-        print("Directory", dirName, "already exists")
+import importa_arquivos
+import utils


-def get_urls():
-    url_base = 'http://www.debentures.com.br/exploreosnd/consultaadados/emissoesdedebentures/'
-    url = url_base+'puhistorico_f.asp'
+def get_links(data_inicial):
+    url_base = 'http://www.debentures.com.br'
+    url_pu = f'{url_base}/exploreosnd/consultaadados/emissoesdedebentures/'
+    url = url_pu+'puhistorico_f.asp'
     res = requests.get(url)

     while res.status_code != 200:
-        res = requests.get(url)
+        res = requests.get(url)

-    soup = BeautifulSoup(res.text,"html.parser")
-    select = soup.find("select", {"name":"ativo"})
+    soup = BeautifulSoup(res.text, "html.parser")
+    select = soup.find("select", {"name": "ativo"})

     urls = []
     for option in select.find_all('option'):
         ativo = option['value'].strip()

-        if len(ativo) < 6:
-            continue

-        url_download = url_base + 'puhistorico_e.asp?op_exc=False&dt_ini=&dt_fim=&Submit.x=34&Submit.y=13&ativo='+ativo+'++++'
-        urls.append({'ativo': ativo,'url':url_download})

-    return urls


-def download_files_debentures(urls):
-    for url in urls:
-        try:
-            print('Baixando arquivo do ativo', url['ativo'])
-            name_file = url['ativo']+'.csv'
-            path_file = os.path.join('downloads', name_file)
-            # download file
-            download_file(url['url'], path_file)
-        except:
-            print('Erro', url)
+        if len(ativo) < 6:
+            continue

+        ativo = ativo.replace(' ', '+')


-def process_files_debentures():
-    download_path = os.path.join('downloads')
-    for file_name in os.listdir(download_path):
-        path_file = os.path.join(download_path, file_name)
-        print('Processando arquivo', path_file)
-        process_file(path_file)
-        # remove processed file
-        os.remove(path_file)


-def process_file(file_path):
-    df = pd.read_csv(
-        file_path,
-        skiprows=2,
-        encoding='iso-8859-1',
-        sep='\t'
-    )

-    print('Importing {} items'.format(len(df)))

-    # remove rows with problems
-    df = df[df['Ativo'].notnull()]

-    # remove unnamed columns
-    df.drop('Unnamed: 8', axis=1, inplace=True)

-    # print(df.tail())

-    for index, row in df.iterrows():
-        try:
-            data = {
-                'data': row['Data do PU'],
-                'ativo': row['Ativo'],
-                'valor_nominal': row['Valor Nominal'],
-                'valor_juros': row['Juros'],
-                'valor_premio': row['Prêmio'],
-                'preco_unitario': row['Preço Unitário'],
-                'criterio_calculo': row['Critério de Cálculo'],
-                'situacao': row['Situação']
-            }
-            scraperwiki.sqlite.save(unique_keys=['data', 'ativo'], data=data)
-        except Exception as e:
-            print("Error occurred:", e)
-            return False
-    return True
+        today = datetime.today()

+        url_compl = '/exploreosnd/consultaadados/emissoesdedebentures/'
+        url = f'{url_base}{url_compl}'
+        url = f'{url}puhistorico_e.asp?'
+        url = f'{url}op_exc=False&dt_ini={data_inicial}&Submit.x=34&Submit.y=13'
+        url = f"{url}&dt_fim={today.strftime('%d/%m/%Y')}&ativo={ativo}++++"
+        urls.append({'ativo': ativo, 'url': url})

-def main():
-    # create download folder
-    create_download_folder()
+    return urls

-    urls = get_urls()
-    download_files_debentures(urls)
-    process_files_debentures()

-    # rename file
-    os.rename('scraperwiki.sqlite', 'data.sqlite')
+def main():
+    utils.prepare_download_folder('downloads')

+    # get the most recent reference date already collected
+    file_path = os.path.join('bases', 'debentures.csv')
+    data_inicial = ''
+    if os.path.exists(file_path):
+        df_base = pd.read_csv(file_path)
+        df_base['data_referencia'] = pd.to_datetime(df_base['data_referencia'])
+        print('Máxima data de referência', df_base['data_referencia'].max())
+        data_inicial = df_base['data_referencia'].max()
+        data_inicial = data_inicial + timedelta(days=1)
+        data_inicial = data_inicial.strftime('%d/%m/%Y')

+    urls = get_links(data_inicial)

+    tamanho = len(urls)
+    for index, url in enumerate(urls):
+        name_file = url['ativo']+'.csv'
+        path_file = os.path.join('downloads', name_file)
+        print(
+            f'{index+1} de {tamanho}',
+            ' Baixando arquivo do ativo',
+            url['ativo'], name_file
+        )

+        utils.download(url['url'], None, path_file)
+        time.sleep(1)

+        if index > 0 and index % 50 == 0:
+            print('Aguardando 30 segundos, para evitar timeout')
+            time.sleep(30)

+    print('Consolidando arquivos baixados')
+    importa_arquivos.main()


 if __name__ == '__main__':
     main()
+    time.sleep(60)
+    # rename file
+    os.rename('scraperwiki.sqlite', 'data.sqlite')
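
The new scraper.py delegates folder creation and downloading to a utils module that is not shown in this view (presumably one of the remaining changed files). Below is a minimal sketch of what utils.prepare_download_folder and utils.download could look like, inferred only from how they are called above; the actual module in the commit may differ.

# utils.py -- hypothetical sketch; the real module in this commit may differ
import os
from pathlib import Path

import requests


def prepare_download_folder(folder_path):
    # create the folder (and any parents) if it does not exist yet
    Path(folder_path).mkdir(parents=True, exist_ok=True)
    return folder_path


def download(url, session, file_path):
    # scraper.py passes None as the second argument; treating it as an
    # optional requests session is an assumption made for this sketch
    http = session or requests
    if os.path.exists(file_path):
        return False
    response = http.get(url, stream=True)
    if response.status_code != 200:
        print('Download failed', url, response.status_code)
        return False
    with open(file_path, 'wb') as handle:
        for chunk in response.iter_content(chunk_size=8192):
            handle.write(chunk)
    return True

The sleep calls in main() (one second per file, thirty seconds every fifty files) do the rate limiting, so the download helper itself only needs to fetch and write the file.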
