Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
royopa committed May 22, 2019
0 parents commit b396f49
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
This is a scraper that runs on [Morph](https://morph.io). To get started, [see the documentation](https://morph.io/documentation).
9 changes: 9 additions & 0 deletions requirements.txt
@@ -0,0 +1,9 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/python

# Custom version of scraperwiki library
-e git+http://github.com/openaustralia/scraperwiki-python.git@morph_defaults#egg=scraperwiki

lxml==3.4.4
cssselect==0.9.1
1 change: 1 addition & 0 deletions runtime.txt
@@ -0,0 +1 @@
python-3.6.2
30 changes: 30 additions & 0 deletions scraper.py
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import scraperwiki
import lxml.html
import os

# Point the scraperwiki library at the SQLite file that morph.io collects.
# NOTE(review): `scraperwiki` is imported before this variable is set; if the
# library reads SCRAPERWIKI_DATABASE_NAME at import time this assignment has
# no effect -- confirm, and move it above the import if so.
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'

# The URL template is the same for every participant, so build it once.
url = 'http://cvmweb.cvm.gov.br/SWB/Sistemas/SCW/CPublica/DemContabeis/CPublicaDemContabeisFI.aspx?PK_PARTIC={}'

# Walk the candidate PK_PARTIC ids from 999998 down to 1, fetch each page and
# store the CNPJ found on it (if any) keyed by the id.
for pk_partic in range(999998, 0, -1):
    # Progress indicator: ids are printed on one line, separated by spaces.
    print(pk_partic, end=' ')

    # The id is stored as text, matching what is substituted into the URL.
    pk_partic = str(pk_partic)
    html = scraperwiki.scrape(url.format(pk_partic))

    # Parse the page so the CNPJ label can be looked up by its element id.
    root = lxml.html.fromstring(html)

    try:
        cnpj_text = root.get_element_by_id("lbNrPfPj").text
        # Keep only the digits' grouping-free form: drop '.', '-' and '/'.
        cnpj = cnpj_text.replace('.', '').replace('-', '').replace('/', '')
    except (KeyError, AttributeError):
        # KeyError: the page has no "lbNrPfPj" element.
        # AttributeError: the element exists but has no text (None).
        # Either way there is nothing to record for this id.
        continue

    # Write out to the sqlite database using the scraperwiki library;
    # `pk_partic` is the unique key, so re-runs overwrite instead of duplicating.
    scraperwiki.sqlite.save(
        unique_keys=['pk_partic'],
        data={
            "pk_partic": pk_partic,
            "cnpj": cnpj,
        },
    )
Binary file added scraperwiki.sqlite
Binary file not shown.

0 comments on commit b396f49

Please sign in to comment.