## References
- Lib: https://github.com/okfn-brasil/serenata-toolbox
- https://gist.github.com/fgrehm/572ba814d617e831f4b1faac5e0b9165


In [None]:
# Install serenata-toolbox, upgrading it (-U) if an older copy is present.
!pip install -U serenata-toolbox

In [None]:
!mkdir data

In [None]:
import concurrent.futures
import os
import shutil
from urllib.error import ContentTooShortError
from urllib.request import urlopen, urlretrieve

import pandas as pd
from serenata_toolbox.datasets import Datasets
# Point the toolbox at the local data/ directory.
datasets = Datasets('data/')

# Now let's see which datasets are the latest ones available.
for dataset in datasets.downloader.LATEST:
    print(dataset)  # and you'll see a long list of datasets!

# And let's download one of them.
datasets.downloader.download('2018-01-05-reimbursements.xz')  # yay, you've just downloaded this dataset to data/


In [None]:
# Load the downloaded reimbursements file into a DataFrame.
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
reimbursements = pd.read_csv('data/2018-01-05-reimbursements.xz', low_memory=False)

In [None]:
# Number of rows loaded from the dataset.
print("Total reimbursements:", len(reimbursements))

In [None]:
# How many reimbursements there are for each value of the year column.
reimbursements.year.value_counts()

In [None]:
# Keep only the rows whose subquota description is "Congressperson meal",
# then show how the remaining rows are distributed per year.
reimbursements = reimbursements.query('(subquota_description == "Congressperson meal")')
reimbursements.year.value_counts()

In [None]:
# Restrict the data to 2015 onwards and show how many rows are left.
reimbursements = reimbursements.query('year >= 2015')
len(reimbursements)

In [None]:
def url(row):
    """Return the receipt PDF address for one reimbursement row.

    ``row`` must expose ``applicant_id``, ``year`` and ``document_id``
    attributes; they fill the three path segments in that order.
    """
    template = (
        'http://www.camara.gov.br/'
        'cota-parlamentar/documentos/publ/{}/{}/{}.pdf'
    )
    return template.format(row.applicant_id, row.year, row.document_id)

# Compute the receipt address row by row (axis=1) and store it in a new column.
reimbursements['url'] = reimbursements.apply(url, axis=1)

In [None]:
%%time

# Create the download directory; exist_ok makes the call idempotent and avoids
# the race between checking for the directory and creating it.
os.makedirs("data/ocr-receipts", exist_ok=True)

def load_url(document_id, url, timeout):
    """Download one receipt PDF unless it is already on disk.

    Args:
        document_id: Identifier used to name the local file
            (``data/ocr-receipts/<document_id>.pdf``).
        url: Address to download the receipt from.
        timeout: Seconds to wait on blocking network operations before
            giving up.

    Returns:
        ``True`` when the file already existed; otherwise a
        ``(filename, headers)`` tuple, the same shape ``urlretrieve`` returns.

    Raises:
        ContentTooShortError: If fewer bytes arrived than the server announced.
        OSError: On network or filesystem failures (``URLError`` included).
    """
    receipt_file = "data/ocr-receipts/{}.pdf".format(document_id)
    if os.path.exists(receipt_file):
        return True

    # Download next to the final path and rename only on success, so a failed
    # or interrupted transfer never leaves a truncated .pdf behind that a
    # later run would mistake for a finished download.
    partial_file = receipt_file + ".part"
    try:
        with urlopen(url, timeout=timeout) as response:
            headers = response.headers
            with open(partial_file, "wb") as output:
                shutil.copyfileobj(response, output)

        expected = headers.get("Content-Length")
        received = os.path.getsize(partial_file)
        if expected is not None and received < int(expected):
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (received, int(expected)),
                (receipt_file, headers),
            )
        os.replace(partial_file, receipt_file)
    except BaseException:
        # Best-effort cleanup; the original error is what the caller must see.
        try:
            os.remove(partial_file)
        except OSError:
            pass
        raise
    return receipt_file, headers

# Download the receipts concurrently. The with statement ensures the worker
# threads are cleaned up promptly once all downloads have finished.
receipts_downloaded = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # Start the downloads and remember which document id each future belongs
    # to. Zipping the two columns avoids iterrows(), which builds a Series for
    # every row and is slow on large frames.
    future_to_document_id = {
        executor.submit(load_url, doc_id, receipt_url, 60): doc_id
        for doc_id, receipt_url in zip(
            reimbursements['document_id'], reimbursements['url']
        )
    }
    completed = concurrent.futures.as_completed(future_to_document_id)
    for i, future in enumerate(completed, start=1):
        document_id = future_to_document_id[future]
        # Progress marker every 1000 finished downloads.
        if i % 1000 == 0:
            print("At", i)
        try:
            future.result()
        except Exception as exc:
            # Record the failure and keep going with the remaining receipts.
            print('%r generated an exception: %s' % (document_id, exc))
            receipts_downloaded[document_id] = False
        else:
            receipts_downloaded[document_id] = True


In [None]:
%%bash 
# Show how much disk space the downloaded receipts take up.
du -h data/ocr-receipts/

In [None]:
# Attach the download outcome to each reimbursement. The Series built from
# receipts_downloaded is aligned on document_id, so the frame is indexed by
# that column for the assignment and flattened again afterwards.
r = (
    reimbursements
    .set_index('document_id')
    .assign(receipt_downloaded=pd.Series(receipts_downloaded))
    .reset_index()
)
r.head(10)
