# EWG Tap Water Database Crawler

In [4]:

import os
import time
import csv
import asyncio
import json
from urllib.request import urlopen
import sys
import asyncio
import aiohttp
import json
from pprint import pprint
import requests
from pyquery import PyQuery as pq
from sein import log

class ParseException(Exception):
    pass

def record_csv(headers, content, path):
  with open(path, 'w', encoding='utf-8') as f:
    f_csv = csv.writer(f, lineterminator='\n')
    f_csv.writerow(headers)
    f_csv.writerows(content)
  print('save csv done', path)


In [9]:
def fetch_page(url):
    r = requests.get(url)
    return r.content


  
def make_url_list(codes):
    url_fmt = 'https://www.ewg.org/tapwater/system.php?pws={}'.format
    return [url_fmt(c) for c in codes.splitlines() if c.strip()]


def parse_nitrate_content(content, sleep_sec):
    time.sleep(sleep_sec)
    doc = pq(content)
    for div in doc.find('div.slide-toggle'):
      h3 = pq(div).find('p')
      if h3.text().startswith('Nitrate, a fertilizer chemical,'):
        nitrate_div = div
        break
    else:
      raise ParseException('Not found Nitrate block')

    figure = pq(nitrate_div).find('figure.levels-compare-figure')

    state = figure.find('.state-ppb-popup').text()
    national = figure.find('.national-ppb-popup').text()
    utility = figure.find('.this-utility-ppb-popup').text()

    return state, national, utility
  

In [10]:

codes = '''

NY7003493
NY3330032
NY5110526
NY7003666
NY7011735
NY2701047
NY3304336
NY1400422
NY4303673
NY1400443
NY2902835
NY2704518
NY5903465
NY3304334
NY2715672
NY2902840
NY5103247
NY1400444
NY5903444
NY3202411
NY2902830
NY0004553
NY2900000
NY0100189
NY5103263
NY1400399
NY0100198
NY1404557
NY5903441
NY0701008
NY1404556
NY4600070

'''

url_list = make_url_list(codes)


session = aiohttp.ClientSession()
async def fetch_ewg_page(url):
    print('async fetch_ewg_page {url}'.format(**locals()))
    async with session.get(url) as resp:
        body = await resp.read()
        try:
            ret = parse_nitrate_content(body, sleep_sec=1)
            print(ret)
        except ParseException:
            print('can not parse')
    print('async fetch_ewg_page {url} done'.format(**locals()))
    
    
def run_async_tasks(tasks):

    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(asyncio.gather(*tasks))


run_async_tasks([fetch_ewg_page(url) for url in url_list])


Creating a client session outside of coroutine
client_session: <aiohttp.client.ClientSession object at 0x000000000798EDA0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000000007566588>


async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY3330032
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY1404557
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY0100189
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY2902830
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY1404556
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY4303673
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY7003493
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY7011735
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY7003666
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY0004553
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY3304336
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY2701047
async fetch_ewg_page https://www.ewg.org/tapwater/system.php?pws=NY2902835
async fetch_ewg_page http

In [39]:





# for pwsid in code.splitlines():
#     if not pwsid: 
#         continue
#     url = 'https://www.ewg.org/tapwater/system.php?pws={}'.format(pwsid)
#     print(url)
#     content = fetch_page(url)
#     try:
#         ret = parse_nitrate_content(content)
#         print(ret)
#     except ParseException:
#         print('can not parse')

https://www.ewg.org/tapwater/system.php?pws=NY2100644
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3201450
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3530213
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3221176
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3512263
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3700998
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY2230013
('0.895 ppm', '0.969 ppm', '0.680 ppm')
https://www.ewg.org/tapwater/system.php?pws=NY2230023
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY2701042
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3730024
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3212122
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY2230090
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3221798
can not parse
https://www.ewg.org/tapwater/system.php?pws=NY3700919
can not parse
https://www.ewg.org/ta

In [41]:
8 +19 +120 +282 +151 +21 +239 +567 +202 +3549 +1810 +4194 +273 +3967 +18131 +4588 +11043 +1300 +19678 +5469 +18 +228 +5201 +4321 +26598 +9488 +1683 +2189 +5051 +3476 +7800 +6191 +25680 +20946 +5698 +203 +3172 +4581 +23139 +1591 +2805 +4003 +14658 +2591 +250 +1364 +25918 +15903 +5603 +5802 +22436 +965 +1112 +5894 +1538 +4939 +15282 +2114 +8721 +986 +4286 +9465 +23678 +4337 +1859

419374