# EWG Tap Water Database Crawler

In [None]:

import os
import time
import csv
import asyncio
import json
from urllib.request import urlopen
import sys
import asyncio
import aiohttp
import json
from pprint import pprint
import requests
from pyquery import PyQuery as pq
# from sein import log

class ParseException(Exception):
    """Signals that a fetched page did not contain the block being parsed."""

def record_csv(headers, content, path):
    """Write a header row followed by data rows to ``path`` as UTF-8 CSV.

    Args:
        headers: Sequence of column names, written as the first row.
        content: Iterable of rows (each a sequence of field values).
        path: Destination file; an existing file is overwritten.
    """
    # newline='' leaves line endings entirely to the csv writer, as the csv
    # module documentation asks for. Without it, text-mode translation turns
    # the '\n' terminator (and newlines inside quoted fields) into '\r\n' on
    # Windows.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(headers)
        writer.writerows(content)
    print('save csv done', path)
  
  
def windows(iterable, length=2, overlap=0, yield_tail=False):
    """Slice an iterable into fixed-size windows, optionally overlapping.

    A window of ``length`` items slides over the input. When ``overlap`` is
    non-zero, the last ``overlap`` items of one window are repeated at the
    start of the next one.

    Args:
        iterable: Source of items; consumed lazily.
        length: Number of items in each full window.
        overlap: Number of items shared by consecutive windows; must be
            smaller than ``length``.
        yield_tail: Whether to also yield the final leftover items when
            fewer than ``length`` remain.

    Yields:
        Lists of items, each a fresh list object.

    Raises:
        AttributeError: If ``length <= overlap``. Because this is a
            generator, the check runs when iteration starts, not at call
            time.
    """
    from itertools import islice

    if overlap >= length:
        raise AttributeError('overlap {} cannot larger than length {}'.format(overlap, length))

    source = iter(iterable)
    step = length - overlap
    window = list(islice(source, length))
    while len(window) == length:
        yield window
        # Keep the overlapping suffix, then top up with the next `step` items.
        window = window[step:] + list(islice(source, step))
    if yield_tail and window:
        yield window

In [None]:
def fetch_page(url, timeout=30):
    """Download ``url`` with a blocking GET and return the raw body.

    Args:
        url: Address to request.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Passing ``None`` restores the previous behaviour of
            waiting indefinitely for the server.

    Returns:
        The response body as bytes (``Response.content``). The HTTP status
        code is not inspected.
    """
    # Without a timeout a stalled server would block this call forever.
    response = requests.get(url, timeout=timeout)
    return response.content


  
def make_code_list(path):
    """Collect the first comma-separated field of every data line in ``path``.

    Lines that start with ``'PWS ID'`` (the header) and lines that are blank
    or whitespace-only are skipped.

    Args:
        path: Text file to read, one record per line.

    Returns:
        List of first-column values, in file order, with surrounding
        whitespace removed.
    """
    codes = []
    with open(path, 'r') as f:
        # Iterate the file directly instead of loading every line up front.
        for line in f:
            if line.startswith('PWS ID') or not line.strip():
                continue
            # strip() matters when a line has no comma: the "first field" is
            # then the whole line, trailing newline included.
            codes.append(line.split(',')[0].strip())
    return codes


def parse_nitrate_content(content):
    """Pull the three nitrate comparison values out of a page's markup.

    The first ``div.slide-toggle`` whose ``<p>`` text begins with
    ``'Nitrate, a fertilizer chemical,'`` is taken as the nitrate block. The
    values are read from the ``figure.levels-compare-figure`` inside it.

    Args:
        content: Page markup, passed straight to PyQuery.

    Returns:
        Tuple ``(state, national, utility)`` holding the text of the
        ``.state-ppb-popup``, ``.national-ppb-popup`` and
        ``.this-utility-ppb-popup`` elements.

    Raises:
        ParseException: If no ``div.slide-toggle`` matches the nitrate text.
    """
    marker = 'Nitrate, a fertilizer chemical,'
    page = pq(content)

    def is_nitrate_block(block):
        return pq(block).find('p').text().startswith(marker)

    nitrate_div = next(filter(is_nitrate_block, page.find('div.slide-toggle')), None)
    if nitrate_div is None:
        raise ParseException('Not found Nitrate block')

    figure = pq(nitrate_div).find('figure.levels-compare-figure')
    selectors = ('.state-ppb-popup', '.national-ppb-popup', '.this-utility-ppb-popup')
    state, national, utility = [figure.find(selector).text() for selector in selectors]
    return state, national, utility
  

In [None]:

# Inline lists of PWS IDs, one ID per line.
# NOTE: this first, NY-prefixed literal is dead data. `codes` is rebound to
# the PR-prefixed list right below, so these IDs are never reachable.
codes = '''NY7003493
NY3330032
NY5110526
NY7003666
NY7011735
NY2701047
NY3304336
NY1400422
NY4303673
NY1400443
NY2902835
NY2704518
NY5903465
NY3304334
NY2715672
NY2902840
NY5103247
NY1400444
NY5903444
NY3202411
NY2902830
NY0004553
NY2900000
NY0100189
NY5103263
NY1400399
NY0100198
NY1404557
NY5903441
NY0701008
NY1404556
NY4600070'''

# Rebinds `codes` to the PR-prefixed IDs; this is the value left in `codes`.
# `codes` is not passed to any call in this file as written: the final
# run_async_tasks(...) call reads its IDs from a CSV file instead.
codes = '''PR0002591
PR0005477
PR0002000
PR0003283
PR0003824
PR0003293
PR0002652
PR0005386
PR0002672
PR0005296
PR0005306
PR0005176
PR0002772
PR0003262
PR0003343
PR0004745
PR0004664
PR0004835
PR0004314
PR0005086
PR0003904'''

# Shared aiohttp session. It starts out as None and is created on first use
# by _get_session(), i.e. from inside a running coroutine rather than at
# import time. Nothing in this block closes it.
session = None


def _get_session():
    """Return the shared ``aiohttp.ClientSession``, creating it if needed."""
    global session
    # Called from within fetch_ewg_page, so an event loop is running when the
    # session is constructed. A closed session is replaced so a later run can
    # still fetch.
    if session is None or session.closed:
        session = aiohttp.ClientSession()
    return session


async def fetch_ewg_page(code, output_file):
    """Fetch one EWG tap-water system page and append its nitrate row.

    Args:
        code: PWS ID placed in the ``pws`` query parameter of the URL.
        output_file: Path of the CSV file the result row is appended to.

    The appended row is ``code, state, national, this_utility, url``. When
    the page cannot be parsed (``ParseException``), the three values are
    written as ``'not_found'``. Network errors raised by aiohttp are not
    caught here.
    """
    url = 'https://www.ewg.org/tapwater/system.php?pws={}'.format(code)

    # Only the download needs the connection; release it before parsing and
    # writing to disk.
    async with _get_session().get(url) as resp:
        body = await resp.read()

    try:
        ret = parse_nitrate_content(body)
        print('done {url} {ret}'.format(**locals()))
    except ParseException:
        ret = ['not_found', 'not_found', 'not_found']
        print('fail {url} can not parse'.format(**locals()))

    # csv.writer quotes any value containing a comma, quote or newline, which
    # the previous hand-built '{},{},...' line did not. The file is opened in
    # plain text append mode, matching how the header line is written.
    with open(output_file, 'a') as output:
        row_writer = csv.writer(output, lineterminator='\n')
        row_writer.writerow([code, ret[0], ret[1], ret[2], url])
    
    
def run_async_tasks(codes, output_file, task_per_turn=5, task_sleep=10):
    """Fetch every code in batches, appending the results to ``output_file``.

    Args:
        codes: Iterable of PWS IDs to fetch.
        output_file: CSV path that receives the header and one row per code.
        task_per_turn: Number of pages fetched concurrently in each batch.
        task_sleep: Seconds to pause between consecutive batches.
    """
    # Write the header only when the file is new or empty. Appending it
    # unconditionally put a second header line in the middle of the data
    # whenever the same output file was reused.
    if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
        with open(output_file, 'a') as output:
            output.write('pwsid,state,national,this_utility,url\n')

    loop = asyncio.get_event_loop()
    batches = windows(codes, length=task_per_turn, yield_tail=True)
    for index, part in enumerate(batches):
        if index:
            # Throttle between batches only; there is nothing to wait for
            # once the last batch has finished.
            time.sleep(task_sleep)
        print('send new tasks')
        tasks = [fetch_ewg_page(code, output_file) for code in part]
        loop.run_until_complete(asyncio.gather(*tasks))

        

# Start the crawl. IDs are read from the first column of 'NY_[25918].csv' and
# rows are appended to 'NY_[25918]_fetch_nitrate.csv', in batches of 10 with
# task_sleep=3 seconds.
# To crawl the inline `codes` list instead, pass `codes.splitlines()` as the
# first argument.
run_async_tasks(
    make_code_list('NY_[25918].csv'),
    output_file='NY_[25918]_fetch_nitrate.csv',
    task_per_turn=10,
    task_sleep=3,
)
