In [None]:
from bs4 import BeautifulSoup
from collections import defaultdict
import csv
import datetime
from functools import lru_cache
import os
import random
import re
import requests
import string
import subprocess

def extract_park_geo(park_doc):
    """Pull the map variables embedded in a park page's script text.

    Searches ``park_doc`` for assignments of the form ``var <name> = <value>;``
    for each of ``latitude``, ``longitude``, ``signName`` and ``parkId``.

    Returns a dict keyed by those four names. Each value is the captured text
    with surrounding double quotes stripped, or ``None`` when no assignment
    was found (a "Missing ..." notice is printed in that case).
    """
    geo = {}
    for var in ("latitude", "longitude", "signName", "parkId"):
        found = re.search(f"var {var} = ([^;]+);", park_doc)
        if found:
            # Values are kept as strings; only the quoting is removed.
            geo[var] = found.group(1).strip('"')
            continue
        print(f"Missing {var}")
        geo[var] = None
    return geo


def prefetch_local_pages(root_dir):
    """Mirror the bathrooms listing and the pages it links to under ``root_dir``.

    Runs ``wget`` recursively (two levels deep) starting from the NYC Parks
    bathrooms page, restricted to the ``parks`` and ``facilities``
    directories and rejecting pdf/jpg/video files. Prints the argument list,
    then wget's exit status.

    wget is launched with ``root_dir`` as its working directory instead of
    changing the directory of the calling process, so relative paths used
    afterwards (such as ``LocalScraper.ROOT_DIR``) still resolve from where
    the caller started.
    """
    cmd = """wget --recursive --level 2 https://www.nycgovparks.org/facilities/bathrooms 
    -I parks,facilities --reject pdf,jpg,video --wait 0.1 --timeout 1 --tries 3""".split()
    print(cmd)
    # cwd= scopes the directory change to the child process only.
    print(subprocess.call(cmd, cwd=root_dir))


class LocalScraper:
    """Scraper backend that reads pages from a local mirror of the site.

    Offers the same two methods as ``OnlineScraper`` (``fetch_potty_doc`` and
    ``fetch_park_geo``) so either can be passed wherever a scraper is needed.
    """

    # Directory of the mirrored site, resolved against the current working
    # directory at call time.
    ROOT_DIR = 'data/www.nycgovparks.org/'

    def fetch_potty_doc(self):
        """Return the text of the mirrored bathrooms listing page."""
        potty_page = f"{self.ROOT_DIR}/facilities/bathrooms"
        # Close the handle deterministically and decode as UTF-8, the same
        # encoding OnlineScraper uses for pages from this site.
        with open(potty_page, encoding="utf-8") as fh:
            return fh.read()

    @lru_cache(maxsize=None)
    def fetch_park_geo(self, park_id):
        """Return the geo variables for ``park_id`` from its mirrored map page.

        Returns an empty dict (after printing a notice) when the park has no
        directory in the mirror or the directory has no ``map`` file;
        otherwise returns whatever ``extract_park_geo`` finds in that file.
        Results are memoized per (instance, park_id), so repeated calls
        return the same dict object.
        """
        park_dir = f"{self.ROOT_DIR}/parks/{park_id}"
        if not os.path.isdir(park_dir):
            print(f"{park_id} not a dir")
            return {}

        map_page = f"{park_dir}/map"
        if not os.path.isfile(map_page):
            print(f"{park_id} no map")
            return {}

        with open(map_page, encoding="utf-8") as fh:
            return extract_park_geo(fh.read())
    
    
class OnlineScraper:
    """Scraper backend that fetches pages live from www.nycgovparks.org.

    Offers the same two methods as ``LocalScraper`` (``fetch_potty_doc`` and
    ``fetch_park_geo``) so either can be passed wherever a scraper is needed.
    """

    ROOT_URL = 'https://www.nycgovparks.org'

    # Seconds to wait on the server before giving up. Without a timeout a
    # stalled connection would block the whole scrape indefinitely.
    TIMEOUT = 30

    def fetch_potty_doc(self):
        """Return the raw bytes of the bathrooms listing page."""
        potty_page = f"{self.ROOT_URL}/facilities/bathrooms"
        return requests.get(potty_page, timeout=self.TIMEOUT).content

    @lru_cache(maxsize=None)
    def fetch_park_geo(self, park_id):
        """Return the geo variables found on the map page of ``park_id``.

        The response body is decoded as UTF-8 and handed to
        ``extract_park_geo``. Results are memoized per (instance, park_id),
        so each park's map page is requested at most once per scraper.
        """
        park_page = f"{self.ROOT_URL}/parks/{park_id}/map"
        response = requests.get(park_page, timeout=self.TIMEOUT)
        return extract_park_geo(response.content.decode('utf-8'))


def potty_list(doc):
    """Parse the bathrooms listing page into a list of per-bathroom dicts.

    The page is expected to contain a ``ul`` with class ``nav-tabs`` whose
    links name the boroughs, and an element with class ``tab-content`` whose
    ``div.tab-pane`` children each hold the table rows for one borough.

    Returns a list of dicts with the keys ``park_borough``, ``potty_name``,
    ``park_name``, ``park_page``, ``potty_location`` and ``accessible``.
    Rows with fewer than two ``td`` cells (header rows included) or with an
    empty park cell are skipped.
    """
    soup = BeautifulSoup(doc, 'html.parser')

    # Map each tab link's href ("#<pane id>") to the text shown on the tab.
    boroughs = {}
    for li in soup.find('ul', class_='nav-tabs').find_all('li'):
        link = li.find('a')
        boroughs[link.attrs['href']] = link.text

    potties = []

    tab = soup.find(class_='tab-content')
    for pane in tab.find_all('div', class_='tab-pane'):
        boro = boroughs[f"#{pane.attrs['id']}"]

        for r in pane.find_all('tr'):
            td = r.find_all('td')
            # Both the name cell (td[0]) and the park cell (td[1]) are
            # needed; rows lacking either are skipped rather than crashing.
            if len(td) < 2:
                continue

            park = list(td[1].children)
            if not park:
                continue
            park_link = park[0]

            # The location is the third child of the park cell when present,
            # otherwise the text of the second child. A cell holding only
            # the link has no location at all.
            if len(park) > 2:
                # Plain str so both branches yield the same type.
                location = str(park[2])
            elif len(park) == 2:
                location = park[1].text
            else:
                location = ''

            potties.append({
                'park_borough': boro,
                'potty_name': td[0].text,
                'park_name': park_link.text,
                # Drops the first 7 characters of the href, the length of
                # "/parks/" -- presumably hrefs look like "/parks/<id>";
                # verify against the live page.
                'park_page': park_link.attrs['href'][7:],
                'potty_location': location,
                'accessible': (len(td) > 3 and
                               td[3].find('img', alt='Accessible') is not None),
            })

    return potties


def potty_rec(potty):
    """Turn one scraped bathroom dict into a row for the output CSV.

    ``potty`` must provide ``park_borough``, ``potty_name``, ``park_name``,
    ``potty_location``, ``park_geo`` (a dict), ``park_page`` and
    ``accessible``.

    The returned dict gets a fresh random ``ID`` ("P" plus five uppercase
    letters/digits), empty ``Type``, ``Number of stalls`` and ``Hours``
    fields, and the current local time/date as ``Last updated`` and
    ``Scrape date``. ``Lat``/``Lon`` fall back to ``''`` when the geo dict
    lacks the corresponding key.
    """
    id_alphabet = string.ascii_uppercase + string.digits

    rec = {}
    rec['ID'] = "P" + "".join(random.choices(id_alphabet, k=5))
    rec['Borough'] = potty['park_borough']
    rec['Name'] = potty['potty_name']
    rec['Park Name'] = potty['park_name']
    rec['Human-readable location'] = potty['potty_location']

    geo = potty['park_geo']
    rec['Lat'] = geo.get('latitude', '')
    rec['Lon'] = geo.get('longitude', '')

    page = potty['park_page']
    rec['Location URL'] = f"https://www.nycgovparks.org/parks/{page}"

    # Not available from the scraped pages; left blank.
    rec['Type'] = ''
    rec['Number of stalls'] = ''
    rec['Accessible'] = potty['accessible']
    rec['Hours'] = ''

    rec['Last updated'] = datetime.datetime.now().isoformat()
    rec['Scrape date'] = datetime.date.today().isoformat()
    return rec


def write_csv(output_fh, potties):
    """Write a header line and one CSV row per bathroom to ``output_fh``.

    ``output_fh`` is a writable text file object; each item of ``potties``
    is converted to a row with ``potty_rec`` before being written.
    """
    columns = [
        'ID',
        'Borough',
        'Name',
        'Park Name',
        'Human-readable location',
        'Lat',
        'Lon',
        'Location URL',
        'Type',
        'Number of stalls',
        'Accessible',
        'Hours',
        'Last updated',
        'Scrape date',
    ]
    writer = csv.DictWriter(output_fh, fieldnames=columns)
    writer.writeheader()
    writer.writerows(potty_rec(potty) for potty in potties)
    

def get_potties_with_park_geo(scraper):
    """Scrape the bathroom list and attach each bathroom's park geo data.

    ``scraper`` must provide ``fetch_potty_doc()`` and
    ``fetch_park_geo(park_page)``. Every dict produced by ``potty_list``
    gains a ``park_geo`` entry holding the scraper's result for that
    bathroom's ``park_page``; the list of dicts is returned.
    """
    potties = potty_list(scraper.fetch_potty_doc())

    for potty in potties:
        potty['park_geo'] = scraper.fetch_park_geo(potty['park_page'])

    return potties

In [None]:
# Offline mode: mirror the site once, then read the pages from disk.
# prefetch_local_pages('/Users/ivancho/src/pottyproject/data/')
# scraper = LocalScraper()

# Online mode: fetch every page live.
scraper = OnlineScraper()

ps = get_potties_with_park_geo(scraper)

# (number of bathrooms, number of bathrooms with a non-empty park_geo dict)
len(ps), sum(1 for p in ps if p['park_geo'])

In [None]:
# Save all the data we scraped.
# newline='' hands line-ending control to the csv module, as its
# documentation requires; without it an extra '\r' is written on platforms
# that translate '\n' on output.
with open('potty_raw.csv', 'w', newline='') as fh:
    write_csv(fh, ps)