In [428]:
import numpy as np
import pandas as pd
from time import sleep
import sqlite3
import json
import dill
from itertools import chain
import geopy.distance

from pyinaturalist.node_api import get_places_autocomplete, get_places_by_id, get_places_nearby
from pyinaturalist.node_api import (
    get_observations,
    get_observation_species_counts,
    get_observation_observers,
    get_observation_identifiers,
)

import matplotlib.pyplot as plt

In [875]:
# get park info

# import sys
# sys.path.append("../")
from utility import get_info, find_city_level_address

# Load every park-info database with `get_info` and stack the frames into one table.
info_list = [
    '../philly_parks_info.db',
    '../philly_state_parks_info.db',
    '../nyc_parks_info.db',
]
info_to_concat = [get_info(info_db) for info_db in info_list]

# Renumber the rows 0..n-1 so that later cells can address a park by its position.
info_df = pd.concat(info_to_concat, axis=0).reset_index(drop=True)
info_df.shape

(4614, 5)

In [960]:
# Clear the autocomplete (inat_ac_*) results of a previous run before they are recomputed.
# `errors='ignore'` keeps this cell runnable on a fresh kernel, where these columns
# have not been created yet (they are first written by the autocomplete-matching cell).
# info_df.drop(columns=['inat_nb_place_id', 'inat_nb_location_lat', 'inat_nb_location_lng', 'inat_nb_name', 'inat_nb_distance'], inplace=True)
info_df.drop(
    columns=['inat_ac_place_id', 'inat_ac_location_lat', 'inat_ac_location_lng',
             'inat_ac_name', 'inat_ac_distance'],
    inplace=True,
    errors='ignore',
)


We will use iNaturalist to explore biodiversity in the parks of interest. To do so, we first need to find the iNaturalist `place_id` of each park, and then query the species count recorded under that `place_id`.

## Get inaturalist place_id
Some examples using `get_places_autocomplete()`

In [87]:
response = get_places_autocomplete(q='Bartram\'s garden, PA')
# other examples: 'Starlight Park' #'Abbott Marshlands' #'Quiet Waters Park, Annapolis, MD'
response['results'][0]

{'ancestor_place_ids': [97394, 1, 42, 2983, 68325, 122846],
 'bounding_box_geojson': {'coordinates': [[[-75.2159535885, 39.9282339211],
    [-75.2159535885, 39.935374929],
    [-75.2061903477, 39.935374929],
    [-75.2061903477, 39.9282339211],
    [-75.2159535885, 39.9282339211]]],
  'type': 'Polygon'},
 'bbox_area': 6.971937968240231e-05,
 'admin_level': None,
 'place_type': 20,
 'name': "Bartram's Garden",
 'location': [39.9320063533, -75.2110479939],
 'id': 122846,
 'display_name': "Bartram's Garden, PA, US",
 'uuid': '477c2ab3-cf8a-4fc9-9002-1e8a338e465f',
 'slug': 'bartram-s-garden',
 'geometry_geojson': {'coordinates': [[[[-75.2068555355072, 39.9353749290365],
     [-75.21569609642029, 39.93325244917665],
     [-75.2159535884857, 39.93284110825641],
     [-75.21558880805969, 39.932676571196126],
     [-75.2138078212738, 39.93305500584344],
     [-75.21365761756896, 39.932791747179856],
     [-75.21395802497864, 39.932248772988316],
     [-75.21355032920837, 39.93198551122373],
 

In [113]:
get_places_by_id(125178)

{'total_results': 1,
 'page': 1,
 'per_page': 1,
 'results': [{'ancestor_place_ids': [50605, 125178],
   'bounding_box_geojson': {'coordinates': [[[-73.884408474, 40.8298426409],
      [-73.884408474, 40.8372556055],
      [-73.8792586327, 40.8372556055],
      [-73.8792586327, 40.8298426409],
      [-73.884408474, 40.8298426409]]],
    'type': 'Polygon'},
   'bbox_area': 3.817559125251798e-05,
   'admin_level': None,
   'place_type': None,
   'name': 'Starlight Park',
   'location': [40.8338378995, -73.8818974355],
   'id': 125178,
   'display_name': 'Starlight Park',
   'uuid': '1349c5d9-f36f-4e0b-af78-53865fb2a6bf',
   'slug': 'starlight-park',
   'geometry_geojson': {'coordinates': [[[[-73.88402223587036,
        40.82984264090415],
       [-73.8795804977417, 40.83553475463228],
       [-73.87925863265991, 40.83577827397769],
       [-73.87996673583984, 40.836281544456924],
       [-73.87938737869261, 40.8368822171243],
       [-73.88056755065918, 40.83725560549862],
       [-73.88

Using `/places/nearby`

In [120]:
bounding_box = [38.946569315132784, -76.49152286765548, 38.930909760960105, -76.51493759405312]
response0 = get_places_nearby(*bounding_box)
# response0 = get_places_nearby(*bounding_box, name='Quite*')#'Quite Waters Park, Annapolis, MD')
response0

{'total_results': 0,
 'page': 1,
 'per_page': 0,
 'results': {'standard': [], 'community': []}}

In [82]:
# Which result list of the nearby response to inspect: 'standard' or 'community'.
key = 'community'
num_results = len(response0['results'][key])

# Use `key` here as well; the list name was previously hard-coded, so changing
# `key` above silently had no effect on what was printed.
print({p['id']: p['name'] for p in response0['results'][key]})

['MarineGEO Chesapeake Bay',
 'South River, Edgewater, MD',
 'USGS Quad: Annapolis',
 'USGS Quad: South River',
 'Quiet Waters Park, Annapolis, MD']

In [118]:
bounding_box = (150.0, -50.0, -149.999, -49.999)

response = get_places_nearby(*bounding_box, name='Mahurangi College')
response

{'total_results': 1,
 'page': 1,
 'per_page': 1,
 'results': {'standard': [],
  'community': [{'ancestor_place_ids': [97393, 6803, 8345, 119755],
    'bounding_box_geojson': {'coordinates': [[[-185.3471767902374,
        -36.40698885244839],
       [-185.3471767902374, -36.402822451222875],
       [-185.34240245819092, -36.402822451222875],
       [-185.34240245819092, -36.40698885244839],
       [-185.3471767902374, -36.40698885244839]]],
     'type': 'Polygon'},
    'bbox_area': 1.49992432378257,
    'admin_level': None,
    'place_type': None,
    'name': 'Mahurangi College',
    'location': None,
    'id': 119755,
    'display_name': 'Mahurangi College, AU, NZ',
    'slug': 'mahurangi-college',
    'geometry_geojson': {'coordinates': [[[-185.3471767902374,
        -36.406708221041235],
       [-185.34256875514984, -36.40562454255458],
       [-185.34289062023163, -36.402822451222875],
       [-185.3445053100586, -36.404726502714844],
       [-185.3471767902374, -36.406708221041235]

Acquiring a matching place_id takes two steps: 1. an autocomplete search on the park name; 2. a nearby search within a bounding box.

### 1. Use autocomplete starting from info_df['name']
Results can be selected by comparing with [lat, lng]

In [149]:
response = find_place_by_autocomplete[0]
print({p['id']: p['name'] for p in  response['results']})

{133786: 'Quiet Waters Park, Broward', 60598: 'Quiet Waters Park, Annapolis, MD'}


In [None]:
# Query the autocomplete endpoint once per park name, in `info_df` row order.
#
# The cell is resumable: responses already saved on disk are reused and querying
# restarts at the first park that has no response yet. This replaces the
# hand-edited `[3264:]` offset and the commented-out list initialisation, which
# made the cell fail with a NameError on a fresh kernel.
AUTOCOMPLETE_CACHE = 'find_place_by_autocomplete.dill'

try:
    with open(AUTOCOMPLETE_CACHE, 'rb') as f:
        # NOTE: dill, like pickle, can execute code while loading;
        # only load a cache file that this notebook wrote itself.
        find_place_by_autocomplete = dill.load(f)
except FileNotFoundError:
    find_place_by_autocomplete = []

try:
    for name in info_df['name'][len(find_place_by_autocomplete):]:
        find_place_by_autocomplete.append(get_places_autocomplete(q=name))
        sleep(.2)  # throttle the requests
finally:
    # Save even when a request fails midway, so the finished queries are not lost.
    with open(AUTOCOMPLETE_CACHE, 'wb') as f:
        dill.dump(find_place_by_autocomplete, f)
    

In [170]:
# with open('find_place_by_autocomplete.dill', 'rb') as f:
#     find_place_by_autocomplete = dill.load(f)

In [219]:
# (row index, number of results) for every autocomplete query that returned at least one place
ind_with_results = [
    (idx, response['total_results'])
    for idx, response in enumerate(find_place_by_autocomplete)
    if response['total_results']
]


In [421]:
def lat_lng_dist(loc1, loc2):
    """Return the distance in miles between two ``(lat, lng)`` points.

    The distance is computed with ``geopy.distance.distance``.

    Either point may be ``None`` (the iNaturalist API returns
    ``'location': None`` for some places). In that case ``float('inf')`` is
    returned, so the place sorts last in a nearest-place search and fails every
    ``distance < threshold`` test instead of raising inside geopy.
    """
    if loc1 is None or loc2 is None:
        return float('inf')
    return geopy.distance.distance(loc1, loc2).miles

In [224]:
# For every park whose autocomplete query returned something, keep the result
# that lies closest to the park's own coordinates.
match_closest = []
for ind, _ in ind_with_results:
    ac_results = find_place_by_autocomplete[ind]['results']
    all_loc = [res['location'] for res in ac_results]

    target_loc = np.array(info_df.loc[ind, ['lat', 'lng']])

    all_dist = [lat_lng_dist(target_loc, loc) for loc in all_loc]
    nearest_dist = min(all_dist)
    match_closest.append((
        ind,
        nearest_dist,
        ac_results[all_dist.index(nearest_dist)]['name'],
        info_df.loc[ind, 'name'],
    ))


In [729]:
len(ind_with_results)

1064

In [233]:
# plt.hist([item[1] for item in match_closest], 100); 
len([item[1] for item in match_closest if item[1]<1])

583

In [958]:
list(info_df.loc[1823])

['ChIJj_bEvjsJw4kRhJLh-E_oLyU',
 'Island Beach',
 40.88993579999999,
 -74.44297949999999,
 '1 Bloomfield Ave, Mountain Lakes, NJ 07046, USA',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 162895.0,
 40.5727977648,
 -73.9781726582,
 'Coney Island Beach and Boardwalk, Brooklyn, NY, USA',
 32.77408672199006]

In [724]:
# missing_nearby_with_named_results = set(missing_nearby) & {x[0] for x in ind_with_results}

In [959]:
# Count how far the nearest autocomplete result lies from each park, and print
# the matches that are 30-50 miles away so they can be reviewed by hand.
cnt_very_near = 0
cnt_near = 0

for ind, _ in ind_with_results:
    ac_results = find_place_by_autocomplete[ind]['results']
    park_names = [res['name'] for res in ac_results]
    all_loc = [res['location'] for res in ac_results]
    target_loc = np.array(info_df.loc[ind, ['lat', 'lng']])

    all_dist = [(i, lat_lng_dist(target_loc, loc)) for i, loc in enumerate(all_loc)]
    min_dist_index, min_dist = min(all_dist, key=lambda x: x[1])

    if min_dist < 1:
        cnt_very_near += 1
    elif min_dist <= 10:
        # This branch used to test `> 1 and <= 10`, which left a distance of
        # exactly 1 mile uncounted.
        cnt_near += 1
    elif 30 < min_dist <= 50:
        print(f"Found for {ind} - {info_df.loc[ind, 'name']} - {park_names[min_dist_index]} - {min_dist}")

print(cnt_very_near, cnt_near)

# Row indices of `info_df` that are excluded when the autocomplete matches are recorded.
# NOTE(review): presumably chosen by inspecting the candidates printed above - confirm.
BLACKLIST_AC_IND = {211, 807, 827, 840, 960, 1055, 1304, 1336, 1419, 1424, 1651, 1813, 1819, 1823, 1844, 1866, 1878,
                    1921, 1977, 2097, 2227, 2374, 2390, 2417, 2607, 2826, 3172, 3180, 3428, 3521, 3714, 3779, 3884,
                    4066, 4098, 4141, 4154, 4184, 4292, 4319, 4367, 4392, 4414, 4512, 4522}

Found for 16 - Rock Creek Park - Rock Creek Stream Valley Park Unit 2 - 31.511895013317794
Found for 58 - Nottingham Park - Nottingham County Park - 34.71241003989495
Found for 211 - Nature Trail - Concord Elementary School Nature Trail (Closed During School Hours) - 45.64167356992564
Found for 419 - Lewis Park - Samuel S Lewis State Park - 47.91282617128921
Found for 807 - Veterans Park - Lawrence Veterans Park - 38.70896754768442
Found for 827 - Veterans Park - Lawrence Veterans Park - 35.251427918941445
Found for 840 - Veterans Park - Lawrence Veterans Park - 36.36994685422997
Found for 1055 - Drexel Park - Drexel Woods Park - Lawrence Nature Center - 31.114424879624966
Found for 1424 - Bunker Hill Trails - Bunker-Hill-Road-Trails - 44.20987505744834
Found for 1651 - Township Park - Inman Park Franklin Township - 31.505635121572237
Found for 1813 - Green acres park - Green Acres Park Burlington Township, NJ - 43.07464124565231
Found for 1819 - Central Park - Central Park and Vicinit

In [961]:
# Each park can be matched in two ways (ac = autocomplete, nb = nearby), so the
# outcome of each method is stored in its own set of columns; a later step
# compares the two and keeps the correct one.

for ind, _ in ind_with_results:
    target_loc = np.array(info_df.loc[ind, ['lat', 'lng']])

    # pull name / id / location out of every search result
    ac_results = find_place_by_autocomplete[ind]['results']
    park_names = [res['name'] for res in ac_results]
    park_place_id = [res['id'] for res in ac_results]
    all_loc = [res['location'] for res in ac_results]

    # position and distance of the result closest to the park
    all_dist = [(i, lat_lng_dist(target_loc, loc)) for i, loc in enumerate(all_loc)]
    min_dist_index, min_dist = min(all_dist, key=lambda pair: pair[1])

    # The threshold is this generous because state parks / forests can be off
    # by a lot; matches that are extremely far away are filtered out later.
    if ind in BLACKLIST_AC_IND or not min_dist < 50:
        continue

    # record the closest search result
    closest_match = {
        'inat_ac_place_id': park_place_id[min_dist_index],
        'inat_ac_location_lat': all_loc[min_dist_index][0],
        'inat_ac_location_lng': all_loc[min_dist_index][1],
        'inat_ac_name': park_names[min_dist_index],
        'inat_ac_distance': min_dist,
    }
    for column, value in closest_match.items():
        info_df.loc[ind, column] = value


### 2. Use nearby query with manual bounding box created based on [lat,lng] or Google geometry.viewport
This is useful when the iNaturalist place is not recorded under a name similar to the park's own, but lies close enough to be considered relevant. Note that large city-, county- or region-level places (including USGS quads) were excluded because they are not specific enough.

In [307]:
# create a table with info about Google geometry.viewport (which is largely the same size box...)
# other raw data files: 'nyc_popularity_20210317_133549.db'; '../pine_grove_cape_may_r2k_popularity_20210316_144723.db'
def get_ne_sw(input_db):
    """Read the Google viewport corners of the places in a popularity database.

    Parameters
    ----------
    input_db : str
        Path of a SQLite file containing a ``popularity`` table with the
        columns ``id``, ``details`` (JSON text) and ``has_popular_times``.

    Returns
    -------
    pandas.DataFrame
        One row per record with ``has_popular_times > 0`` and the columns
        ``id``, ``northeast`` and ``southwest``; the last two hold
        ``(lat, lng)`` tuples.
    """
    con = sqlite3.connect(input_db)
    try:
        pop_df_detail = pd.read_sql_query(
            "SELECT id, details FROM popularity WHERE has_popular_times > 0", con)
    finally:
        # release the database handle even when the query fails
        con.close()

    # Build both columns from plain lists so that the function also works when
    # the query returns no rows.
    corners = [tuple(get_bbox(detail)) for detail in pop_df_detail['details']]
    ne_sw = pop_df_detail.drop(columns=['details'])
    ne_sw['northeast'] = [ne for ne, _ in corners]
    ne_sw['southwest'] = [sw for _, sw in corners]
    return ne_sw

def get_bbox(detail):
    """Extract the viewport corners from one place-details JSON string.

    Returns a two-element ``pandas.Series``: the north-east corner followed by
    the south-west corner, each as a ``(lat, lng)`` tuple. The coordinates are
    looked up by key rather than by position, so the result does not depend on
    the key order inside the JSON document.
    """
    bbox = json.loads(detail)['geometry']['viewport']
    return pd.Series([(corner['lat'], corner['lng'])
                      for corner in (bbox['northeast'], bbox['southwest'])])


In [423]:
ind = 750
bbox = json.loads(pop_df_detail['details'][ind])['geometry']['viewport']

# list(bbox['northeast'].values())
lat_lng_dist(list(bbox['northeast'].values()), list(bbox['southwest'].values()))

In [357]:
# One frame of viewport corners per popularity database.
df_for_concat = [
    get_ne_sw(files)
    for files in ['../pine_grove_cape_may_r2k_popularity_20210316_144723.db',
                  '../nyc_popularity_20210317_133549.db']
]
    

In [358]:
def get_bbox_state_park(detail):
    """Extract the viewport corners of the first candidate in a JSON response.

    ``detail`` is a JSON string with a ``candidates`` list; only
    ``candidates[0]`` is read. Returns a two-element ``pandas.Series``: the
    north-east corner followed by the south-west corner, each as a
    ``(lat, lng)`` tuple. The coordinates are looked up by key rather than by
    position, so the result does not depend on the key order inside the JSON
    document.
    """
    bbox = json.loads(detail)['candidates'][0]['geometry']['viewport']
    return pd.Series([(corner['lat'], corner['lng'])
                      for corner in (bbox['northeast'], bbox['southwest'])])
#'state_forest_PA_google_place_id.db' & '../state_park_PA_google_place_id.db'

# Append the viewport corners of the state parks and state forests to `df_for_concat`.
for file in ['../state_park_PA_google_place_id.db', '../state_forest_PA_google_place_id.db']:
    state_park_con = sqlite3.connect(file)
    try:
        # Look up which of the two expected tables this file contains. The
        # previous bare `except:` fallback also hid every unrelated failure
        # (missing file, malformed database, ...).
        table_names = {row[0] for row in state_park_con.execute(
            "SELECT name FROM sqlite_master WHERE type = 'table'")}
        table = 'state_park' if 'state_park' in table_names else 'state_forest'
        state_park = pd.read_sql_query(f"SELECT id, data FROM {table}", state_park_con)
    finally:
        # close the handle even when reading fails
        state_park_con.close()

    state_park[['northeast', 'southwest']] = state_park['data'].apply(get_bbox_state_park)
    state_park = state_park.drop(columns=['data'])
    df_for_concat.append(state_park)
    

In [387]:
# Stack the per-source corner tables and keep a single row per place id.
info_df_with_bbox = (
    pd.concat(df_for_concat, axis=0)
    .drop_duplicates(subset=['id'])
)

In [749]:
# merge the coords back to info_df
# Corner columns left over from an earlier run of this cell are dropped first;
# otherwise a second run would produce `northeast_x` / `northeast_y` duplicates.
# `validate` asserts that each id matches at most one row of corners, so the
# merge cannot silently add rows to info_df.
info_df = (
    info_df
    .drop(columns=['northeast', 'southwest'], errors='ignore')
    .merge(info_df_with_bbox, how='left', on='id', validate='many_to_one')
)
info_df.shape

(4614, 12)

In [396]:
from tqdm import tqdm

In [398]:
# Ask the iNaturalist nearby endpoint which places overlap each park's viewport.
# Every entry is stored as (Google place id, raw API response).
find_place_by_nearby = []

for i in tqdm(range(info_df.shape[0])):
    ne_corner, sw_corner = info_df.loc[i, ['northeast', 'southwest']]
    nearby_response = get_places_nearby(*ne_corner, *sw_corner)
    find_place_by_nearby.append((info_df.loc[i, 'id'], nearby_response))
    sleep(1)


100%|██████████| 4614/4614 [2:00:03<00:00,  1.56s/it]  


In [399]:
with open('find_place_by_nearby.dill', 'wb') as f:
    dill.dump(find_place_by_nearby, f)
    
# with open('find_place_by_nearby.dill', 'rb') as f:
#     find_place_by_nearby = dill.load(f)

##### Matching based on names and distance

In [426]:
# 'standard' only contains county-level places
find_place_by_nearby[0][1]['results']['community'][-1].keys()

dict_keys(['ancestor_place_ids', 'bounding_box_geojson', 'bbox_area', 'admin_level', 'place_type', 'name', 'location', 'id', 'display_name', 'uuid', 'slug', 'geometry_geojson'])

In [477]:
ind = 10
key = 'community'
print(info_df.loc[ind, 'name'])
print([result['name'] for result in find_place_by_nearby[ind][1]['results'][key]])

# distance
target_loc = info_df.loc[ind, ['lat', 'lng']]
dist_all_nearby = [lat_lng_dist(result['location'], target_loc) for result in find_place_by_nearby[ind][1]['results'][key]]
print(dist_all_nearby)

min(dist_all_nearby)


Jonas and Anne Catharine Green Park
['MarineGEO Chesapeake Bay', 'USGS Quad: Annapolis']
[9.192217474139998, 4.708290435334753]


4.708290435334753

In [501]:
a = 'village park'
b = 'test village park'
len(a)<len(b)

True

In [591]:
# decide whether two names are similar 
import re

def tokenize(string):
    """Split ``string`` into lower-cased word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscore); every other character acts as a separator. Returns a list,
    which is empty when ``string`` contains no word characters.
    """
    # A raw string is required here: in a normal string literal the backslash
    # sequence used by the previous pattern is an invalid escape.
    return [t.lower() for t in re.findall(r'\w+', string)]

def similar_or_not(a, b):
    """Return True when the tokens of the shorter name all occur in the longer name.

    "Shorter" is decided on the token lists (duplicates included); the
    containment test is done on the token sets. On equal length, ``a`` is
    treated as the shorter one.
    """
    tokens_a, tokens_b = tokenize(a), tokenize(b)
    if len(tokens_a) <= len(tokens_b):
        shorter, longer = tokens_a, tokens_b
    else:
        shorter, longer = tokens_b, tokens_a
    return set(shorter) <= set(longer)
    
def fuzzy_similar_or_not(a, b):
    """Return True when the two names share at least two distinct tokens."""
    shared_tokens = set(tokenize(a)) & set(tokenize(b))
    return len(shared_tokens) >= 2


In [549]:
info_df.head()

Unnamed: 0,id,name,lat,lng,address,northeast,southwest,inaturalist_place_id,inaturalist_name,inaturalist_distance
0,ChIJAWkAqNL1t4kRlm4slspOSXo,Quiet Waters Park,38.93767,-76.500899,"600 Quiet Waters Park Rd, Annapolis, MD 21403,...","(38.9390092302915, -76.4995525197085)","(38.9363112697085, -76.50225048029151)",60598.0,"Quiet Waters Park, Annapolis, MD",0.136006
1,ChIJzcow6Xb1t4kRQVE7s1AWWr8,Hillsmere Shores Community Beach,38.927212,-76.49415,"101 W Bay View Dr, Annapolis, MD 21403, USA","(38.9285315302915, -76.49278681970848)","(38.92583356970851, -76.49548478029149)",,,
2,ChIJ_bk5K1z1t4kRQNverIUOVko,Quiet Waters Dog Beach,38.930202,-76.508341,"1701-1799 Quiet Waters Park Rd, Annapolis, MD ...","(38.93155118029149, -76.5069917197085)","(38.92885321970849, -76.5096896802915)",60598.0,"Quiet Waters Park, Annapolis, MD",0.704592
3,ChIJ_-2cQEX1t4kRoiCrmNMbrQ8,Quiet Waters Dog Park,38.931865,-76.505658,"600 Quiet Waters Park Rd, Annapolis, MD 21403,...","(38.93345803029149, -76.50433266970849)","(38.9307600697085, -76.5070306302915)",60598.0,"Quiet Waters Park, Annapolis, MD",0.541753
4,ChIJQWcsgIn2t4kRCL0Ub363FyI,Pip Moyer Recreation Center (Annapolis Recreat...,38.963271,-76.50515,"273 Hilltop Ln, Annapolis, MD 21403, USA","(38.9647144802915, -76.50395901970847)","(38.9620165197085, -76.5066569802915)",52243.0,Eastport Neighborhood,0.880709


In [686]:
debug_index(4)

Pip Moyer Recreation Center (Annapolis Recreation and Parks)
[(('MarineGEO Chesapeake Bay', 100), 7.4028805546384495), (('USGS Quad: South River', 17), 3.564147031826793), (('Eastport Neighborhood', None), 0.880708658508032)]


In [764]:
is_city_district("New York City")

True

In [877]:
# iNaturalist `place_type` codes whose places are ignored by heuristic_match.
# NOTE(review): only the codes annotated below are explained in this notebook;
# the meaning of the others should be confirmed against the iNaturalist docs.
BLACKLISTED_PLACE_TYPES = {7, 8, 9, 12,
                           1000, # Township
                           17, # USGS quad
                           21, 25}
# too large of an area
# Place names that heuristic_match rejects even when they were selected.
# (The misspelling "BLACKLISTEC" is kept because heuristic_match refers to this exact name.)
BLACKLISTEC_PLACE_NAME = {'Philadelphia Parks North', 'Philadelphia Parks South', 
                          'Bronx River Watershed and some surrounding areas', 'Staten Island', 
                          'Town of North Hempstead'}

# Generic tokens: check_token_ovelap_threshold does not accept two names as
# overlapping when their only shared token is one of these.
STOP_WORDS = {'park', 'field', 'fields', 'playground', 'meadows', 'parks', 'new', 'district'}

def check_token_ovelap_threshold(t1, t2):
    """Decide whether two names share enough tokens to count as a name match.

    Returns False when they share no token, True when they share two or more,
    and, when they share exactly one, True only if that token is not in
    ``STOP_WORDS``.
    """
    shared = set(tokenize(t1)).intersection(tokenize(t2))
    if not shared:
        return False
    if len(shared) >= 2:
        return True
    # exactly one shared token: it must be a meaningful word
    return next(iter(shared)) not in STOP_WORDS

# Places that stand for New York City as a whole ("New York City",
# "New York City (2018)", ...) or for one of its council districts should not be
# matched to a park. Their place_type is None, so they cannot be removed
# through BLACKLISTED_PLACE_TYPES.
CITY_REGEX = re.compile(r'New York City( \(.*\))?')
def is_city_district(name):
    """Return True when ``name`` denotes New York City itself or a council district.

    Matches any name containing "New York City Council District", and names
    that consist of "New York City" optionally followed by a parenthesised
    qualifier (surrounding whitespace is ignored).
    """
    if 'New York City Council District' in name:
        return True
    # fullmatch, not search: with search the optional group made the pattern
    # match every name that merely contains "New York City", so unrelated
    # places with that prefix were rejected as well.
    return CITY_REGEX.fullmatch(name.strip()) is not None


# Some matching rules
# 1. check if nearby place_id has similar name as the target, pick the most similar one 
#    (also included a few heuristics to make sure that the selected place_id not only match one word such as "park"); 
# 2. if no name match found, check whether any meaningful place_id (e.g not in BLACKLISTED_PLACE_TYPES) located within 1 mile

def heuristic_match(key='community'):
    """Attach the best nearby iNaturalist place to every row of ``info_df``.

    For each park the candidates are taken from
    ``find_place_by_nearby[ind][1]['results'][key]``. Candidates whose
    ``place_type`` is in ``BLACKLISTED_PLACE_TYPES``, or that carry no
    ``location``, are ignored. Then:

    1. the candidate with the highest Jaccard name similarity is chosen if it
       passes ``check_token_ovelap_threshold``;
    2. otherwise the nearest candidate is chosen, provided it is at most
       1 mile away and ``is_city_district`` is False for its name.

    A chosen candidate is still rejected when its name is in
    ``BLACKLISTEC_PLACE_NAME`` or its ``bbox_area`` exceeds 0.1.

    The result is written in place to the ``inat_nb_*`` columns of the global
    ``info_df``; nothing is returned.

    Parameters
    ----------
    key : str
        Name of the result list to use from each nearby response
        (the responses contain 'standard' and 'community').
    """
    for ind in range(len(info_df)):
        # Exclude entries with an invalid place type, and entries without
        # coordinates (the API can return 'location': None), for which no
        # distance can be computed.
        nearby_parks = [result for result in find_place_by_nearby[ind][1]['results'][key]
                        if result.get('place_type') not in BLACKLISTED_PLACE_TYPES
                        and result.get('location') is not None]
        if not nearby_parks:
            continue

        nearby_park_names = [r['name'] for r in nearby_parks]
        target_loc = info_df.loc[ind, ['lat', 'lng']]
        google_title = info_df.loc[ind, 'name']
        dist_all_nearby = [lat_lng_dist(r['location'], target_loc) for r in nearby_parks]
        candidate_idx = range(len(nearby_parks))

        # rule 1: the candidate whose title is most similar (first one wins on ties)
        similarities = [jaccard_similarity(google_title, t) for t in nearby_park_names]
        best_idx = max(candidate_idx, key=similarities.__getitem__)
        js = similarities[best_idx]

        if js <= 0 or not check_token_ovelap_threshold(google_title, nearby_park_names[best_idx]):
            # no meaningful token overlap, rule 2: fall back to the nearest candidate
            best_idx = min(candidate_idx, key=dist_all_nearby.__getitem__)
            if dist_all_nearby[best_idx] > 1 or is_city_district(nearby_park_names[best_idx]):
                continue

        # exclude a few more that are extremely large or in the name blacklist
        if (nearby_park_names[best_idx] in BLACKLISTEC_PLACE_NAME
                or nearby_parks[best_idx]['bbox_area'] > .1):
            continue

        # integrate inaturalist place_id info to info_df
        match = nearby_parks[best_idx]
        info_df.loc[ind, 'inat_nb_place_id'] = match['id']
        info_df.loc[ind, 'inat_nb_location_lat'] = match['location'][0]
        info_df.loc[ind, 'inat_nb_location_lng'] = match['location'][1]
        info_df.loc[ind, 'inat_nb_name'] = nearby_park_names[best_idx]
        info_df.loc[ind, 'inat_nb_distance'] = dist_all_nearby[best_idx]
        info_df.loc[ind, 'inat_nb_bbox_area'] = match['bbox_area']
                

heuristic_match(key='community')

In [855]:
pd.options.display.max_rows = 10
info_df
# info_df[(info_df['inat_nb_distance']>.5) & (info_df['inat_nb_distance']<1)]

Unnamed: 0,id,name,lat,lng,address,inat_ac_place_id,inat_ac_location_lat,inat_ac_location_lng,inat_ac_name,inat_ac_distance,northeast,southwest,inat_nb_bbox_area,inat_nb_place_id,inat_nb_location_lat,inat_nb_location_lng,inat_nb_name,inat_nb_distance
0,ChIJAWkAqNL1t4kRlm4slspOSXo,Quiet Waters Park,38.937670,-76.500899,"600 Quiet Waters Park Rd, Annapolis, MD 21403,...",60598.0,38.939282,-76.502352,"Quiet Waters Park, Annapolis, MD",0.136006,"(38.9390092302915, -76.4995525197085)","(38.9363112697085, -76.50225048029151)",0.000446,60598.0,38.939282,-76.502352,"Quiet Waters Park, Annapolis, MD",0.136006
1,ChIJzcow6Xb1t4kRQVE7s1AWWr8,Hillsmere Shores Community Beach,38.927212,-76.494150,"101 W Bay View Dr, Annapolis, MD 21403, USA",,,,,,"(38.9285315302915, -76.49278681970848)","(38.92583356970851, -76.49548478029149)",,,,,,
2,ChIJ_bk5K1z1t4kRQNverIUOVko,Quiet Waters Dog Beach,38.930202,-76.508341,"1701-1799 Quiet Waters Park Rd, Annapolis, MD ...",,,,,,"(38.93155118029149, -76.5069917197085)","(38.92885321970849, -76.5096896802915)",0.000446,60598.0,38.939282,-76.502352,"Quiet Waters Park, Annapolis, MD",0.704592
3,ChIJ_-2cQEX1t4kRoiCrmNMbrQ8,Quiet Waters Dog Park,38.931865,-76.505658,"600 Quiet Waters Park Rd, Annapolis, MD 21403,...",,,,,,"(38.93345803029149, -76.50433266970849)","(38.9307600697085, -76.5070306302915)",0.000446,60598.0,38.939282,-76.502352,"Quiet Waters Park, Annapolis, MD",0.541753
4,ChIJQWcsgIn2t4kRCL0Ub363FyI,Pip Moyer Recreation Center (Annapolis Recreat...,38.963271,-76.505150,"273 Hilltop Ln, Annapolis, MD 21403, USA",,,,,,"(38.9647144802915, -76.50395901970847)","(38.9620165197085, -76.5066569802915)",0.000887,52243.0,38.965180,-76.488980,Eastport Neighborhood,0.880709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4609,ChIJ45Y73YD0wokR6x0q6cvzpEI,Ciccarone Park,40.856039,-73.886631,"2426 Hughes Ave, The Bronx, NY 10458, USA",,,,,,"(40.8573071802915, -73.8851153697085)","(40.8546092197085, -73.88781333029151)",,,,,,
4610,ChIJueLtN4DzwokRJWO2gJPSH2I,Rose Hill Park,40.862283,-73.890278,"4270, 2659 Webster Ave, The Bronx, NY 10458, USA",,,,,,"(40.8633695302915, -73.8893493697085)","(40.8606715697085, -73.8920473302915)",0.000086,165295.0,40.861591,-73.885108,"Fordham University - Bronx, NY",0.275027
4611,ChIJfTP2N_H0wokRSK-jDqj16IU,Starlight Park,40.833573,-73.882485,"1490 Sheridan Blvd., The Bronx, NY 10459, USA",125178.0,40.833838,-73.881897,Starlight Park,0.035808,"(40.8420985, -73.87541424999999)","(40.82883449999999, -73.88727485000003)",,,,,,
4612,ChIJfb4rltn1wokRwtzhQ6GDa2w,Bronx River Greenway,40.833220,-73.882092,"1490 Sheridan Blvd., The Bronx, NY 10459, USA",,,,,,"(40.8348330802915, -73.8812730197085)","(40.8321351197085, -73.88397098029151)",0.068787,,,,,


In [665]:
key = 'community'
def jaccard_similarity(a, b):
    """Return the Jaccard similarity of the token sets of two names.

    The value is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    tokens produced by ``tokenize``. When neither name contains a token the
    union is empty; 0.0 is returned in that case instead of dividing by zero.
    """
    sa = set(tokenize(a))
    sb = set(tokenize(b))
    union = sa | sb
    if not union:
        return 0.0
    return len(sa & sb) / len(union)

def get_nearby_parks(ind):
    """Return ``(name, place_type)`` for every nearby result of park ``ind``.

    The result list is selected with the module-level ``key``.
    """
    nearby_results = find_place_by_nearby[ind][1]['results'][key]
    name_type_pairs = []
    for result in nearby_results:
        name_type_pairs.append((result['name'], result['place_type']))
    return name_type_pairs

def get_nearby_park_dist(ind):
    """Return the distance from park ``ind`` to each of its nearby results.

    Distances come from ``lat_lng_dist`` and are listed in the order of the
    results selected with the module-level ``key``.
    """
    park_loc = info_df.loc[ind, ['lat', 'lng']]
    nearby_results = find_place_by_nearby[ind][1]['results'][key]
    return [lat_lng_dist(result['location'], park_loc) for result in nearby_results]

def debug_index(ind):
    """Print the name of park ``ind`` and its nearby results with their distances."""
    print(info_df.loc[ind, 'name'])
    names_and_types = get_nearby_parks(ind)
    distances = get_nearby_park_dist(ind)
    print(list(zip(names_and_types, distances)))

In [630]:
# Best Jaccard title similarity among the nearby results of every unmatched park
# (0.0 when a park has no nearby result at all).
# NOTE(review): 'inaturalist_name' is a column of an earlier table layout; the
# matching cells in this notebook write 'inat_ac_name' / 'inat_nb_name' instead - confirm.
unmatched_indices = np.where(info_df['inaturalist_name'].isnull())[0]

jacc_sims = []
for i in unmatched_indices:
    g_title = info_df.loc[i, 'name']
    nat_titles = [result['name'] for result in find_place_by_nearby[i][1]['results'][key]]
    best_similarity = max((jaccard_similarity(g_title, t) for t in nat_titles), default=0.0)
    jacc_sims.append((i, best_similarity))


In [765]:
debug_index(4582)

New Roots Community Farm
[(('Hudson to Housatonic Region', 25), 35.52837558416262), (('NYC Bank Swallow Monitoring Zone', None), 8.831982816346347), (('New York City (2018)', None), 10.881509551486918), (('New York City', None), 9.523041035414987), (('New York City', None), 9.042626404487512), (('New York City (from NYC Parks) ', 1000), 8.871763279973585), (('New York City Council District 17 (Bronx)', None), 1.6247260306940925), (('New York City Council District 08 (Manhattan and Bronx)', None), 1.2412836749530858)]


In [736]:
sorted([(i, s) for i, s in jacc_sims if s > 0], key=lambda x: -x[1]);

In [625]:
[result['name'] for result in find_place_by_nearby[10][1]['results'][key]]

['MarineGEO Chesapeake Bay', 'USGS Quad: Annapolis']

In [610]:
ind = 92
print(info_df.loc[ind,'name'])
[result['name'] for result in find_place_by_nearby[ind][1]['results'][key]]

Gunpowder falls hiking


['USGS Quad: White Marsh']

In [757]:
missing_place_id = np.where((info_df['inat_nb_place_id'].isnull()) & (info_df['inat_ac_place_id'].isnull()))[0]
len(missing_place_id)

3003

### 3. Compare results based on autocomplete or nearby search, and select the appropriate place_id
Also exclude the ones with too large distances

In [966]:
# == among all parks, 344 have the same place_id under the two queries
# info_df[info_df['inat_ac_place_id'] == info_df['inat_nb_place_id']]
# == 52 have both but not identical
info_df[(~info_df['inat_ac_place_id'].isnull()) & (~info_df['inat_nb_place_id'].isnull()) & (info_df['inat_ac_place_id'] != info_df['inat_nb_place_id'])]
# == 181 have autocomplete but not nearby
# info_df[(~info_df['inat_ac_place_id'].isnull()) & (info_df['inat_nb_place_id'].isnull())]#['inat_ac_distance'].describe()
# == 528 have nearby only
# info_df[(info_df['inat_ac_place_id'].isnull()) & (~info_df['inat_nb_place_id'].isnull())]#['inat_nb_distance'].describe()


Unnamed: 0,id,name,lat,lng,address,inat_nb_place_id,inat_nb_location_lat,inat_nb_location_lng,inat_nb_name,inat_nb_distance,inat_nb_bbox_area,inat_ac_place_id,inat_ac_location_lat,inat_ac_location_lng,inat_ac_name,inat_ac_distance


In [965]:
# When the two methods found different places, keep the closer one
# and blank out the fields of the other.
inconsistent_ind = np.where((~info_df['inat_ac_place_id'].isnull()) & (~info_df['inat_nb_place_id'].isnull()) &
                            (info_df['inat_ac_place_id'] != info_df['inat_nb_place_id']))[0]

MATCH_FIELDS = ['name', 'place_id', 'distance', 'location_lat', 'location_lng']

for ind in inconsistent_ind:
    if info_df.loc[ind, 'inat_ac_distance'] < info_df.loc[ind, 'inat_nb_distance']:
        res_to_remove = 'inat_nb_'
    else:
        res_to_remove = 'inat_ac_'

    cols_to_clear = [res_to_remove + field for field in MATCH_FIELDS]
    if res_to_remove == 'inat_nb_':
        # The nearby match also stores a bbox area; clear it as well so that it
        # does not stay behind next to a discarded match.
        cols_to_clear.append('inat_nb_bbox_area')

    info_df.loc[ind, [col for col in cols_to_clear if col in info_df.columns]] = None


In [1003]:
# now form a new column to store the place_id
# Prefer the autocomplete match and fall back to the nearby match.
# This is vectorised instead of looping over the rows with `np.isnan`, which
# raises a TypeError on non-float values such as None; `pd.to_numeric` turns
# such values into NaN first.
info_df['inat_place_id'] = (
    pd.to_numeric(info_df['inat_ac_place_id'])
    .fillna(pd.to_numeric(info_df['inat_nb_place_id']))
)


In [1004]:
len(info_df['inat_place_id'].unique())

669

## Query unique number of species as a proxy for biodiversity
Other things to consider: 
1. Do we need to separate native from invasive species?
2. need to split species by time? 

Or, more generally, can we assume that a place with a larger number of unique species tends to be healthy and worth visiting?

In [111]:
# get_observations also takes parameters: day, month, year (must be observed within this day/month/year);
#                                         d1, d2 (after & before the date)
# get_observations['total_results'] returns total number observations
# get_observation_species_counts['total_results'] groups repeated observations

PLACE_ID = 125178#response['results'][0]['id']

total_observations = get_observations(
    place_id=PLACE_ID,
    verifiable=True,
    per_page=0,
)['total_results']
print(f'Total observations: {total_observations}')

total_taxa = get_observation_species_counts(
    place_id=PLACE_ID,
    verifiable=True,
    per_page=0, #not actually return species detail
)['total_results']
print(f'Total taxa observed: {total_taxa}')

Total observations: 103
Total taxa observed: 57


In [1019]:
# Or to include some species results
# response2 = get_places_autocomplete(q='John Heinz National Wildlife Refuge')#'John Heinz National Wildlife Refuge At Tinicum')

PLACE_ID = 60598#response2['results'][0]['id']
total_taxa = get_observation_species_counts(
    place_id=PLACE_ID,
    verifiable=True,
    per_page=25,
)

[(res['taxon']['iconic_taxon_name'], res['count']) for res in total_taxa['results']]


[('Aves', 23),
 ('Aves', 22),
 ('Aves', 19),
 ('Aves', 18),
 ('Aves', 16),
 ('Aves', 16),
 ('Aves', 14),
 ('Reptilia', 14),
 ('Aves', 12),
 ('Mammalia', 12),
 ('Aves', 11),
 ('Aves', 11),
 ('Aves', 10),
 ('Aves', 9),
 ('Aves', 9),
 ('Reptilia', 9),
 ('Insecta', 9),
 ('Aves', 9),
 ('Aves', 8),
 ('Aves', 8),
 ('Insecta', 8),
 ('Plantae', 8),
 ('Plantae', 8),
 ('Plantae', 8),
 ('Plantae', 7)]

In [1046]:
# Fetch the species counts of every distinct place id, keyed by the integer id.
inat_place_id_all = info_df['inat_place_id'].unique()
inat_taxa_place_id = {}

for place_id in tqdm(inat_place_id_all):
    # the NaN entry stands for the parks that have no place id
    if np.isnan(place_id):
        continue
    total_taxa = get_observation_species_counts(
        place_id=int(place_id),
        verifiable=True,
        per_page=100,
    )
    inat_taxa_place_id[int(place_id)] = total_taxa
    sleep(1)
        

In [1033]:
with open('inaturalist_species_count_info.dill', 'wb') as f:
    dill.dump(inat_taxa_place_id, f)

In [1043]:
# Look up, for every park, the `total_results` value of the species-count
# response stored for its place id; parks without a place id (NaN) get None.
info_df['species_count'] = info_df['inat_place_id'].apply(
    lambda x: inat_taxa_place_id[int(x)]['total_results'] if not np.isnan(x) else None)


In [1047]:
info_with_inaturalist = info_df.copy()

In [1048]:
info_with_inaturalist.to_parquet('park_info_with_inaturalist.parquet')
