# Pennsylvania state parks exploration with FourSquare Place API 

The state of Pennsylvania has a total of 121 state parks, as of 2016. The preservation of these parks is critical to local community wellbeing however challenging given their diversity. 

The short project here intended to first cluster in an unsupervised way, then to fit for parks' popularity to identify main factors impacting park quality (which may be used for recommendation as well as recognizing potential issues may be improved for the park).
Things learnt from the PA parks may be generalized to other states and the framework can be expanded by incorporating information from other place APIs. 

In [639]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

import pandas as pd
import requests
import re

from geopy.geocoders import Nominatim 
import folium

from sklearn.cluster import KMeans

In [351]:
import pickle

def to_pickle(obj, filename):
    with open(filename, "wb+") as f:
        pickle.dump(obj, f)
        
def from_pickle(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)

In [614]:
# df_park.to_pickle('df_park.pkl')  
# df_park = pd.read_pickle('df_park.pkl')

## Part I. Extract features and their popularities of PA state parks

Features include basic information from wikipedia and nearby venues from FourSquare Place API.

### Wikipedia features

In [2]:
# load the list of PA state parks (https://en.wikipedia.org/wiki/List_of_Pennsylvania_state_parks)
df_park = pd.read_csv('state_park_PA.csv')
df_park.shape

(121, 7)

In [599]:
# wikipedia feature: park area (acres)
def find_acres(acres_entry):
    return float(re.search('\d{1,3}(,\d{3})*(\.\d+)?', acres_entry)[0].replace(',',''))

df_park['Area in acres (ha)'] = df_park['Area in acres (ha)'].apply(find_acres)

In [611]:
# wikipedia feature: if with stream or lake in park
def if_water(water_entry):
    if water_entry == 'None':
        return False
    else:
        return True

df_park['if water'] = df_park['Stream(s) and / or lake(s)'].apply(if_water)    

### FourSquare Place API features

In [541]:
# get fourSquare config ids
fsqr_config = pd.read_csv('foursquare_config.txt', header=None)
CLIENT_ID = fsqr_config.iloc[1][0]
CLIENT_SECRET = fsqr_config.iloc[3][0]
VERSION = '20200905' 
LIMIT = 500

In [5]:
# convert an address into latitude and longitude values prepared for FourSquare API calls
geolocator = Nominatim(user_agent="pa_explorer")

In [4]:
# (continued..)
latitude = np.empty(df_park.shape[0])
longitude = np.empty(df_park.shape[0])
latitude[:] = np.nan
longitude[:] = np.nan

for ind in range(df_park.shape[0]):
    address = '{}, PA'.format(df_park['Park name'][ind])

    location = geolocator.geocode(address)
    if location:
        latitude[ind] = location.latitude
        longitude[ind] = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude[ind], longitude[ind]))
    
np.sum(np.isnan(longitude))

The geograpical coordinate of Allegheny Islands State Park, PA are 40.5371096, -79.82333755703746.
The geograpical coordinate of Archbald Pothole State Park, PA are 41.50989920000001, -75.56838568277055.
The geograpical coordinate of Bald Eagle State Park, PA are 41.0417311, -77.6030495.
The geograpical coordinate of Beltzville State Park, PA are 40.8684893, -75.60191225614028.
The geograpical coordinate of Bendigo State Park, PA are 41.530494399999995, -78.6278593808558.
The geograpical coordinate of Benjamin Rush State Park, PA are 40.11494625, -74.97665930932364.
The geograpical coordinate of Big Pocono State Park, PA are 41.0424311, -75.35193912576602.
The geograpical coordinate of Big Spring State Forest Picnic Area, PA are nan, nan.
The geograpical coordinate of Black Moshannon State Park, PA are 40.90129, -78.06456571768157.
The geograpical coordinate of Blue Knob State Park, PA are 40.2786873, -78.5811281.
The geograpical coordinate of Boyd Big Tree Preserve Conservation Area, 

The geograpical coordinate of Pymatuning State Park, PA are 41.59256695, -80.50988858272981.
The geograpical coordinate of R. B. Winter State Park, PA are 40.9941725, -77.1893037.
The geograpical coordinate of Raccoon Creek State Park, PA are 40.51761045000001, -80.45803128032529.
The geograpical coordinate of Ralph Stover State Park, PA are 40.434521700000005, -75.09927468131067.
The geograpical coordinate of Ravensburg State Park, PA are 41.1100727, -77.243025.
The geograpical coordinate of Reeds Gap State Park, PA are 40.7217364, -77.4752722.
The geograpical coordinate of Ricketts Glen State Park, PA are 41.33284675, -76.27929838443805.
The geograpical coordinate of Ridley Creek State Park, PA are 39.9559436, -75.4496427.
The geograpical coordinate of Ryerson Station State Park, PA are 39.88515845, -80.44362884537824.
The geograpical coordinate of S. B. Elliott State Park, PA are 41.1130063, -78.5258953.
The geograpical coordinate of Salt Springs State Park, PA are 41.91037285, -75.

4

In [7]:
# add latitude and longitude to the data frame
df_park['latitude'] = latitude
df_park['longitude'] = longitude

In [28]:
# add missing park latitude and longitude to the dataframe

# Big Spring State Forest Picnic Area; [40.262894, -77.658821]
df_park.loc[df_park['Park name']=='Big Spring State Forest Picnic Area','latitude'] = 40.262894
df_park.loc[df_park['Park name']=='Big Spring State Forest Picnic Area','longitude'] = -77.658821

# Cowans Gap State Park; [39.990880, -77.927953]
df_park.loc[df_park['Park name']=='Cowans Gap State Park','latitude'] = 39.990880
df_park.loc[df_park['Park name']=='Cowans Gap State Park','longitude'] = -77.927953

# French Creek State Park, PA; [40.214996, -75.789494]
df_park.loc[df_park['Park name']=='French Creek State Park','latitude'] = 40.214996
df_park.loc[df_park['Park name']=='French Creek State Park','longitude'] = -75.789494

# Kings Gap Environmental Education and Training Center [40.093944, -77.267937]
df_park.loc[df_park['Park name']=='Kings Gap Environmental Education and Training Center','latitude'] = 40.093944
df_park.loc[df_park['Park name']=='Kings Gap Environmental Education and Training Center','longitude'] = -77.267937


#### Show PA state parks on map

In [640]:
# Visualize neighborhoods on a map with the coordinates
address = 'Centre County, PA'
location = geolocator.geocode(address)
latitude_pa = location.latitude
longitude_pa = location.longitude

In [7]:
# create map of Toronto using latitude and longitude values
map_pa = folium.Map(location=[latitude_pa, longitude_pa], zoom_start=7.5)

# add markers to map
for lat, lng, park, remarks in zip(df_park['latitude'], df_park['longitude'], df_park['Park name'], df_park['Remarks']):
    if ~np.isnan(lat):
        # clean up remarks - remove reference index on wiki
        remarks = re.sub(r'\[[^()]*\]', '', remarks)
        label = "{}: {}".format(park, remarks)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng], radius=5, popup=label, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7, 
            parse_html=False).add_to(map_pa)  
    else:
        print('Missing: {}.'.format(park))
    
map_pa

#### Nearby venues for each state park

In [472]:
# test API for exploration: 
# `section`: topPicks (a mix of recommendations generated without a query from the user). 
#            OR food, drinks, coffee, shops, arts, outdoors, sights, trending, 
#               nextVenues (venues frequently visited after a given venue), 
# lat = float(df_park.loc[df_park['Park name']=='Cherry Springs State Park', 'latitude'])
# lng = float(df_park.loc[df_park['Park name']=='Cherry Springs State Park', 'longitude'])


def fsq_explore(parkId):
    lat = df_park.loc[parkId,'latitude']
    lng = df_park.loc[parkId,'longitude']
    radius = 10000 # meters, max 100,000 meters; if 25000 m (about half hour drive at 30 mile/h)
    LIMIT = 100
    SECTION = ''
    # create the API request URL for venue recommendation
    # "ll" can be substitude by the option "near" with park names directly [automatically resolve geolocate, maybe less reliable]
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&section={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT, SECTION)

    # make the GET request
    results = requests.get(url).json()#["response"]['groups'][0]['items']
    return results

parkId = 2
results = fsq_explore(parkId)

In [473]:
print('total venues: ' + str(len(results['response']['groups'][0]['items'])))
for i in range(len(results['response']['groups'][0]['items'])):
    print(results['response']['groups'][0]['items'][i]['venue']['name'] + ': ' + 
         results['response']['groups'][0]['items'][i]['venue']['categories'][0]['name'] + 
         str(results['response']['groups'][0]['items'][i]['venue']['location']['distance']))

total venues: 18
Bald Eagle State Park: State / Provincial Park4048
Hublersburg Inn: American Restaurant8964
Nature Inn: Hotel2351
Brown Hill Tavern: Bar6874
Dairy Queen: Ice Cream Shop6843
TravelCenters of America: Gas Station7057
Flying J: Gas Station6884
Cinnabon: Sandwich Place7387
Subway: Sandwich Place7351
Hampton Inn by Hilton: Hotel7044
Ingram's Market: Gas Station6915
The Cottage Restaurant: American Restaurant7214
Denny's: Breakfast Spot6880
Bc Hotel: Bar3862
Uni-Mart: Convenience Store5819
McDonald's: Fast Food Restaurant7294
Millers Gun Shop: Sporting Goods Shop9482
J Brothers Construction LLC: Construction & Landscaping9897


In [462]:
results['response']['groups'][0]['items'][2]['venue']

{'id': '4c95227438dd8cfaafe2cf62',
 'name': 'Nature Inn',
 'location': {'address': '201 Warbler Way',
  'crossStreet': 'at Bald Eagle State Park',
  'lat': 41.03956985473633,
  'lng': -77.63091278076172,
  'labeledLatLngs': [{'label': 'display',
    'lat': 41.03956985473633,
    'lng': -77.63091278076172}],
  'distance': 2351,
  'postalCode': '16841',
  'cc': 'US',
  'city': 'Howard',
  'state': 'PA',
  'country': 'United States',
  'formattedAddress': ['201 Warbler Way (at Bald Eagle State Park)',
   'Howard, PA 16841',
   'United States']},
 'categories': [{'id': '4bf58dd8d48988d1fa931735',
   'name': 'Hotel',
   'pluralName': 'Hotels',
   'shortName': 'Hotel',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/travel/hotel_',
    'suffix': '.png'},
   'primary': True}],
 'photos': {'count': 0, 'groups': []}}

#### Display the venues nearby on map

In [456]:
# map of a park with its nearby venues
map_park = folium.Map(location=[lat, lng], zoom_start=12)
label = "{}: {}".format(df_park['Park name'][parkId], df_park['Remarks'][parkId])
label = folium.Popup(label, parse_html=True)
folium.CircleMarker(
        [lat, lng], radius=5, popup=label, color='red', fill=True, fill_color='#d63855', fill_opacity=0.7,
        parse_html=False).add_to(map_park) 

# add markers to map
for i in range(len(results['response']['groups'][0]['items'])):
    venue_name = results['response']['groups'][0]['items'][i]['venue']['name']
    venue_categ = results['response']['groups'][0]['items'][i]['venue']['categories'][0]['name']
    lat_v = results['response']['groups'][0]['items'][i]['venue']['location']['lat']
    lng_v = results['response']['groups'][0]['items'][i]['venue']['location']['lng']
    
    label = "{} ({})".format(venue_name, venue_categ)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat_v, lng_v], radius=5, popup=label, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7,
        parse_html=False).add_to(map_park)  
    
map_park

#### Map all categories to its top category (1 out of 10)
Build a dictionary for all possible fsq category ids, and their value will be the corresponding top category.

In [354]:
url = 'https://api.foursquare.com/v2/venues/categories?client_id={}&client_secret={}&v={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION)
all_categ = requests.get(url).json()
all_categ['response']['categories'][3]#['categories'][0]#['categories']

In [434]:
# find the top level category for each categ id
# since we only have limited number of datapoint (121 parks), features are simplified by only considering the 10 top categ
def create_categ_mapping(all_categ):    
    categ_mapping = {}
    categ_parent = {}
    def find_categ(node, label, parent):
        # map id to label
        categ_mapping[node['id']] = label
        categ_parent[node['id']] = parent

        for child in node.get('categories', []): # if at leaf, categories is empty
            find_categ(child, label, node)
        
            
    for itopc in range(10):
        root = all_categ['response']['categories'][itopc]
        find_categ(root, root['name'], None)
        
    return categ_mapping, categ_parent

categ_mapping, categ_parent = create_categ_mapping(all_categ)


In [443]:
categ_mapping['4bf58dd8d48988d152941735']

'Food'

In [478]:
categ_keys = []
for itopc in range(10):
    categ_keys.append(all_categ['response']['categories'][itopc]['name'])
categ_keys

['Arts & Entertainment',
 'College & University',
 'Event',
 'Food',
 'Nightlife Spot',
 'Outdoors & Recreation',
 'Professional & Other Places',
 'Residence',
 'Shop & Service',
 'Travel & Transport']

#### Generate summary stats on number of nearby venues under each top categories

In [482]:
# use defaultdict to handle default value
# from collections import defaultdict

all_venue_stat = []
for parkId in range(len(df_park)):
    results = fsq_explore(parkId)
    venue_stat = { k: 0 for k in categ_keys }

    for item in results['response']['groups'][0]['items']:
        venue_top_categ = categ_mapping[item['venue']['categories'][0]['id']]
        venue_stat[venue_top_categ] += 1    
    
    all_venue_stat.append(venue_stat)

In [494]:
for ipark in range(121):
    for categ in categ_keys:
        df_park.loc[ipark, 'nearby '+categ] = all_venue_stat[ipark][categ]

In [613]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  
    display(df_park)

Unnamed: 0,Park name,County or counties,Area in acres (ha),Date founded,Stream(s) and / or lake(s),Remarks,Image,latitude,longitude,fsq_venue_id,fsq_venue_name,fsq_venue_categ,nearby Arts & Entertainment,nearby College & University,nearby Event,nearby Food,nearby Nightlife Spot,nearby Outdoors & Recreation,nearby Professional & Other Places,nearby Residence,nearby Shop & Service,nearby Travel & Transport,fsq_likes,if water
0,Allegheny Islands State Park,Allegheny County,43.0,1980,Allegheny River,Three alluvial islands near Pittsburgh with no...,,40.53711,-79.823338,4d759203497fa1431e19d225,Allegheny Islands State Park,State / Provincial Park,4.0,1.0,0.0,50.0,9.0,9.0,1.0,0.0,25.0,1.0,1.0,True
1,Archbald Pothole State Park,Lackawanna County,150.0,1964,,"One of world's largest potholes, 38 ft (12 m) ...",,41.509899,-75.568386,4c4c5150c668e21e43d156fb,Archbald Pothole State Park,State / Provincial Park,6.0,0.0,0.0,52.0,7.0,2.0,0.0,0.0,31.0,2.0,2.0,False
2,Bald Eagle State Park,Centre County,5900.0,1971,"Bald Eagle Creek, Foster Joseph Sayers Reservoir","1,730 acre (700 ha) U.S. Army Corps of Enginee...",,41.041731,-77.603049,4ba789bdf964a5209a9b39e3,Bald Eagle State Park,State / Provincial Park,0.0,0.0,0.0,7.0,2.0,1.0,0.0,0.0,6.0,2.0,15.0,True
3,Beltzville State Park,Carbon County,2973.0,1972,"Pohopoco Creek, Beltzville Lake",U.S. Army Corps of Engineers lake is 949 acres...,,40.868489,-75.601912,4f49199be4b0e7e90f5f5b62,Beltzville Lake State Park,State / Provincial Park,2.0,0.0,0.0,29.0,12.0,14.0,1.0,0.0,17.0,4.0,24.0,True
4,Bendigo State Park,Elk County,100.0,1959,East Branch Clarion River,"Only 20 acres (8.1 ha) developed, name a corru...",,41.530494,-78.627859,,Joy Gardens Roller Rink,Skating Rink,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,3.0,0.0,,True
5,Benjamin Rush State Park,Philadelphia County,275.0,1975,,Site of one of the world's largest community g...,,40.114946,-74.976659,4d026a9a73d3b60c071b5c5d,Benjamin Rush State Park,State / Provincial Park,4.0,0.0,0.0,56.0,8.0,4.0,0.0,0.0,28.0,0.0,1.0,False
6,Big Pocono State Park,Monroe County,1306.0,1954,,"On Camelback Mountain, site of Camelback Ski A...",,41.042431,-75.351939,4bf81b4b5efe2d7fba206a34,Big Pocono State Park,State / Provincial Park,10.0,0.0,0.0,35.0,2.0,13.0,2.0,0.0,31.0,7.0,1.0,False
7,Big Spring State Forest Picnic Area,Perry County,45.0,1936,Big Spring Run,Park has trail to partially completed railroad...,,40.262894,-77.658821,59eac5279746176eb940110a,Wildwood Family Campground,Campground,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,1.0,0.0,0.0,True
8,Black Moshannon State Park,Centre County,3394.0,1937,"Black Moshannon Creek, Black Moshannon Lake",Park has bog with three carnivorous plant spec...,,40.90129,-78.064566,4bf573842c6b76b00b81a18c,Black Moshannon State Park,State / Provincial Park,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,1.0,0.0,10.0,True
9,Blue Knob State Park,Bedford County,5874.0,1945,,This former Recreation Demonstration Area on t...,,40.278687,-78.581128,4c2bd860f7acef3b4066ed0c,Blue Knob State Park,State / Provincial Park,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,4.0,False


In [671]:
df_park.iloc[[31, 32]]

Unnamed: 0,Park name,County or counties,Area in acres (ha),Date founded,Stream(s) and / or lake(s),Remarks,Image,latitude,longitude,fsq_venue_id,...,nearby Event,nearby Food,nearby Nightlife Spot,nearby Outdoors & Recreation,nearby Professional & Other Places,nearby Residence,nearby Shop & Service,nearby Travel & Transport,fsq_likes,if water
31,French Creek State Park,Berks and Chester Counties,7339.0,1946,French Creek,"Former Recreation Demonstration Area, adjacent...",,40.214996,-75.789494,4bb7cd0a1261d13af4e6e798,...,0.0,28.0,2.0,12.0,2.0,0.0,24.0,6.0,37.0,True
32,Gifford Pinchot State Park,York County,2338.0,1961,"Beaver Creek (tributary of Conewago Creek), Pi...","Gifford Pinchot was a Pennsylvania governor, c...",,40.0767,-76.887688,4c37f05193db0f47d7d62092,...,0.0,9.0,2.0,9.0,0.0,0.0,9.0,0.0,50.0,True


### Park popularity on FourSquare
#### Search for the state park venue_id in order to retrieve its details

In [117]:
categId = '5bae9231bedf3950379f89d0' # category id for "State / Provincial Park"
radius = 8000 # meters, max 100,000 meters; 
LIMIT = 10
parkIndex = 79
lat = df_park['latitude'][parkIndex]
lng = df_park['longitude'][parkIndex]
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&categoryId={}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat, 
    lng,
    categId,
    radius, 
    LIMIT)
venue_park_results = requests.get(url).json()

In [118]:
venue_park_results['response']['venues'][0]['location']['distance']

9454

In [136]:
categId = '5bae9231bedf3950379f89d0' # category id for "State / Provincial Park"
# other possible categ: trail; campground; Scenic Lookout; park
categId2 = '4d4b7105d754a06377d81259'
# Outdoors & Recreation (top categ): 4d4b7105d754a06377d81259
parkId_list = [] # target name, found park name, found id
radius = 5000; LIMIT = 10

for lat, lng, park in zip(df_park['latitude'], df_park['longitude'], df_park['Park name']):
    url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&categoryId={}&radius={}&limit={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, categId, radius, LIMIT)
    venue_park_results = requests.get(url).json()
    
    found_park = np.NaN
    found_categ = np.NaN
    found_id = np.NaN
    distance = np.NaN
    if len(venue_park_results['response']['venues']): # if results exist, pick the closest one ([0])
        found_park = venue_park_results['response']['venues'][0]['name']
        found_categ = venue_park_results['response']['venues'][0]['categories'][0]['name']
        found_id = venue_park_results['response']['venues'][0]['id']
        distance = venue_park_results['response']['venues'][0]['location']['distance']
    else: # if did find "State park" nearby, try another category
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&categoryId={}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, categId2, radius, LIMIT)
        venue_park_results = requests.get(url).json()
        if len(venue_park_results['response']['venues']): # if results exist, pick the closest one ([0])
            found_park = venue_park_results['response']['venues'][0]['name']
            found_categ = venue_park_results['response']['venues'][0]['categories'][0]['name']
            found_id = venue_park_results['response']['venues'][0]['id']
            distance = venue_park_results['response']['venues'][0]['location']['distance']
         
    parkId_list.append([park, found_park, found_categ, found_id, distance])

In [138]:
df_parkId = pd.DataFrame(parkId_list)
df_parkId.columns = ['Park name', 'found name', 'found in category', 'found Id', 'distance']
df_parkId

Unnamed: 0,Park name,found name,found in category,found Id,distance
0,Allegheny Islands State Park,Allegheny Islands State Park,State / Provincial Park,4d759203497fa1431e19d225,120.0
1,Archbald Pothole State Park,Archbald Pothole State Park,State / Provincial Park,4c4c5150c668e21e43d156fb,528.0
2,Bald Eagle State Park,Bald Eagle State Park,State / Provincial Park,4ba789bdf964a5209a9b39e3,4048.0
3,Beltzville State Park,Beltzville Lake State Park,State / Provincial Park,4f49199be4b0e7e90f5f5b62,2169.0
4,Bendigo State Park,Joy Gardens Roller Rink,Skating Rink,4efa283cb8f72ebeaab18408,5481.0
...,...,...,...,...,...
116,Washington Crossing Historic Park,Washington Crossing State Park,State / Provincial Park,4c1a368198f4a593323401f6,2593.0
117,Whipple Dam State Park,Whipple Dam State Park,State / Provincial Park,4ba257baf964a52025ef37e3,270.0
118,White Clay Creek Preserve,White Clay Creek State Park,State / Provincial Park,4bdc6b55c79cc9286b7386e9,3507.0
119,Worlds End State Park,Forksville Covered Bridge,Bridge,5b3eb74098fbfc002c0a7f00,3563.0


In [615]:
np.sum(df_parkId['found Id'].isna())

5

In [253]:
# problematic finds: 4(x), 7, 25, 38, 42, 45,  
#                    76(nan), 78(x), 88(x), 98, 105(nan), 113(x), 114(x), 115 

# update the ones with inaccurate information
df_parkId.loc[50, 'found Id'] = '5856cfbd809a770330d0ff0c'; df_parkId.loc[50, 'found name'] = 'laurel mountain ski resort'; df_parkId.loc[50, 'found in category'] = 'Ski Area'; df_parkId.loc[50, 'distance'] = 570
df_parkId.loc[53, 'found Id'] = '4bf919805efe2d7f67b26b34'; df_parkId.loc[53, 'found name'] = 'Lehigh Gorge S.P. Trail - Glen Onoko Access'; df_parkId.loc[53, 'found in category'] = 'Trail'
df_parkId.loc[72, 'found Id'] = '4bcda1ddfb84c9b6bf40223e'; df_parkId.loc[72, 'found name'] = 'Nolde Forest State Park'; df_parkId.loc[72, 'found in category'] = 'Trail'; df_parkId.loc[72, 'distance'] = 839
df_parkId.loc[79, 'found Id'] = '4c2642c6db519521555d2c3a'; df_parkId.loc[79, 'found name'] = 'Penn-Roosevelt State Park'; df_parkId.loc[79, 'found in category'] = 'State / Provincial Park'; df_parkId.loc[79, 'distance'] = 9454
df_parkId.loc[86, 'found Id'] = '4c1e6a0efcf8c9b6193aad0b'; df_parkId.loc[86, 'found name'] = 'Promised Land State Park'; df_parkId.loc[86, 'found in category'] = 'Lake'; df_parkId.loc[86, 'distance'] = 583
df_parkId.loc[104, 'found Id'] = '4cb9fd0e035d236ab289d74e'; df_parkId.loc[104, 'found name'] = 'Sinnemahoning State Park'; df_parkId.loc[104, 'found in category'] = 'State / Provincial Park'; df_parkId.loc[104, 'distance'] = 5959
df_parkId.loc[106, 'found Id'] = '4c0a6e4b340720a1205a8693'; df_parkId.loc[106, 'found name'] = 'Susquehanna River'; df_parkId.loc[106, 'found in category'] = 'River'; df_parkId.loc[106, 'distance'] = 3742
df_parkId.loc[119, 'found Id'] = '4ddd8668183877913989560c'; df_parkId.loc[119, 'found name'] = 'Worlds End State Park'; df_parkId.loc[119, 'found in category'] = 'Trail'; df_parkId.loc[119, 'distance'] = 364

# remove the ones that were failed to find
df_parkId.loc[4, 'found Id'] = np.NaN
df_parkId.loc[78, 'found Id'] = np.NaN
df_parkId.loc[88, 'found Id'] = np.NaN
df_parkId.loc[113, 'found Id'] = '4fb2883fe4b059b0d49c8437'
df_parkId.loc[114, 'found Id'] = '4fb2883fe4b00dd091d29b8b'

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  
    display(df_parkId)

Unnamed: 0,Park name,found name,found in category,found Id,distance
0,Allegheny Islands State Park,Allegheny Islands State Park,State / Provincial Park,4d759203497fa1431e19d225,120.0
1,Archbald Pothole State Park,Archbald Pothole State Park,State / Provincial Park,4c4c5150c668e21e43d156fb,528.0
2,Bald Eagle State Park,Bald Eagle State Park,State / Provincial Park,4ba789bdf964a5209a9b39e3,4048.0
3,Beltzville State Park,Beltzville Lake State Park,State / Provincial Park,4f49199be4b0e7e90f5f5b62,2169.0
4,Bendigo State Park,Joy Gardens Roller Rink,Skating Rink,,5481.0
5,Benjamin Rush State Park,Benjamin Rush State Park,State / Provincial Park,4d026a9a73d3b60c071b5c5d,905.0
6,Big Pocono State Park,Big Pocono State Park,State / Provincial Park,4bf81b4b5efe2d7fba206a34,2330.0
7,Big Spring State Forest Picnic Area,Wildwood Family Campground,Campground,59eac5279746176eb940110a,2881.0
8,Black Moshannon State Park,Black Moshannon State Park,State / Provincial Park,4bf573842c6b76b00b81a18c,1120.0
9,Blue Knob State Park,Blue Knob State Park,State / Provincial Park,4c2bd860f7acef3b4066ed0c,1767.0


In [147]:
df_parkId['found in category'].unique()

array(['State / Provincial Park', 'Skating Rink', 'Campground', 'Trail',
       'Other Great Outdoors', 'Summer Camp', 'Scenic Lookout', 'Park',
       'Roller Rink', nan, 'Bathing Area', 'Recreation Center', 'Well',
       'Harbor / Marina', 'Lake', 'Golf Course', 'Bridge'], dtype=object)

In [338]:
df_park['fsq_venue_id'] = df_parkId['found Id']
df_park['fsq_venue_name'] = df_parkId['found name']
df_park['fsq_venue_categ'] = df_parkId['found in category']
df_park

Unnamed: 0,Park name,County or counties,Area in acres (ha),Date founded,Stream(s) and / or lake(s),Remarks,Image,latitude,longitude,fsq_venue_id,fsq_venue_name,fsq_venue_categ
0,Allegheny Islands State Park,Allegheny County,43 acres (17 ha),1980,Allegheny River,Three alluvial islands near Pittsburgh with no...,,40.537110,-79.823338,4d759203497fa1431e19d225,Allegheny Islands State Park,State / Provincial Park
1,Archbald Pothole State Park,Lackawanna County,150 acres (61 ha),1964,,"One of world's largest potholes, 38 ft (12 m) ...",,41.509899,-75.568386,4c4c5150c668e21e43d156fb,Archbald Pothole State Park,State / Provincial Park
2,Bald Eagle State Park,Centre County,"5,900 acres (2,388 ha)",1971,"Bald Eagle Creek, Foster Joseph Sayers Reservoir","1,730 acre (700 ha) U.S. Army Corps of Enginee...",,41.041731,-77.603049,4ba789bdf964a5209a9b39e3,Bald Eagle State Park,State / Provincial Park
3,Beltzville State Park,Carbon County,"2,973 acres (1,203 ha)",1972,"Pohopoco Creek, Beltzville Lake",U.S. Army Corps of Engineers lake is 949 acres...,,40.868489,-75.601912,4f49199be4b0e7e90f5f5b62,Beltzville Lake State Park,State / Provincial Park
4,Bendigo State Park,Elk County,100 acres (40 ha),1959,East Branch Clarion River,"Only 20 acres (8.1 ha) developed, name a corru...",,41.530494,-78.627859,,Joy Gardens Roller Rink,Skating Rink
...,...,...,...,...,...,...,...,...,...,...,...,...
116,Washington Crossing Historic Park,Bucks County,500 acres (202 ha),2016,Delaware River,Site of George Washington's crossing of the De...,,40.297607,-74.876276,4c1a368198f4a593323401f6,Washington Crossing State Park,State / Provincial Park
117,Whipple Dam State Park,Huntingdon County,256 acres (104 ha),1928,Whipple Lake,"There was a camp for Boy Scouts, Girl Scouts, ...",,40.686769,-77.862838,4ba257baf964a52025ef37e3,Whipple Dam State Park,State / Provincial Park
118,White Clay Creek Preserve,Chester County,"1,255 acres (508 ha)",1984,White Clay Creek,"Park was donated by DuPont to preserve ""divers...",,39.743574,-75.769126,4bdc6b55c79cc9286b7386e9,White Clay Creek State Park,State / Provincial Park
119,Worlds End State Park,Sullivan County,780 acres (316 ha),1932,Loyalsock Creek,"A ""Must See Park"" known for trout fishing, whi...",,41.461818,-76.576093,4ddd8668183877913989560c,Worlds End State Park,Trail


#### Get some details about each state park (fsq premium endpoint)

In [343]:
# VENUE_ID = '4f88baafe4b02981899ad0fa'#venue_park_results['response']['venues'][0]['id']
state_park_detail = []
for VENUE_ID in df_parkId['found Id']:
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(VENUE_ID, CLIENT_ID, CLIENT_SECRET, VERSION)
    venue_results = requests.get(url).json()
#     venue_results['response']['venue']['rating'] #or['likes']
    state_park_detail.append(venue_results)

In [569]:
state_park_detail[120]

{'meta': {'code': 200, 'requestId': '5f615a9b096e8e43567c9e92'},
 'response': {'venue': {'id': '4c2aee419a559c7463150de2',
   'name': 'Yellow Creek State Park',
   'contact': {'phone': '7243577913',
    'formattedPhone': '(724) 357-7913',
    'twitter': 'visitpaparks'},
   'location': {'address': '170 Route 259 Hwy',
    'crossStreet': 'Hwy 259',
    'lat': 40.57835194804436,
    'lng': -79.00116741657257,
    'labeledLatLngs': [{'label': 'display',
      'lat': 40.57835194804436,
      'lng': -79.00116741657257}],
    'postalCode': '15765',
    'cc': 'US',
    'city': 'Penn Run',
    'state': 'PA',
    'country': 'United States',
    'formattedAddress': ['170 Route 259 Hwy (Hwy 259)',
     'Penn Run, PA 15765',
     'United States']},
   'canonicalUrl': 'https://foursquare.com/v/yellow-creek-state-park/4c2aee419a559c7463150de2',
   'categories': [{'id': '5bae9231bedf3950379f89d0',
     'name': 'State / Provincial Park',
     'pluralName': 'State / Provincial Parks',
     'shortName': 

In [570]:
#to_pickle(state_park_detail, 'state_park_detail.pkl')
# state_park_detail = from_pickle('state_park_detail.pkl')#['response']['venue'].keys()
# len(state_park_detail)

In [572]:
park_rating_list = []
for ipark in range(121):
    if state_park_detail[ipark]['meta']['code'] == 200:
        park_rating_list.append([df_park.loc[ipark, 'Park name'], 
                            state_park_detail[ipark]['response']['venue'].get('rating', np.NaN)])
park_rating = pd.DataFrame(park_rating_list)
park_rating

Unnamed: 0,0,1
0,Allegheny Islands State Park,
1,Archbald Pothole State Park,
2,Bald Eagle State Park,8.9
3,Beltzville State Park,7.6
4,Benjamin Rush State Park,
...,...,...
111,Washington Crossing Historic Park,8.5
112,Whipple Dam State Park,7.8
113,White Clay Creek Preserve,8.5
114,Worlds End State Park,


In [576]:
park_likes_list = []
for ipark in range(121):
    if state_park_detail[ipark]['meta']['code'] == 200:
        park_likes_list.append([df_park.loc[ipark, 'Park name'], 
                            state_park_detail[ipark]['response']['venue']['likes']['count']])
    else:
        park_likes_list.append([df_park.loc[ipark, 'Park name'], np.NaN])
park_likes = pd.DataFrame(park_likes_list)
park_likes.columns = ['Park name', 'fsq likes']
with pd.option_context('display.max_rows', None):
    display(park_likes)

Unnamed: 0,Park name,fsq likes
0,Allegheny Islands State Park,1.0
1,Archbald Pothole State Park,2.0
2,Bald Eagle State Park,15.0
3,Beltzville State Park,24.0
4,Bendigo State Park,
5,Benjamin Rush State Park,1.0
6,Big Pocono State Park,1.0
7,Big Spring State Forest Picnic Area,0.0
8,Black Moshannon State Park,10.0
9,Blue Knob State Park,4.0


In [604]:
df_park['fsq_likes'] = park_likes['fsq likes']

### Summarize extracted features

In [616]:
df_park.columns

Index(['Park name', 'County or counties', 'Area in acres (ha)', 'Date founded',
       'Stream(s) and / or lake(s)', 'Remarks', 'Image', 'latitude',
       'longitude', 'fsq_venue_id', 'fsq_venue_name', 'fsq_venue_categ',
       'nearby Arts & Entertainment', 'nearby College & University',
       'nearby Event', 'nearby Food', 'nearby Nightlife Spot',
       'nearby Outdoors & Recreation', 'nearby Professional & Other Places',
       'nearby Residence', 'nearby Shop & Service',
       'nearby Travel & Transport', 'fsq_likes', 'if water'],
      dtype='object')

In [618]:
df_park_summary = df_park[['Park name', 'County or counties', 'Date founded', 'latitude', 'longitude', 'fsq_venue_categ',
                         'if water', 'nearby Arts & Entertainment', 'nearby College & University',
                         'nearby Event', 'nearby Food', 'nearby Nightlife Spot',
                         'nearby Outdoors & Recreation', 'nearby Professional & Other Places',
                         'nearby Residence', 'nearby Shop & Service',
                         'nearby Travel & Transport', 'fsq_likes']]
df_park_summary

Unnamed: 0,Park name,County or counties,Date founded,latitude,longitude,fsq_venue_categ,if water,nearby Arts & Entertainment,nearby College & University,nearby Event,nearby Food,nearby Nightlife Spot,nearby Outdoors & Recreation,nearby Professional & Other Places,nearby Residence,nearby Shop & Service,nearby Travel & Transport,fsq_likes
0,Allegheny Islands State Park,Allegheny County,1980,40.537110,-79.823338,State / Provincial Park,True,4.0,1.0,0.0,50.0,9.0,9.0,1.0,0.0,25.0,1.0,1.0
1,Archbald Pothole State Park,Lackawanna County,1964,41.509899,-75.568386,State / Provincial Park,False,6.0,0.0,0.0,52.0,7.0,2.0,0.0,0.0,31.0,2.0,2.0
2,Bald Eagle State Park,Centre County,1971,41.041731,-77.603049,State / Provincial Park,True,0.0,0.0,0.0,7.0,2.0,1.0,0.0,0.0,6.0,2.0,15.0
3,Beltzville State Park,Carbon County,1972,40.868489,-75.601912,State / Provincial Park,True,2.0,0.0,0.0,29.0,12.0,14.0,1.0,0.0,17.0,4.0,24.0
4,Bendigo State Park,Elk County,1959,41.530494,-78.627859,Skating Rink,True,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,3.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,Washington Crossing Historic Park,Bucks County,2016,40.297607,-74.876276,State / Provincial Park,True,3.0,0.0,0.0,60.0,4.0,14.0,2.0,0.0,15.0,2.0,23.0
117,Whipple Dam State Park,Huntingdon County,1928,40.686769,-77.862838,State / Provincial Park,True,0.0,0.0,0.0,1.0,0.0,5.0,1.0,0.0,0.0,0.0,8.0
118,White Clay Creek Preserve,Chester County,1984,39.743574,-75.769126,State / Provincial Park,True,1.0,3.0,0.0,59.0,5.0,14.0,0.0,0.0,16.0,2.0,40.0
119,Worlds End State Park,Sullivan County,1932,41.461818,-76.576093,Trail,True,1.0,0.0,0.0,2.0,1.0,3.0,0.0,0.0,1.0,3.0,5.0


In [619]:
df_park_summary.to_pickle('df_park_summary.pkl')  
# df_park_summary = pd.read_pickle('df_park_summary.pkl')

## Part II. PA state park clusters
Unsupervised clustering based on features such as nearby venues and popularities.

In [631]:
# get features that would be used for clustering
df_park_clustering = df_park_summary[['nearby Arts & Entertainment', 'nearby College & University',
                                      'nearby Event', 'nearby Food', 'nearby Nightlife Spot',
                                      'nearby Outdoors & Recreation', 'nearby Professional & Other Places',
                                      'nearby Residence', 'nearby Shop & Service',
                                      'nearby Travel & Transport', 'fsq_likes']]
df_park_clustering.loc[df_park_clustering['fsq_likes'].isna(), 'fsq_likes'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [661]:
# set number of clusters
kclusters = 6

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_park_clustering)

In [662]:
# add clustering labels
# neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
df_park_summary['Cluster Labels'] = kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [663]:
# visualize the clusters

# create map
map_clusters = folium.Map(location=[latitude_pa, longitude_pa], zoom_start=7.5)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_park_summary['latitude'], df_park_summary['longitude'], 
                                  df_park_summary['Park name'], df_park_summary['Cluster Labels']):
    label = folium.Popup(str(poi) + ', Cluster ' + str(cluster), parse_html=True)
    
    folium.CircleMarker(
        [lat, lon], radius=5, popup=label, color='#000000' if np.isnan(cluster) else rainbow[int(cluster)-1],
        fill=True, fill_color='#000000' if np.isnan(cluster) else rainbow[int(cluster)-1], fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
# downtown Toronto neighborhoods are well clustered