# Segmenting and Clustering - NYC Example

This is an example that illustrates how to convert addresses into latitude and longitude.

It then uses the foursquare API to do some analysis on New York City neighborhoods.

Using some of the foursquare data we will take an analytical approach with k-Means clustering we will segment the city into neighborhoods.

We will also make use of folium to visualize some of the results.

In [45]:
# Importing libraries, etc, that we will use

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

In [46]:
# Load the new york json data

with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [47]:
# Quick look at the data set

newyork_data

{'type': 'FeatureCollection',
 'totalFeatures': 306,
 'features': [{'type': 'Feature',
   'id': 'nyu_2451_34572.1',
   'geometry': {'type': 'Point',
    'coordinates': [-73.84720052054902, 40.89470517661]},
   'geometry_name': 'geom',
   'properties': {'name': 'Wakefield',
    'stacked': 1,
    'annoline1': 'Wakefield',
    'annoline2': None,
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.84720052054902,
     40.89470517661,
     -73.84720052054902,
     40.89470517661]}},
  {'type': 'Feature',
   'id': 'nyu_2451_34572.2',
   'geometry': {'type': 'Point',
    'coordinates': [-73.82993910812398, 40.87429419303012]},
   'geometry_name': 'geom',
   'properties': {'name': 'Co-op City',
    'stacked': 2,
    'annoline1': 'Co-op',
    'annoline2': 'City',
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.82993910812398,
     40.87429419303012,
     -73.82993910812398,
     40.87429419303012]}},
  {'type': 'Feature',
 

In [48]:
# Define a variable for the features in the newyork_data.json file
neighborhoods_data = newyork_data['features']

# Take a look at the first entry here
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [49]:
# Transform the data into a pandas dataframe
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
neighborhoods = pd.DataFrame(columns = column_names)

# Loop through the data and fill the data frame
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough']
    neighborhood_name = data['properties']['name']
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    neighborhoods = neighborhoods.append({'Borough': borough, 'Neighborhood': neighborhood_name, 'Latitude': neighborhood_lat, 'Longitude': neighborhood_lon}, ignore_index = True)

# Check that the data loaded the way we expected
neighborhoods.head()    

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


## Using geopy library

Next we use the geopy library to get the latitude and longitude of NYC. We create an instance of the geocoder and define a user_agent, which we will name ny_explorer.

In [50]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print("The geographical coordinates of New York City are {}, {}.".format(latitude, longitude))

The geographical coordinates of New York City are 40.7127281, -74.0060152.


In [51]:
# Use folium to create a map of the city

map_newyork = folium.Map(location = [latitude, longitude], zoom_start = 10)

for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'],neighborhoods['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label,parse_html = True)
    folium.CircleMarker(
        [lat,lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_newyork)

map_newyork

In [52]:
# Simplify the map by restricting view to manhattan

manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop = True)

address = 'Manhattan, NY'

geolocator = Nominatim(user_agent = "ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Manhattan are {}, {}'.format(latitude, longitude))

The geographical coordinates of Manhattan are 40.7896239, -73.9598939


In [53]:
map_manhattan = folium.Map(location=[latitude,longitude], zoom_start = 11)

for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label,parse_html = True)
    folium.CircleMarker(
        [lat,lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#31866.cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_manhattan)
    
map_manhattan

## Using the Foursquare API to explore and segment neighborhoods

In [54]:
# Connecting to foursquare

CLIENT_ID = 'P4GV2G4LEKK4XMTHM0H4H5W3CFR055TAHR2IQ3JPLJ0TFHEK'
CLIENT_SECRET = 'CJ2YTC5JCO4F3PNHKPSZPMFOTIU5RYD4QD1WB5FYRCS4FD5B'
VERSION ='20180604'
LIMIT = 100

In [55]:
manhattan_data.loc[0,'Neighborhood']

'Marble Hill'

In [56]:
neighborhood_latitude = manhattan_data.loc[0,'Latitude']
neighborhood_longitude = manhattan_data.loc[0,'Longitude']
neighborhood_name = manhattan_data.loc[0,'Neighborhood']
print("Latitude and longitude values of {} are {},{}.".format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Marble Hill are 40.87655077879964,-73.91065965862981.


In [57]:
# Get the top 100 venues that are in Marble Hill within a radius of 500 meters

LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,CLIENT_SECRET,VERSION,neighborhood_latitude,neighborhood_longitude,radius,LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=P4GV2G4LEKK4XMTHM0H4H5W3CFR055TAHR2IQ3JPLJ0TFHEK&client_secret=CJ2YTC5JCO4F3PNHKPSZPMFOTIU5RYD4QD1WB5FYRCS4FD5B&v=20180604&ll=40.87655077879964,-73.91065965862981&radius=500&limit=100'

In [58]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fc55f8772ff4663205bb895'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Marble Hill',
  'headerFullLocation': 'Marble Hill, New York',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 22,
  'suggestedBounds': {'ne': {'lat': 40.88105078329964,
    'lng': -73.90471933917806},
   'sw': {'lat': 40.87205077429964, 'lng': -73.91659997808156}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b4429abf964a52037f225e3',
       'name': "Arturo's",
       'location': {'address': '5198 Broadway',
        'crossStreet': 'at 225th St.',
        'lat': 40.87441177110231,
        'lng': -73.91027100981574,
        'labeledLatLngs': [{'label'

In [60]:
# Function to extract the category of a venue

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [61]:
# Clean the json file and put it into a pandas data frame

venues = results['response']['groups'][0]['items']

nearby_venues = pd.json_normalize(venues)

filtered_columns = ['venue.name','venue.categories','venue.location.lat','venue.location.lng']
nearby_venues = nearby_venues.loc[:,filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis = 1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

KeyError: 'groups'

In [17]:
print("{} venues were returned by Foursquare".format(nearby_venues.shape[0]))

22 venues were returned by Foursquare


## Expanding the Process to All Neighborhoods in Manhattan

In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name,lat,lng in zip(names, latitudes, longitudes):
        print(name)
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={},&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
    
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns=['Neighborhood', 'Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']
        
        return(nearby_venues)

In [31]:
manhattan_venues = getNearbyVenues(names = manhattan_data['Neighborhood'],latitudes=manhattan_data['Latitude'],longitudes=manhattan_data['Longitude'])

Marble Hill


KeyError: 'groups'

In [59]:
names='Marble Hill'
latitudes=40.876551
longitudes=-73.910660

url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={},&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)

results = requests.get(url).json()
results

{'meta': {'code': 400,
  'errorType': 'invalid_auth',
  'errorDetail': 'Missing access credentials. See https://developer.foursquare.com/docs/api/configuration/authentication for details.',
  'requestId': '5fc55f93ca93552c137274f1'},
 'response': {}}