In [1]:
# Problem Statement
# I have been approached by a Manhattan-based businessman who wants to start a service that delivers Indian food 
# to customers' homes. He believes the demand for his service will be higher in neighborhoods of Manhattan where there 
# is a lack of Indian restaurants. He has asked me to help him identify such neighborhoods so that he can better target 
# his marketing budget.


In [2]:
import json
import os
import urllib.request

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

In [3]:
# Source: IBM Skills Network copy of the NYC neighborhoods GeoJSON file.
url = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
    'IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json'
)
# Local path the download is written to (relative to the working directory).
filename = 'newyork_data.json'

# Fetch the file; the (path, headers) tuple returned is shown as the cell output.
urllib.request.urlretrieve(url=url, filename=filename)

('newyork_data.json', <http.client.HTTPMessage at 0x1d8d54c0518>)

In [4]:
# Parse the downloaded GeoJSON. The encoding is given explicitly because JSON
# is UTF-8 while open() would otherwise use the platform's default codec.
with open('newyork_data.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

# Each entry of 'features' describes one neighborhood.
neighborhoods_data = newyork_data['features']

# Show the first feature to see which fields are available.
print(neighborhoods_data[0])

{'type': 'Feature', 'id': 'nyu_2451_34572.1', 'geometry': {'type': 'Point', 'coordinates': [-73.84720052054902, 40.89470517661]}, 'geometry_name': 'geom', 'properties': {'name': 'Wakefield', 'stacked': 1, 'annoline1': 'Wakefield', 'annoline2': None, 'annoline3': None, 'annoangle': 0.0, 'borough': 'Bronx', 'bbox': [-73.84720052054902, 40.89470517661, -73.84720052054902, 40.89470517661]}}


In [5]:
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Collect one record per feature and build the DataFrame in a single step.
# Growing a frame row by row with DataFrame.append is quadratic, and the
# method no longer exists in pandas 2.x.
# Coordinates are read as [longitude, latitude]: index 1 is the latitude.
neighborhood_records = [
    {
        'Borough': feature['properties']['borough'],
        'Neighborhood': feature['properties']['name'],
        'Latitude': feature['geometry']['coordinates'][1],
        'Longitude': feature['geometry']['coordinates'][0],
    }
    for feature in neighborhoods_data
]

# Passing `columns` keeps the expected layout even if there are no records.
neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [6]:
# Report how many distinct boroughs and how many neighborhood rows were loaded.
borough_count = len(neighborhoods['Borough'].unique())
neighborhood_count = len(neighborhoods)
summary_template = 'The NYC dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, neighborhood_count))

The NYC dataframe has 5 boroughs and 306 neighborhoods.


In [7]:
# Creating a subset of Manhattan neighborhoods for our analysis
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
print('The Manhattan dataframe has {} neighborhoods.'.format(
        manhattan_data.shape[0]
    )
)

# Geocode the borough to get a centre point for the maps.
address = 'Manhattan, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))


# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per neighborhood, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Borough'], manhattan_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html belongs to the Popup; the stray parse_html=False that used to
    # be passed to CircleMarker was not a marker option and has been dropped.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_manhattan)

# last expression of the cell: render the map
map_manhattan

The Manhattan dataframe has 40 neighborhoods.
The geograpical coordinate of Manhattan are 40.7896239, -73.9598939.


In [8]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET
# and FOURSQUARE_ACCESS_TOKEN before starting the kernel; a missing variable
# raises KeyError naming it.
# NOTE(review): the values that used to be hardcoded in this cell have been
# published with the notebook and should be revoked and regenerated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
ACCESS_TOKEN = os.environ['FOURSQUARE_ACCESS_TOKEN'] # your FourSquare Access Token
VERSION = '20180604' # sent as the `v` query parameter of each request
LIMIT = 100 # sent as the `limit` query parameter of each request
category_Id = '4bf58dd8d48988d10f941735' # Category Id of 'Indian Restaurant'

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=10):
    """Count the venues of category `category_Id` around each location.

    For every (name, latitude, longitude) triple one request is sent to the
    Foursquare `venues/search` endpoint, and the number of venues in the
    response is recorded.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood names and their coordinates; they are zipped together.
    radius : int, default 500
        Value sent as the `radius` query parameter.
    timeout : float, default 10
        Seconds to wait for each HTTP response before `requests` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per input location with the columns 'Neighborhood',
        'Latitude', 'Longitude' and 'Number of Indian Restaurants'.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    KeyError
        If the JSON payload has no `response.venues` entry.
    """
    endpoint = 'https://api.foursquare.com/v2/venues/search'

    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of
        # formatting the credentials into the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'll': '{},{}'.format(lat, lng),
            'oauth_token': ACCESS_TOKEN,
            'v': VERSION,
            'categoryId': category_Id,
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout keeps one stalled request from hanging the notebook.
        response = requests.get(endpoint, params=params, timeout=timeout)
        # Surface API errors here: a failed call must not be mistaken for a
        # neighborhood without restaurants.
        response.raise_for_status()
        venues = response.json()['response']['venues']

        # Only the count is needed, so no DataFrame is built per response.
        venues_list.append([name, lat, lng, len(venues)])

    nearby_venues = pd.DataFrame(venues_list, columns=['Neighborhood',
                                                       'Latitude',
                                                       'Longitude',
                                                       'Number of Indian Restaurants'])

    return nearby_venues

In [10]:
# Query the venue count around every Manhattan neighborhood
# (uses getNearbyVenues' default search radius).
mnhttn_indian_rest = getNearbyVenues(
    manhattan_data['Neighborhood'],
    manhattan_data['Latitude'],
    manhattan_data['Longitude'],
)

# Preview the first rows of the result.
mnhttn_indian_rest.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Number of Indian Restaurants
0,Marble Hill,40.876551,-73.91066,1
1,Chinatown,40.715618,-73.994279,4
2,Washington Heights,40.851903,-73.9369,2
3,Inwood,40.867684,-73.92121,0
4,Hamilton Heights,40.823604,-73.949688,2


In [11]:
# I am going to divide the neighborhoods into three clusters based on the number of Indian restaurants
kclusters = 3

# Select the clustering feature by name. This replaces drop([...], 1), whose
# positional axis argument no longer exists in pandas 2.x, and keeps the
# 'Cluster Labels' column out of the features if this cell is re-run after
# the labels have been added to the frame.
ds_clustering = mnhttn_indian_rest[['Number of Indian Restaurants']]

# n_init is pinned because the scikit-learn default changed from 10 to 'auto';
# random_state makes the fit reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(ds_clustering)

In [12]:
# add clustering labels
# DataFrame.insert raises ValueError if the column already exists, so drop any
# labels left over from a previous run of this cell before inserting them again.
mnhttn_indian_rest = mnhttn_indian_rest.drop(columns='Cluster Labels', errors='ignore')
mnhttn_indian_rest.insert(0, 'Cluster Labels', kmeans.labels_)

In [13]:
# Summarise every cluster: number of neighborhoods plus the smallest and
# largest restaurant count among them. A single groupby replaces the three
# copy-pasted print statements.
# KMeans label numbers are arbitrary, so the clusters are listed from fewest to
# most restaurants rather than in a hardcoded label order.
cluster_summary = (
    mnhttn_indian_rest
    .groupby('Cluster Labels')['Number of Indian Restaurants']
    .agg(['count', 'min', 'max'])
    .sort_values('min')
)

# itertuples(name=None) yields plain (label, count, min, max) tuples.
for cluster_label, n_neighborhoods, min_count, max_count in cluster_summary.itertuples(name=None):
    print('Cluster {} has {} neighborhoods. These neighborhoods have {} to {} Indian Restaurants in a 500 meter radius.'.format(
        cluster_label, n_neighborhoods, min_count, max_count
    ))

Cluster 0 has 22 neighborhoods. These neighborhoods have 0 to 6 Indian Restaurants in a 500 meter radius.
Cluster 2 has 12 neighborhoods. These neighborhoods have 7 to 16 Indian Restaurants in a 500 meter radius.
Cluster 1 has 6 neighborhoods. These neighborhoods have 21 to 28 Indian Restaurants in a 500 meter radius.


In [14]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, count, cluster in zip(mnhttn_indian_rest['Latitude'], mnhttn_indian_rest['Longitude'], mnhttn_indian_rest['Neighborhood'], mnhttn_indian_rest['Number of Indian Restaurants'], mnhttn_indian_rest['Cluster Labels']):
    label = folium.Popup(str(poi) + ': Indian Restaurant Count =  ' + str(count), parse_html=True)
    # Index the palette with the cluster label itself. The earlier
    # `int(cluster)-1` only worked for cluster 0 because index -1 wraps
    # around to the last color.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

# last expression of the cell: render the map
map_clusters

In [15]:
# Recommendation - 
# My recommendation to the businessman would be to target the seven neighborhoods in Manhattan north of Central Park. 
# They are geographically contiguous and all have a low density of Indian restaurants (members of Cluster '0', 
# i.e. 0 to 6 Indian restaurants within a 500 meter radius). 
# His service will be an attractive proposition for residents of these neighborhoods who enjoy Indian food.