## Downloading and installing requisite libraries

In [1]:
import numpy as np
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
!pip install geopy
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
!pip install folium
import folium

print('Libraries imported.')

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 7.8MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1
Libraries imported.


## Dowloading the New York neighborhood data .json file and saving to a dataframe

In [2]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [3]:
neighborhoods_data = newyork_data['features']
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
neighborhoods = pd.DataFrame(columns=column_names)
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [4]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explore")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

The dataframe has 5 boroughs and 306 neighborhoods.


## Using the Foursquare API to download venue data for all neighborhoods in NY

In [33]:
CLIENT_ID = 'PNJV3JL13UFR5TVUTSACNAWVLNZ221QV43S3KK1YAIS0JXNJ'
CLIENT_SECRET = 'UJUUD3PLP5SB2OLXCXTOI2VFNMHX2DLBZUYE5F55UQNMRB5U' 
VERSION = '20180605'
LIMIT = 100

In [6]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
    
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['id'],
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue ID',
                  'Venue',
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [7]:
newyork_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [8]:
print(newyork_venues.shape)
newyork_venues.head()

(10272, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,4c537892fd2ea593cb077a28,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,4d6af9426107f04dedeb297a,Rite Aid,40.896649,-73.844846,Pharmacy
2,Wakefield,40.894705,-73.847201,4c783cef3badb1f7e4244b54,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,5d5f5044d0ae1c0008f043c3,Walgreens,40.896687,-73.84485,Pharmacy
4,Wakefield,40.894705,-73.847201,4c25c212f1272d7f836385c5,Dunkin',40.890459,-73.849089,Donut Shop


In [74]:
categorytypes = pd.DataFrame(newyork_venues['Venue Category'].unique(),columns={"Venue Category"})
categorytypes.sort_values(by='Venue Category', ascending=True)
categorytypes

Unnamed: 0,Venue Category
0,Dessert Shop
1,Pharmacy
2,Ice Cream Shop
3,Donut Shop
4,Gas Station
5,Caribbean Restaurant
6,Sandwich Place
7,Pizza Place
8,Laundromat
9,Discount Store


## Extracting all locations with "Gym" in the Venue Category into a separate dataframe

In [9]:
df_gyms = newyork_venues[newyork_venues['Venue Category'].str.contains('Gym')]
df_gyms.shape

(266, 8)

## Using the Foursquare API to retrieve number of likes for each gym and adding this to the original df_gyms dataframe

In [35]:
likes_list=[]
for vid in df_gyms['Venue ID']:
    url = 'https://api.foursquare.com/v2/venues/{}/?client_id={}&client_secret={}&v={}&stats'.format(vid,CLIENT_ID, CLIENT_SECRET, VERSION)
    results = requests.get(url).json()
    likes = results["response"]["venue"]['likes']['count']
    likes_list.append(likes)

In [36]:
df_gyms["Likes"] = likes_list
df_gyms.reset_index(drop=True,inplace=True)
df_gyms.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,Likes
0,Riverdale,40.890834,-73.912585,4ce716cd0f196dcb7fe43bae,Hayden On Hudson Gym,40.889593,-73.917446,Gym,0
1,Marble Hill,40.876551,-73.91066,4cf6ae55d3a8a1cd71a9d243,Astral Fitness & Wellness Center,40.876705,-73.906372,Gym,6
2,Marble Hill,40.876551,-73.91066,55f751ca498eacc0307d1cfe,Blink Fitness,40.877271,-73.905595,Gym,32
3,Baychester,40.866858,-73.835798,4f5ce789e4b0a4baa3d5dbb6,Planet Fitness - Temporarily Closed,40.863298,-73.835568,Gym / Fitness Center,78
4,Bedford Park,40.870185,-73.885512,5cf7f63cb9b37b002c7b4d0f,Blink Fitness,40.873893,-73.888768,Gym,0


## Creating a new dataframe with average likes for each neighborhood

In [37]:
num_likes = df_gyms[['Neighborhood','Likes']].groupby('Neighborhood').mean()
num_likes.reset_index(inplace=True)
num_likes

Unnamed: 0,Neighborhood,Likes
0,Astoria,68.25
1,Auburndale,3.0
2,Battery Park City,64.0
3,Bay Ridge,42.0
4,Baychester,78.0
5,Bayside,6.5
6,Bedford Park,0.0
7,Beechhurst,6.0
8,Bellaire,1.0
9,Boerum Hill,18.666667


## Running cluster analysis based on number of likes

In [38]:
kclusters = 5

gym_clustering = num_likes.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(gym_clustering)
kmeans.labels_[0:40]

array([1, 0, 3, 3, 1, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 3, 3, 3, 0, 2, 0, 1,
       0, 3, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 1, 4, 0, 0], dtype=int32)

In [39]:
num_likes.insert(0, 'Cluster Labels', kmeans.labels_)
num_likes.head()

Unnamed: 0,Cluster Labels,Neighborhood,Likes
0,1,Astoria,68.25
1,0,Auburndale,3.0
2,3,Battery Park City,64.0
3,3,Bay Ridge,42.0
4,1,Baychester,78.0


## Creating a new dataframe wiht combined data

In [55]:
latlon = df_gyms[['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']]
latlon.drop_duplicates(keep=False, inplace=True)
latlon

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
0,Riverdale,40.890834,-73.912585
3,Baychester,40.866858,-73.835798
4,Bedford Park,40.870185,-73.885512
11,West Farms,40.839475,-73.877745
12,High Bridge,40.836623,-73.926102
16,Mott Haven,40.806239,-73.9161
17,Parkchester,40.837938,-73.856003
23,Bay Ridge,40.625801,-74.030621
26,Greenpoint,40.730201,-73.954241
27,Gravesend,40.59526,-73.973471


In [56]:
latlon.reset_index(drop=True, inplace=True)
latlon

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
0,Riverdale,40.890834,-73.912585
1,Baychester,40.866858,-73.835798
2,Bedford Park,40.870185,-73.885512
3,West Farms,40.839475,-73.877745
4,High Bridge,40.836623,-73.926102
5,Mott Haven,40.806239,-73.9161
6,Parkchester,40.837938,-73.856003
7,Bay Ridge,40.625801,-74.030621
8,Greenpoint,40.730201,-73.954241
9,Gravesend,40.59526,-73.973471


In [57]:
latlon = latlon.join(num_likes.set_index('Neighborhood'), on='Neighborhood')
latlon

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster Labels,Likes
0,Riverdale,40.890834,-73.912585,0,0.0
1,Baychester,40.866858,-73.835798,1,78.0
2,Bedford Park,40.870185,-73.885512,0,0.0
3,West Farms,40.839475,-73.877745,0,0.0
4,High Bridge,40.836623,-73.926102,0,1.0
5,Mott Haven,40.806239,-73.9161,0,8.0
6,Parkchester,40.837938,-73.856003,3,38.0
7,Bay Ridge,40.625801,-74.030621,3,42.0
8,Greenpoint,40.730201,-73.954241,0,10.0
9,Gravesend,40.59526,-73.973471,0,1.0


## Cluster Key for identifying neighborhoods with highest average likes

In [52]:
cluster_likes = latlon[['Cluster Labels','Likes']]
cluster_likes.groupby('Cluster Labels').mean().sort_values(by=['Likes'],ascending=True)

Unnamed: 0_level_0,Likes
Cluster Labels,Unnamed: 1_level_1
0,8.911111
3,40.727273
1,75.0
4,153.0
2,321.0


## Plotting a Cluster map

In [44]:
address = 'New York'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude  

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(latlon['Neighborhood Latitude'], latlon['Neighborhood Longitude'], latlon['Neighborhood'], latlon['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[(cluster-1)],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Final sorted dataframe (by Cluster Labels)

In [51]:
final_df = latlon.sort_values(by=['Cluster Labels'],ascending=True)
final_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster Labels,Likes
0,Riverdale,40.890834,-73.912585,0,0.0
28,Jackson Heights,40.751981,-73.882821,0,18.0
30,Richmond Hill,40.697947,-73.831833,0,21.0
32,Ridgewood,40.708323,-73.901435,0,13.0
34,Auburndale,40.76173,-73.791762,0,3.0
35,Little Neck,40.770826,-73.738898,0,9.0
36,Springfield Gardens,40.66623,-73.760421,0,8.0
37,Cambria Heights,40.692775,-73.735269,0,0.0
38,Steinway,40.775923,-73.90229,0,25.0
39,Beechhurst,40.792781,-73.804365,0,6.0
