In [2]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files

In [3]:
# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

In [6]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    ------------------------------------------------------------
                       

In [34]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


In [35]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [36]:
neighborhoods_data = newyork_data['features']
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [37]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [38]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [39]:
neighborhoods.to_csv('BON1_NYC_GEO.csv',index=False)
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [40]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [41]:
# create map of Toronto using latitude and longitude values
map_NewYork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NewYork)  
    
map_NewYork

## Web scrapping of Population data from wikipedia page using BeautifulSoup.

In [13]:
# conda install -c anaconda beautiful-soup --yes
from bs4 import BeautifulSoup # package for parsing HTML and XML documents
import csv # implements classes to read and write tabular data in CSV form

website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})

i=0
headers = []
for header in table.find_all('th'):
   if (i<6) :
      i = i + 1
      continue
   headers.append(header.text)

print(headers)

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   #row = row.replace(r'\n', ' ')
   rows.append(row)


with open('BON2_POPULATION1.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)


['Borough', 'County', 'Estimate (2019)[12]', 'billions(US$)[13]', 'per capita(US$)', 'square miles', 'squarekm', 'persons / sq. mi', 'persons /km2\n']


In [14]:
Pop_data=pd.read_csv('BON2_POPULATION1.csv')
Pop_data.drop(Pop_data.columns[[3,4]], axis=1,inplace=True)
print('Data downloaded!')
Pop_data.head()

Data downloaded!


Unnamed: 0,Borough,County,Estimate (2019)[12],square miles,squarekm,persons / sq. mi,persons /km2
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.10\n,109.04\n,"33,867\n","13,006\n"
1,Brooklyn\n,\n Kings\n,"2,559,903\n",70.82\n,183.42\n,"36,147\n","13,957\n"
2,Manhattan\n,\n New York\n,"1,628,706\n",22.83\n,59.13\n,"71,341\n","27,544\n"
3,Queens\n,\n Queens\n,"2,253,858\n",108.53\n,281.09\n,"20,767\n","8,018\n"
4,Staten Island\n,\n Richmond\n,"476,143\n",58.37\n,151.18\n,"8,157\n","3,150\n"


In [15]:
print('Remove whitespaces and rename columns')
Pop_data.columns = Pop_data.columns.str.replace(' ', '')
Pop_data.columns = Pop_data.columns.str.replace(r'\n', ' ')
Pop_data

Remove whitespaces and rename columns


Unnamed: 0,Borough,County,Estimate(2019)[12],squaremiles,squarekm,persons/sq.mi,persons/km2
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.10\n,109.04\n,"33,867\n","13,006\n"
1,Brooklyn\n,\n Kings\n,"2,559,903\n",70.82\n,183.42\n,"36,147\n","13,957\n"
2,Manhattan\n,\n New York\n,"1,628,706\n",22.83\n,59.13\n,"71,341\n","27,544\n"
3,Queens\n,\n Queens\n,"2,253,858\n",108.53\n,281.09\n,"20,767\n","8,018\n"
4,Staten Island\n,\n Richmond\n,"476,143\n",58.37\n,151.18\n,"8,157\n","3,150\n"
5,City of New York,8336817,842.343,783.83,27547,"10,636\n",
6,State of New York,19453561,1731.910,122284,412,159\n,
7,Sources:[14] and see individual borough articl...,,,,,,


In [16]:
Pop_data.rename(columns = {'Estimate(2019)[12]':'Estimate_2019', 
                   'squaremiles':'square_miles',
                    'squarekm':'square_km',
                    'persons/sq.mi':'persons_sq_mi',
                    'persons/km2 ':'persons_sq_km'}, inplace=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2019,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.10\n,109.04\n,"33,867\n","13,006\n"
1,Brooklyn\n,\n Kings\n,"2,559,903\n",70.82\n,183.42\n,"36,147\n","13,957\n"
2,Manhattan\n,\n New York\n,"1,628,706\n",22.83\n,59.13\n,"71,341\n","27,544\n"
3,Queens\n,\n Queens\n,"2,253,858\n",108.53\n,281.09\n,"20,767\n","8,018\n"
4,Staten Island\n,\n Richmond\n,"476,143\n",58.37\n,151.18\n,"8,157\n","3,150\n"
5,City of New York,8336817,842.343,783.83,27547,"10,636\n",
6,State of New York,19453561,1731.910,122284,412,159\n,
7,Sources:[14] and see individual borough articl...,,,,,,


In [17]:
Pop_data['Borough']=Pop_data['Borough'].replace(to_replace='\n', value='', regex=True)
Pop_data['County']=Pop_data['County'].replace(to_replace='\n', value='', regex=True)
Pop_data['Estimate_2019']=Pop_data['Estimate_2019'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_miles']=Pop_data['square_miles'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_km']=Pop_data['square_km'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_mi']=Pop_data['persons_sq_mi'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_km']=Pop_data['persons_sq_km'].replace(to_replace='\n', value='', regex=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2019,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1418207.0,42.1,109.04,33867.0,13006.0
1,Brooklyn,Kings,2559903.0,70.82,183.42,36147.0,13957.0
2,Manhattan,New York,1628706.0,22.83,59.13,71341.0,27544.0
3,Queens,Queens,2253858.0,108.53,281.09,20767.0,8018.0
4,Staten Island,Richmond,476143.0,58.37,151.18,8157.0,3150.0
5,City of New York,8336817,842.343,783.83,27547.0,10636.0,
6,State of New York,19453561,1731.91,122284.0,412.0,159.0,
7,Sources:[14] and see individual borough articles,,,,,,


In [104]:
Pop_data.loc[5:,['persons_sq_mi','persons_sq_km']] = Pop_data.loc[2:,['persons_sq_mi','persons_sq_km']].shift(1,axis=1)
Pop_data.loc[5:,['square_km','persons_sq_mi']] = Pop_data.loc[2:,['square_km','persons_sq_mi']].shift(1,axis=1)
Pop_data.loc[5:,['square_miles','square_km']] = Pop_data.loc[2:,['square_miles','square_km']].shift(1,axis=1)
Pop_data.loc[5:,['Estimate_2019','square_miles']] = Pop_data.loc[2:,['Estimate_2019','square_miles']].shift(1,axis=1)
Pop_data.loc[5:,['County','Estimate_2019']] = Pop_data.loc[2:,['County','Estimate_2019']].shift(1,axis=1)
Pop_data.loc[5:,['Borough','County']] = Pop_data.loc[2:,['Borough','County']].shift(1,axis=1)
Pop_data

Unnamed: 0,Borough,County,Estimate_2019,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1418207.0,42.1,109.04,33867.0,13006.0
1,Brooklyn,Kings,2559903.0,70.82,183.42,36147.0,13957.0
2,Manhattan,New York,1628706.0,22.83,59.13,71341.0,27544.0
3,Queens,Queens,2253858.0,108.53,281.09,20767.0,8018.0
4,Staten Island,Richmond,476143.0,58.37,151.18,8157.0,3150.0
5,,City of New York,8336817.0,842.343,,,783.83
6,,State of New York,19453561.0,1731.91,,,122284.0
7,,Sources:[14] and see individual borough articles,,,,,


In [18]:
Pop_data = Pop_data.fillna('')
Pop_data

Unnamed: 0,Borough,County,Estimate_2019,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1418207.0,42.1,109.04,33867.0,13006.0
1,Brooklyn,Kings,2559903.0,70.82,183.42,36147.0,13957.0
2,Manhattan,New York,1628706.0,22.83,59.13,71341.0,27544.0
3,Queens,Queens,2253858.0,108.53,281.09,20767.0,8018.0
4,Staten Island,Richmond,476143.0,58.37,151.18,8157.0,3150.0
5,City of New York,8336817,842.343,783.83,27547.0,10636.0,
6,State of New York,19453561,1731.91,122284.0,412.0,159.0,
7,Sources:[14] and see individual borough articles,,,,,,


In [20]:
print('drop the last row')
i = Pop_data[((Pop_data.Borough == 'Sources:[14] and see individual borough articles'))].index
Pop_data.drop(i)

drop the last row


Unnamed: 0,Borough,County,Estimate_2019,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1418207.0,42.1,109.04,33867,13006.0
1,Brooklyn,Kings,2559903.0,70.82,183.42,36147,13957.0
2,Manhattan,New York,1628706.0,22.83,59.13,71341,27544.0
3,Queens,Queens,2253858.0,108.53,281.09,20767,8018.0
4,Staten Island,Richmond,476143.0,58.37,151.18,8157,3150.0
5,City of New York,8336817,842.343,783.83,27547.0,10636,
6,State of New York,19453561,1731.91,122284.0,412.0,159,


In [21]:
Pop_data.to_csv('BON2_POPULATION.csv',index=False)

## Explore Neighborhoods in Brooklyn and Manhattan

In [42]:
NYC_Geo=pd.read_csv('BON1_NYC_GEO.csv')
print('Data downloaded!')

Data downloaded!


In [43]:
NYC_Geo.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [44]:
NYC_Geo['Borough'].value_counts().to_frame()

Unnamed: 0,Borough
Queens,81
Brooklyn,70
Staten Island,63
Bronx,52
Manhattan,40


In [45]:
NYC_Geo.shape

(306, 4)

In [46]:
print(NYC_Geo.Borough.unique())

['Bronx' 'Manhattan' 'Brooklyn' 'Queens' 'Staten Island']


In [47]:
NYC_Geo.isnull().sum()

Borough         0
Neighborhood    0
Latitude        0
Longitude       0
dtype: int64

## Segmenting and Clustering Neighborhoods - Brooklyn and Manhattan 

In [48]:
BM_Geo = NYC_Geo.loc[(NYC_Geo['Borough'] == 'Brooklyn')|(NYC_Geo['Borough'] == 'Manhattan')]
BM_Geo = BM_Geo.reset_index(drop=True)
BM_Geo.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Brooklyn,Bay Ridge,40.625801,-74.030621
2,Brooklyn,Bensonhurst,40.611009,-73.99518
3,Brooklyn,Sunset Park,40.645103,-74.010316
4,Brooklyn,Greenpoint,40.730201,-73.954241


In [49]:
BM_Geo.shape

(110, 4)

## Use geopy library to get the latitude and longitude values of New York City 

In [50]:
import time
start_time = time.time()

address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

print("--- %s seconds ---" % round((time.time() - start_time), 2))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.
--- 1.23 seconds ---


## Create a map of Brooklyn and Manhattan with neighborhoods superimposed on top. 

In [51]:

# create map of Toronto using latitude and longitude values
map_BM = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(BM_Geo['Latitude'], BM_Geo['Longitude'], BM_Geo['Borough'], BM_Geo['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_BM)  
    
map_BM

## Define Foursquare Credentials and Version 

In [52]:
CLIENT_ID = 'OLN1BAQQBHO234LKFIU1ZNGV4Z3O3P1GS5KIMTNPJHLX1MKL' # your Foursquare ID
CLIENT_SECRET = 'VDM5CGGVSUOGKMY21ETO4J1UAJH5QJEALQCJAIWUF2DJXR2T' # your Foursquare Secret
VERSION = '20181218' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OLN1BAQQBHO234LKFIU1ZNGV4Z3O3P1GS5KIMTNPJHLX1MKL
CLIENT_SECRET:VDM5CGGVSUOGKMY21ETO4J1UAJH5QJEALQCJAIWUF2DJXR2T


### Extract Venues data for each neighborhoods in Brooklyn 

In [101]:
def getNearbyVets(names, latitudes, longitudes, radius, LIMIT):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&query={}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            'veterinarian',
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()
        #venues  = results['response']['groups'][0]['items'][0]['venue']
        venues  = results['response']['groups'][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in venues])
        print(venues_list)
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [124]:
BM_venues = getNearbyVets(['New York'],['40.7127281'],['-74.0060152'],10000,LIMIT=200)

print('The "BM_venues" dataframe has {} venues and {} unique venue types.'.format(
      len(BM_venues['Venue Category']),
      len(BM_venues['Venue Category'].unique())))

BM_venues.to_csv('BM_venues.csv', sep=',', encoding='UTF8')
BM_venues.head()

New York
[[('New York', '40.7127281', '-74.0060152', 'Seaport Animal Hospital', 40.708821, -74.003925, 'Veterinarian'), ('New York', '40.7127281', '-74.0060152', 'West Village Veterinary Hospital', 40.7396051177395, -74.00296437301637, 'Veterinarian'), ('New York', '40.7127281', '-74.0060152', 'Hope Veterinary Clinic', 40.686878, -73.985526, 'Veterinarian'), ('New York', '40.7127281', '-74.0060152', 'Pure Paws Veterinary Care of Clinton Hill', 40.68252106177534, -73.96301800796925, 'Veterinarian'), ('New York', '40.7127281', '-74.0060152', 'Eastside Animal Hospital', 40.755886, -73.966446, 'Veterinarian'), ('New York', '40.7127281', '-74.0060152', 'All Creatures Veterinary Hospital', 40.678102, -73.963604, 'Veterinarian'), ('New York', '40.7127281', '-74.0060152', "Pure Paws Veterinary Care of Hell's Kitchen", 40.7596826, -73.996128, 'Veterinarian'), ('New York', '40.7127281', '-74.0060152', 'Secaucus Animal Hospital', 40.77102, -74.06567, 'Veterinarian'), ('New York', '40.7127281', '-

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New York,40.7127281,-74.0060152,Seaport Animal Hospital,40.708821,-74.003925,Veterinarian
1,New York,40.7127281,-74.0060152,West Village Veterinary Hospital,40.739605,-74.002964,Veterinarian
2,New York,40.7127281,-74.0060152,Hope Veterinary Clinic,40.686878,-73.985526,Veterinarian
3,New York,40.7127281,-74.0060152,Pure Paws Veterinary Care of Clinton Hill,40.682521,-73.963018,Veterinarian
4,New York,40.7127281,-74.0060152,Eastside Animal Hospital,40.755886,-73.966446,Veterinarian


### Run the above function on each neighborhood and create a new dataframe called BM_venues 

In [125]:
BM_venues.rename(columns = {'Neighborhood Latitude': 'NeighLatitude',
                    'Neighborhood Longitude': 'NeighLongitude',
                    'Venue Latitude':'VenLatitude', 
                   'Venue Longitude':'VenLongitude',
                    'Venue Category':'VenCategory'}, inplace=True)
BM_venues

Unnamed: 0,Neighborhood,NeighLatitude,NeighLongitude,Venue,VenLatitude,VenLongitude,VenCategory
0,New York,40.7127281,-74.0060152,Seaport Animal Hospital,40.708821,-74.003925,Veterinarian
1,New York,40.7127281,-74.0060152,West Village Veterinary Hospital,40.739605,-74.002964,Veterinarian
2,New York,40.7127281,-74.0060152,Hope Veterinary Clinic,40.686878,-73.985526,Veterinarian
3,New York,40.7127281,-74.0060152,Pure Paws Veterinary Care of Clinton Hill,40.682521,-73.963018,Veterinarian
4,New York,40.7127281,-74.0060152,Eastside Animal Hospital,40.755886,-73.966446,Veterinarian
5,New York,40.7127281,-74.0060152,All Creatures Veterinary Hospital,40.678102,-73.963604,Veterinarian
6,New York,40.7127281,-74.0060152,Pure Paws Veterinary Care of Hell's Kitchen,40.759683,-73.996128,Veterinarian
7,New York,40.7127281,-74.0060152,Secaucus Animal Hospital,40.77102,-74.06567,Veterinarian
8,New York,40.7127281,-74.0060152,Pet Haven Animal Hospital,40.647007,-73.980315,Veterinarian
9,New York,40.7127281,-74.0060152,Banfield Pet Hospital,40.689081,-73.991341,Veterinarian


In [112]:
latitude = 40.7127281
longitude = -74.0060152

venues_map = folium.Map(location=[latitude, longitude], zoom_start=13) # generate map centred around the Conrad Hotel

# add a red circle marker to represent the Conrad Hotel
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='New York',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the Veterinarians as blue circle markers
for lat, lng, label in zip(BM_venues.VenLatitude, BM_venues.VenLongitude, BM_venues.VenCategory):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map

## Cluster Neighborhoods and Examine Clusters 

### First, let's determine the optimal value of K for our dataset using the Silhouette Coefficient Method 

In [126]:
#BM_venues.groupby('Venue').count()
#BM_venues = BM_venues.drop('Neighborhood', 1)
BM_venues.drop(BM_venues.columns[[0,1,2,6]], axis=1,inplace=True)
BM_venues

Unnamed: 0,Venue,VenLatitude,VenLongitude
0,Seaport Animal Hospital,40.708821,-74.003925
1,West Village Veterinary Hospital,40.739605,-74.002964
2,Hope Veterinary Clinic,40.686878,-73.985526
3,Pure Paws Veterinary Care of Clinton Hill,40.682521,-73.963018
4,Eastside Animal Hospital,40.755886,-73.966446
5,All Creatures Veterinary Hospital,40.678102,-73.963604
6,Pure Paws Veterinary Care of Hell's Kitchen,40.759683,-73.996128
7,Secaucus Animal Hospital,40.77102,-74.06567
8,Pet Haven Animal Hospital,40.647007,-73.980315
9,Banfield Pet Hospital,40.689081,-73.991341


In [127]:
BM_grouped_clustering = BM_venues.drop('Venue', 1)

for n_cluster in range(2, 10):
    kmeans = KMeans(n_clusters=n_cluster).fit(BM_grouped_clustering)
    label = kmeans.labels_
    sil_coeff = silhouette_score(BM_grouped_clustering, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

For n_clusters=2, The Silhouette Coefficient is 0.47589810209839606
For n_clusters=3, The Silhouette Coefficient is 0.5281169616187027
For n_clusters=4, The Silhouette Coefficient is 0.4602634502830808
For n_clusters=5, The Silhouette Coefficient is 0.42237012780420613
For n_clusters=6, The Silhouette Coefficient is 0.4456025495948846
For n_clusters=7, The Silhouette Coefficient is 0.43976662936557503
For n_clusters=8, The Silhouette Coefficient is 0.38599593740775595
For n_clusters=9, The Silhouette Coefficient is 0.4177674102459581


#### As we can see, n_clusters=3 has highest Silhouette Coefficient. This means that 3 should be the optimal number of clusters.
#### For n_clusters=3, The Silhouette Coefficient is 0.5281169616187027

#### Run k-means to cluster the neighborhood into 3 clusters. 

In [128]:
# set number of clusters
kclusters = 3

BM_grouped_clustering = BM_venues.drop('Venue', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(BM_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 1, 1, 0, 1, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0,
       2, 0, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 2, 0, 1, 2, 0, 0, 0, 1, 2, 2,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 2, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 2, 1, 0, 2, 1, 0, 0, 0, 2], dtype=int32)

In [130]:
BQS_results = pd.DataFrame(kmeans.cluster_centers_)
BQS_results.columns = BM_grouped_clustering.columns
BQS_results.index = ['cluster0','cluster1','cluster2']
BQS_results['Total Sum'] = BQS_results.sum(axis = 1)
BQS_results

Unnamed: 0,VenLatitude,VenLongitude,Total Sum
cluster0,40.73678,-73.985829,-33.249049
cluster1,40.678269,-73.98265,-33.304381
cluster2,40.744698,-74.044589,-33.299891


### The Total and Total Sum of cluster0 has smallest value. It shows that the market is not saturated. 

In [153]:
latitude = 40.7127281
longitude = -74.0060152

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**3 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 8,len(ys)))
#rainbow = [colors.rgb2hex(i) for i in colors_array]
rainbow = ['red','blue','green']
BM_venues = BM_venues.assign(Cluster_Labels = kmeans.labels_)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(BM_venues['VenLatitude'], BM_venues['VenLongitude'], BM_venues['Venue'], BM_venues['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters