In [1]:
# The first step is the installation of the libraries that are necessary

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
#import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  24.46 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  37.33 MB/s
Libraries imported.


In [2]:
#In the next step loading neighourhood data from New York

!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
    
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
    
    
neighborhoods_data = newyork_data['features']

# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
    


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585
5,Bronx,Kingsbridge,40.881687,-73.902818
6,Manhattan,Marble Hill,40.876551,-73.91066
7,Bronx,Woodlawn,40.898273,-73.867315
8,Bronx,Norwood,40.877224,-73.879391
9,Bronx,Williamsbridge,40.881039,-73.857446


In [3]:
# Input FourSquare Credentials and Version
import os

# The credentials are read from the environment so that they are never stored
# in (or shared with) the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded here has been exposed
# and should be rotated in the Foursquare developer console.
try:
    CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
    CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
except KeyError as missing:
    raise RuntimeError(
        'Missing Foursquare credential: set the {} environment variable '
        'before running this notebook.'.format(missing)
    ) from missing

VERSION = '20180605' # Foursquare API version
LIMIT = 100 # value sent as the `limit` query parameter of each request

In [4]:
# Getting venues for each neighbourhood from foursquare

def _fetch_venue_items(lat, lng, radius, timeout=30):
    """Call the Foursquare `venues/explore` endpoint for one location.

    Uses the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    lat, lng : latitude and longitude of the search centre.
    radius : value sent as the `radius` query parameter.
    timeout : seconds to wait for the server before giving up.

    Returns
    -------
    list
        The `items` of the first response group, or an empty list when the
        response contains no groups.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        # let requests build and escape the query string
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=timeout,
    )
    # fail with the HTTP error instead of an opaque KeyError further down
    response.raise_for_status()

    groups = response.json()['response'].get('groups') or []
    return groups[0]['items'] if groups else []


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues found around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : parallel iterables giving the name and the
        coordinates of every neighbourhood to query.
    radius : value sent as the `radius` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue_Category'. The frame
        is empty, but still has these columns, when no venue is found.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue_Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # keep only the relevant information for each nearby venue
        for item in _fetch_venue_items(lat, lng, radius):
            venue = item['venue']
            # a venue may carry no category; record it as missing
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # columns= keeps the schema stable even when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

# Query the venues around every neighbourhood, using the default radius
nyc_venues = getNearbyVenues(
    names=neighborhoods['Neighborhood'],
    latitudes=neighborhoods['Latitude'],
    longitudes=neighborhoods['Longitude'],
)

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [34]:
# Keep only the venues whose category is a coffee shop

Category = ['Coffee Shop']

# boolean-mask selection: rows whose Venue_Category is listed in Category
nyc_cafes = nyc_venues.loc[nyc_venues['Venue_Category'].isin(Category)]




In [39]:
# Count the coffee shops found in each neighbourhood

# one indicator column per venue category, named after the bare category
nyc_cafes_onehot = pd.get_dummies(
    nyc_cafes[['Venue_Category']],
    prefix="",
    prefix_sep="",
)

# re-attach the neighbourhood every venue belongs to
nyc_cafes_onehot['Neighborhood'] = nyc_cafes['Neighborhood']

# rotate the just-added last column to the front
fixed_columns = list(nyc_cafes_onehot.columns[-1:]) + list(nyc_cafes_onehot.columns[:-1])
nyc_cafes_onehot = nyc_cafes_onehot[fixed_columns]

# summing the indicator columns yields the per-neighbourhood totals
nyc_cafe_grouped = nyc_cafes_onehot.groupby('Neighborhood', as_index=False).sum()

nyc_cafe_grouped

Unnamed: 0,Neighborhood,Coffee Shop
0,Arden Heights,1
1,Arlington,1
2,Astoria,2
3,Bath Beach,1
4,Battery Park City,8
5,Bay Terrace,1
6,Bedford Stuyvesant,3
7,Bellaire,1
8,Belmont,1
9,Bensonhurst,1


In [42]:
# process kmeans clustering
kclusters = 7

# Cluster on the count columns only. The axis is passed by keyword because
# current pandas rejects it as a positional argument. A 'Cluster_Labels'
# column, present when the labelling cell has already written into this
# dataframe, is dropped as well so that re-running this cell gives the same
# result.
nyc_cafe_clustering = (
    nyc_cafe_grouped
    .drop('Neighborhood', axis=1)
    .drop('Cluster_Labels', axis=1, errors='ignore')
)

# run k-means clustering
# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version (the default changed from 10 to 'auto').
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(nyc_cafe_clustering)

labels = kmeans.labels_

print(labels)

[0 0 3 0 4 0 5 0 0 0 6 0 3 0 3 1 6 1 0 2 3 0 5 6 1 0 0 0 0 0 0 5 2 0 0 3 6
 3 0 4 3 3 0 5 3 5 1 6 3 1 0 0 6 3 0 0 3 0 0 3 2 3 0 5 2 5 0 5 0 3 1 6 6 0
 2 0 1 0 6 0 0 0 1 0 3 0 0 0 0 0 0 5 1 0 3 5 0 0 5 0 1 1 5 0 3 0 5 0 3 3 0
 0 0 2]


In [50]:
# add clustering labels
# assign() returns a new dataframe, so nyc_cafe_grouped is left untouched and
# the clustering cell can be re-run without picking up the label column.
nyc_cafe_merged = nyc_cafe_grouped.assign(Cluster_Labels=kmeans.labels_)

# neighbourhoods that were assigned to cluster 6
nyc_cafe_merged.loc[nyc_cafe_merged['Cluster_Labels'] == 6]


           Neighborhood  Coffee Shop  Cluster Labels  Cluster_Labels
10          Boerum Hill            5               6               6
16        Carnegie Hill            5               6               6
23              Clinton            5               6               6
36    East Williamsburg            5               6               6
47           Greenpoint            5               6               6
52         Hudson Yards            5               6               6
71        Midtown South            5               6               6
72  Morningside Heights            5               6               6
78           North Side            5               6               6
