##  Project

### Import the required packages

In [1]:
# library to handle vetorized data
import numpy as np 

# library for data analsysis
import pandas as pd 

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

# library to handle JSON files
import json 

# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# library to handle requests
import requests 

# tranform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# map rendering library
import folium 

print('Libraries imported.')

Libraries imported.


### Load and explore the data

In [2]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [3]:
neighborhoods_data = newyork_data['features']

#### Create a dataframe of the data 

In [4]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

neighborhoods = pd.DataFrame(columns=column_names)

Then let's loop through the data and fill the dataframe one row at a time.

In [5]:
for data in neighborhoods_data:
    borough = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

##### A quick look at the data

In [6]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


### creating a new dataframe of the Manhattan data.

In [7]:
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)

#### let's have a look at the manhattan data

In [8]:
manhattan_data.head(10)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688
5,Manhattan,Manhattanville,40.816934,-73.957385
6,Manhattan,Central Harlem,40.815976,-73.943211
7,Manhattan,East Harlem,40.792249,-73.944182
8,Manhattan,Upper East Side,40.775639,-73.960508
9,Manhattan,Yorkville,40.77593,-73.947118


### Let's get the geographical coordinates of Manhattan.

In [9]:
address = 'Manhattan, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Manhattan are 40.7900869, -73.9598295.


### visualizing Manhattan and the neighborhoods in it.

In [10]:
# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        #parse_html=False
    ).add_to(map_manhattan)  
    
map_manhattan

### connecting to foursquare API

#### Define Foursquare Credentials and Version

In [40]:
CLIENT_ID = 'XXX' # masked for security purpose
CLIENT_SECRET = 'XXX' # masked for security purpose
VERSION = '20190130' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: XXX
CLIENT_SECRET:XXX


### Let's get the category id for the indian resturant in the Manhattan city

In [41]:
search_query = 'Indian'
radius = 5000
LIMIT=50
print(search_query + ' .... OK!')

Indian .... OK!


In [42]:
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius=5000&limit=50'.format(CLIENT_ID, CLIENT_SECRET, latitude,longitude,VERSION,search_query)
url

'https://api.foursquare.com/v2/venues/search?client_id=XXX&client_secret=XXX&ll=40.7900869,-73.9598295&v=20190130&query=Indian&radius=5000&limit=50'

In [14]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c52a29b1ed2193b477cfe58'},
 'response': {'venues': [{'id': '56ed855a498ef3bb022352c3',
    'name': 'mughlai Indian Cuisine',
    'location': {'address': '1724 2nd Ave',
     'crossStreet': '89th & 90th St',
     'lat': 40.78020580283837,
     'lng': -73.95008785684102,
     'labeledLatLngs': [{'label': 'display',
       'lat': 40.78020580283837,
       'lng': -73.95008785684102}],
     'distance': 1372,
     'postalCode': '10128',
     'cc': 'US',
     'city': 'New York',
     'state': 'NY',
     'country': 'United States',
     'formattedAddress': ['1724 2nd Ave (89th & 90th St)',
      'New York, NY 10128',
      'United States']},
    'categories': [{'id': '4bf58dd8d48988d10f941735',
      'name': 'Indian Restaurant',
      'pluralName': 'Indian Restaurants',
      'shortName': 'Indian',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/indian_',
       'suffix': '.png'},
      'primary': True}],
    'delivery': {'id': '32197

### Note : From the above result we can observe that the category id of the ondian resturant is 4bf58dd8d48988d1cb941735

### create url to find Indian Restaurants (categoryId = '4bf58dd8d48988d1cb941735')

In [15]:
LIMIT = 200
radius = 500
categoryId = '4bf58dd8d48988d1cb941735'

url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}&categoryId={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION,  radius, LIMIT,categoryId)


In [16]:
def getNearbyIRs(names, latitudes, longitudes, radius=1000):
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}&categoryId={}'.format(CLIENT_ID, CLIENT_SECRET, lat, lng, VERSION,  radius, LIMIT,categoryId)

        
        results = requests.get(url).json()["response"]['venues']
        
        venues_list.append([(
            name,
            v['id'],
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],
            v['categories'][0]['name'])for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Id',          
                  'Name', 
                  'Latitude', 
                  'Longitude',
                  'Category']
    
    return(nearby_venues)

In [17]:
manhattan_ind_rest = getNearbyIRs(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


In [18]:
manhattan_ind_rest.head(10)

Unnamed: 0,Neighborhood,Id,Name,Latitude,Longitude,Category
0,Marble Hill,5185d659498ea00e930bca91,Parada Tropical,40.872606,-73.905942,Food Truck
1,Marble Hill,5481e834498ee0762e4692ff,The Little Kitchen,40.87909,-73.909652,Food Truck
2,Marble Hill,4e64fa5462e13e3bcfb78506,Saber Halal Food,40.873554,-73.908325,Food Truck
3,Marble Hill,4d979c8d647d8cfa1f48da3d,Ice Cream Truck,40.87201,-73.918699,Food Truck
4,Marble Hill,4dc1b460d4c07bbdf77b84ed,Tony's Halal,40.87142,-73.898663,Food Truck
5,Marble Hill,51925621498ee1830045bbd1,Mexican Food Truck,40.871658,-73.897858,Food Truck
6,Marble Hill,5296c23511d28a5d3c197554,Monkey Brothers Lab,40.867402,-73.916902,Food Truck
7,Marble Hill,50649316e4b0add9c16a9d10,Taquerıa Garıbaldı,40.870649,-73.916099,Food Truck
8,Marble Hill,4ce4749fe571a0932eed8487,52st Street Food Truck,40.872131,-73.900867,Food Truck
9,Marble Hill,4fa807e0e4b038e6a43d4e02,Halal Food Cart,40.870188,-73.914629,Food Truck


### find Neighborhoods who have most Indian Restaurants 

In [19]:
ind_res_neigh=manhattan_ind_rest[['Id','Neighborhood']].groupby('Neighborhood').count()
ind_res_neigh = ind_res_neigh.sort_values(by=['Id'], ascending=False)
ind_res_neigh.reset_index(inplace=True)
ind_res_neigh.columns = ['Neighborhood','count']
ind_res_neigh

Unnamed: 0,Neighborhood,count
0,Battery Park City,50
1,Noho,50
2,Lower East Side,50
3,Manhattanville,50
4,Midtown,50
5,Midtown South,50
6,Morningside Heights,50
7,Murray Hill,50
8,Soho,50
9,Lincoln Square,50


### find charaçterisitcs of manhattan Neighborhood by using K means clustering

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


In [22]:
print(manhattan_venues.shape)
manhattan_venues.head()

(3311, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,40.876551,-73.91066,Land & Sea Restaurant,40.877885,-73.905873,Seafood Restaurant


### Data preprocessing for clustering

In [23]:
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Arcade,Arepa Restaurant,...,Volleyball Court,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()

In [25]:
manhattan_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Arcade,Arepa Restaurant,...,Volleyball Court,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Battery Park City,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.03,0.0,0.02,0.0
1,Carnegie Hill,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.03,0.0,0.01,0.03
2,Central Harlem,0.0,0.0,0.0,0.065217,0.043478,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Chelsea,0.0,0.0,0.0,0.0,0.04,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0
4,Chinatown,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01


### function to get most common venues

In [26]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [27]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(5)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Battery Park City,Coffee Shop,Park,Hotel,Italian Restaurant,Wine Shop,Cupcake Shop,Memorial Site,Food Court,BBQ Joint,Department Store
1,Carnegie Hill,Pizza Place,Cosmetics Shop,Coffee Shop,Café,Spa,Japanese Restaurant,Gym,French Restaurant,Yoga Studio,Bookstore
2,Central Harlem,African Restaurant,Seafood Restaurant,American Restaurant,Gym / Fitness Center,Chinese Restaurant,Cosmetics Shop,French Restaurant,Fried Chicken Joint,Music Venue,Bookstore
3,Chelsea,Coffee Shop,Italian Restaurant,Ice Cream Shop,American Restaurant,Nightclub,Bakery,Seafood Restaurant,Theater,Art Gallery,Hotel
4,Chinatown,Chinese Restaurant,Bubble Tea Shop,Dim Sum Restaurant,American Restaurant,Vietnamese Restaurant,Cocktail Bar,Hotpot Restaurant,Salon / Barbershop,Bakery,Noodle House


In [28]:
kclusters = 4

manhattan_grouped_clustering = manhattan_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 3, 1, 3, 1, 1, 1, 0, 3, 1])

In [29]:
manhattan_merged = manhattan_data

# add clustering labels
manhattan_merged['Cluster Labels'] = kmeans.labels_

# merge manhattan_merged with neighborhoods_venues_sorted to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')


### Add the cluster id to the dataframe

In [30]:
manhattan_clusters=manhattan_merged[['Neighborhood','Cluster Labels']]

In [31]:
manhattan_clusters.head()

Unnamed: 0,Neighborhood,Cluster Labels
0,Marble Hill,1
1,Chinatown,3
2,Washington Heights,1
3,Inwood,3
4,Hamilton Heights,1


#### View Indian resturant neighborhoods

In [32]:
ind_res_neigh

Unnamed: 0,Neighborhood,count
0,Battery Park City,50
1,Noho,50
2,Lower East Side,50
3,Manhattanville,50
4,Midtown,50
5,Midtown South,50
6,Morningside Heights,50
7,Murray Hill,50
8,Soho,50
9,Lincoln Square,50


### using merge find the cluster labels of indian restaurant neighborhoods

In [33]:
ind_res_cluster = pd.merge(ind_res_neigh,manhattan_clusters,how='inner',on='Neighborhood')

In [34]:
ind_res_cluster.head()

Unnamed: 0,Neighborhood,count,Cluster Labels
0,Battery Park City,50,1
1,Noho,50,3
2,Lower East Side,50,3
3,Manhattanville,50,1
4,Midtown,50,0


### we observe that the count 50 is high ie,most of the indian resturants are located here. get the neihbiurhoods with count 50

In [35]:
top_indian_resturant = ind_res_cluster.loc[ind_res_cluster['count'] == 50]

In [36]:
top_indian_resturant.head()

Unnamed: 0,Neighborhood,count,Cluster Labels
0,Battery Park City,50,1
1,Noho,50,3
2,Lower East Side,50,3
3,Manhattanville,50,1
4,Midtown,50,0


In [37]:
neigh_with_more_indian_res = top_indian_resturant[['Neighborhood','Cluster Labels']].groupby('Cluster Labels').count()

In [38]:
neigh_with_more_indian_res

Unnamed: 0_level_0,Neighborhood
Cluster Labels,Unnamed: 1_level_1
0,4
1,9
3,15


### Neighbourhood suitable for opening an Indian resturant 

#### we can observe that that most of the indian resturants are located in cluster 3.list all the neighborhood in cluster 3

In [39]:
#list all the neighborhoods in cluster 3
neigh_suited_for_opening_res = manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 3, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]
neigh_suited_for_opening_res['Neighborhood'] 

1               Chinatown
3                  Inwood
8         Upper East Side
10             Lenox Hill
11       Roosevelt Island
12        Upper West Side
16            Murray Hill
18      Greenwich Village
20        Lower East Side
22           Little Italy
26    Morningside Heights
27               Gramercy
31                   Noho
34           Sutton Place
35             Turtle Bay
36             Tudor City
38               Flatiron
39           Hudson Yards
Name: Neighborhood, dtype: object