# Capstone Project (*The Battle of Neighborhoods*)

In [9]:
import json
import os
import urllib.request
from urllib.request import urlopen

import folium
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

# Explore Dataset

In [10]:
# Dataset Downloaded from https://cocl.us/new_york_dataset
# JSON text is UTF-8; state the encoding explicitly so the read does not
# depend on the platform's default locale encoding.
with open('nyu_2451_34572-geojson.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

In [11]:
neighborhoods_data = newyork_data['features']
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Collect one plain record per feature and build the dataframe once at the end.
# Growing a DataFrame row by row with DataFrame.append is quadratic, and the
# method no longer exists in pandas >= 2.0.
neighborhood_records = []
for data in neighborhoods_data:
    borough = data['properties']['borough']
    neighborhood_name = data['properties']['name']

    # The coordinate pair is read as [longitude, latitude]: index 0 feeds the
    # Longitude column and index 1 the Latitude column.
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]

    neighborhood_records.append({'Borough': borough,
                                 'Neighborhood': neighborhood_name,
                                 'Latitude': neighborhood_lat,
                                 'Longitude': neighborhood_lon})

# Passing `columns` fixes the column order and yields the right (empty) schema
# even when there are no features at all.
neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

In [12]:
# Preview the first rows of the neighborhoods dataframe.
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [14]:
# Resolve the city name to a latitude/longitude pair used to centre the maps.
address = 'New York City, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the attribute access below.
    raise ValueError('No geocoding result returned for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [15]:
# Keep only the rows whose borough is Brooklyn and renumber them from zero.
brooklyn_data = (
    neighborhoods
    .loc[neighborhoods['Borough'] == 'Brooklyn']
    .reset_index(drop=True)
)
brooklyn_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Brooklyn,Bay Ridge,40.625801,-74.030621
1,Brooklyn,Bensonhurst,40.611009,-73.99518
2,Brooklyn,Sunset Park,40.645103,-74.010316
3,Brooklyn,Greenpoint,40.730201,-73.954241
4,Brooklyn,Gravesend,40.59526,-73.973471


In [16]:
# Base map centred on the geocoded city coordinates.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one blue circle per Brooklyn neighborhood; the popup reads
# "<neighborhood>, <borough>".
for lat, lng, borough, neighborhood in brooklyn_data[
        ['Latitude', 'Longitude', 'Borough', 'Neighborhood']].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a
        # marker option; kept as in the original call -- confirm it is needed.
        parse_html=False,
    )
    marker.add_to(map_newyork)

map_newyork

# Foursquare Venues

In [17]:
import urllib
def getNearbyVenues(names, latitudes, longitudes, radius=5000, categoryIds=''):
    """Search Foursquare for venues around each (name, latitude, longitude) triple.

    The three iterables are walked in parallel with ``zip``. For every triple
    one GET request is sent to the ``venues/search`` endpoint, using the
    module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Neighborhood labels and their coordinates.
    radius : int, optional
        Sent to the API as the ``radius`` query parameter (default 5000).
    categoryIds : str, optional
        When non-empty, sent as the ``categoryId`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue that has at least one named category, with
        the columns Neighborhood, Neighborhood Latitude, Neighborhood Longitude,
        Venue, Venue Latitude, Venue Longitude and Venue Category. When nothing
        is found (or the inputs are empty) the frame is empty but still has
        these columns.

    Raises
    ------
    RuntimeError
        If the service answers with a non-success HTTP status.
    KeyError
        If the JSON payload lacks the ``response``/``venues`` structure or a
        venue lacks its name/location fields.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    search_url = 'https://api.foursquare.com/v2/venues/search'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting
        # it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        if categoryIds != '':
            params['categoryId'] = categoryIds

        reply = requests.get(search_url, params=params, timeout=30)
        if not reply.ok:
            # Deliberately do not echo the URL: it carries the client secret.
            raise RuntimeError(
                'Foursquare venue search for {!r} failed with HTTP status {}'.format(
                    name, reply.status_code))
        results = reply.json()['response']['venues']

        for v in results:
            # Venues without a usable first category are skipped, as before.
            categories = v.get('categories') or []
            if not categories or 'name' not in categories[0]:
                continue
            venue_rows.append((
                name,
                lat,
                lng,
                v['name'],
                v['location']['lat'],
                v['location']['lng'],
                categories[0]['name'],
            ))

    # Supplying the columns here keeps the schema stable for an empty result.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [18]:
# Foursquare request settings.
LIMIT = 500
radius = 5000
# Credentials must not be committed with the notebook; read them from the
# environment (export FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET before
# starting the kernel). Any key that was previously published here should be
# treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')
VERSION = '20181020'

In [19]:
#https://developer.foursquare.com/docs/resources/categories
#Dance Studio = 4bf58dd8d48988d134941735
import urllib.request
# Narrow `neighborhoods` to Brooklyn (this rebinds the name), then look up
# venues in the dance-studio category within 1000 of each neighborhood.
neighborhoods = neighborhoods.loc[neighborhoods['Borough'] == 'Brooklyn'].reset_index(drop=True)
newyork_dance_studio = getNearbyVenues(
    names=neighborhoods['Neighborhood'],
    latitudes=neighborhoods['Latitude'],
    longitudes=neighborhoods['Longitude'],
    radius=1000,
    categoryIds='4bf58dd8d48988d134941735',
)
newyork_dance_studio.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bay Ridge,40.625801,-74.030621,Dimensions in dance,40.630644,-74.028088,Dance Studio
1,Bay Ridge,40.625801,-74.030621,Beyond Dance,40.622214,-74.031565,Dance Studio
2,Bay Ridge,40.625801,-74.030621,Beyond Dance,40.619341,-74.03612,Dance Studio
3,Bay Ridge,40.625801,-74.030621,Brooklyn Dance Center III,40.618322,-74.028902,Dance Studio
4,Bay Ridge,40.625801,-74.030621,Triple Essence Dance Studio,40.623343,-74.019051,Dance Studio


In [20]:
# (rows, columns) of the venue table returned by getNearbyVenues.
newyork_dance_studio.shape

(367, 7)

In [21]:
def addToMap(df, color, existingMap):
    """Add one circle marker per row of ``df`` to ``existingMap``.

    ``df`` must provide the columns 'Venue Latitude', 'Venue Longitude',
    'Neighborhood', 'Venue' and 'Venue Category'. Each marker is outlined and
    filled with ``color`` and its popup reads "<venue> (<category>) - <neighborhood>".
    Nothing is returned; the map is modified in place.
    """
    needed = ['Venue Latitude', 'Venue Longitude', 'Neighborhood', 'Venue', 'Venue Category']
    for v_lat, v_lng, hood, v_name, v_cat in df[needed].itertuples(index=False, name=None):
        popup = folium.Popup('{} ({}) - {}'.format(v_name, v_cat, hood), parse_html=True)
        marker = folium.CircleMarker(
            location=[v_lat, v_lng],
            radius=5,
            popup=popup,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
        )
        marker.add_to(existingMap)

In [22]:
# Fresh map centred on the city, with every found venue drawn in red.
map_newyork_dance_studio = folium.Map(zoom_start=10, location=[latitude, longitude])
addToMap(df=newyork_dance_studio, color='red', existingMap=map_newyork_dance_studio)

map_newyork_dance_studio

In [23]:
def addColumn(startDf, columnTitle, dataDf):
    """Write per-neighborhood venue counts from ``dataDf`` into ``startDf``.

    ``dataDf`` is grouped by its 'Neighborhood' column and the non-null
    'Venue' entries of each group are counted. The count is stored in
    ``startDf[columnTitle]`` for every row whose 'Neighborhood' matches;
    neighborhoods absent from ``dataDf`` get 0. The column holds integers.

    ``startDf`` is modified in place and nothing is returned.

    Raises
    ------
    KeyError
        If ``dataDf`` lacks the 'Neighborhood' or 'Venue' column, or
        ``startDf`` lacks 'Neighborhood'. (Previously such schema errors were
        swallowed and silently reported as zero counts.)
    """
    venue_counts = dataDf.groupby('Neighborhood')['Venue'].count()
    # map() leaves NaN for neighborhoods with no venues; those become 0.
    startDf[columnTitle] = (
        startDf['Neighborhood'].map(venue_counts).fillna(0).astype(int)
    )

In [24]:
# Number of non-null entries per column for each neighborhood.
brooklyn_grouped = newyork_dance_studio.groupby(by='Neighborhood').count()
brooklyn_grouped

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bath Beach,6,6,6,6,6,6
Bay Ridge,6,6,6,6,6,6
Bedford Stuyvesant,8,8,8,8,8,8
Bensonhurst,7,7,7,7,7,7
Bergen Beach,1,1,1,1,1,1
Boerum Hill,25,25,25,25,25,25
Borough Park,1,1,1,1,1,1
Brighton Beach,1,1,1,1,1,1
Broadway Junction,3,3,3,3,3,3
Brooklyn Heights,13,13,13,13,13,13


# Analyze Each Neighborhood

In [25]:
# Indicator columns, one per distinct venue category (no prefix on the names).
brooklyn_onehot = pd.get_dummies(
    newyork_dance_studio[['Venue Category']],
    prefix="",
    prefix_sep="",
)
brooklyn_onehot['Neighborhood'] = newyork_dance_studio['Neighborhood']

# Rotate the column order so the last column (the neighborhood name just
# added) comes first.
fixed_columns = brooklyn_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
brooklyn_onehot = brooklyn_onehot.loc[:, fixed_columns]

brooklyn_onehot.head()

Unnamed: 0,Neighborhood,Art Gallery,Country Dance Club,Dance Studio,Event Space,General Entertainment,Gymnastics Gym,Office,Performing Arts Venue,School,Yoga Studio
0,Bay Ridge,0,0,1,0,0,0,0,0,0,0
1,Bay Ridge,0,0,1,0,0,0,0,0,0,0
2,Bay Ridge,0,0,1,0,0,0,0,0,0,0
3,Bay Ridge,0,0,1,0,0,0,0,0,0,0
4,Bay Ridge,0,0,1,0,0,0,0,0,0,0


In [26]:
# Mean of each indicator column per neighborhood, i.e. the share of that
# neighborhood's venues falling in each category; 'Neighborhood' stays a column.
brooklyn_grouped = brooklyn_onehot.groupby('Neighborhood', as_index=False).mean()
brooklyn_grouped

Unnamed: 0,Neighborhood,Art Gallery,Country Dance Club,Dance Studio,Event Space,General Entertainment,Gymnastics Gym,Office,Performing Arts Venue,School,Yoga Studio
0,Bath Beach,0.000000,0.000,0.666667,0.000,0.000000,0.166667,0.000000,0.000000,0.000000,0.166667
1,Bay Ridge,0.000000,0.000,1.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,Bedford Stuyvesant,0.000000,0.125,0.875000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,Bensonhurst,0.000000,0.000,0.714286,0.000,0.000000,0.142857,0.000000,0.000000,0.000000,0.142857
4,Bergen Beach,0.000000,0.000,1.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,Boerum Hill,0.040000,0.000,0.960000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,Borough Park,0.000000,0.000,1.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,Brighton Beach,0.000000,0.000,1.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Broadway Junction,0.000000,0.000,1.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,Brooklyn Heights,0.076923,0.000,0.846154,0.000,0.000000,0.000000,0.000000,0.076923,0.000000,0.000000


In [27]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (callers pass rows whose first
    entry is the neighborhood name). The remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues`` are
    returned as an array; fewer are returned when the row is shorter.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [28]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']  # retained for any cell that still reads it


def _ordinal_suffix(rank):
    """Return the English ordinal suffix for a positive integer (1 -> 'st', 12 -> 'th', 23 -> 'rd')."""
    if 10 <= rank % 100 <= 20:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(rank % 10, 'th')


# Column headers: 'Neighborhood' followed by '1st Most Common Venue', ...
# Computing the suffix directly avoids relying on a bare except for control
# flow and stays correct for ranks such as 21 or 32.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    rank = int(ind) + 1
    columns.append('{}{} Most Common Venue'.format(rank, _ordinal_suffix(rank)))

# Rank the categories of every neighborhood. Rows are padded with NaN when a
# neighborhood table has fewer categories than num_top_venues, so the row
# length always matches the header.
ranked_rows = []
for ind in range(brooklyn_grouped.shape[0]):
    top_categories = list(return_most_common_venues(brooklyn_grouped.iloc[ind, :], num_top_venues))
    top_categories += [np.nan] * (num_top_venues - len(top_categories))
    ranked_rows.append(top_categories)

# Build the table in one go, then put the neighborhood name in front.
neighborhoods_venues_sorted = pd.DataFrame(ranked_rows, columns=columns[1:], index=brooklyn_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', brooklyn_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bath Beach,Dance Studio,Yoga Studio,Gymnastics Gym,School,Performing Arts Venue,Office,General Entertainment,Event Space,Country Dance Club,Art Gallery
1,Bay Ridge,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
2,Bedford Stuyvesant,Dance Studio,Country Dance Club,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Art Gallery
3,Bensonhurst,Dance Studio,Yoga Studio,Gymnastics Gym,School,Performing Arts Venue,Office,General Entertainment,Event Space,Country Dance Club,Art Gallery
4,Bergen Beach,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery


# Cluster Neighborhoods

In [29]:
kclusters = 5
# Feature matrix for clustering: every column except the neighborhood name.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
brooklyn_grouped_clustering = brooklyn_grouped.drop(columns='Neighborhood')
# n_init is pinned so the number of restarts (and hence the labels) does not
# change with the library's default; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(brooklyn_grouped_clustering)
kmeans.labels_[0:10]

array([2, 0, 3, 2, 0, 0, 0, 0, 0, 3])

In [30]:
# Attach the cluster label of each neighborhood as the first column.
# insert() raises if the column already exists, so drop a stale copy first;
# this lets the cell be re-run without restarting the kernel.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Left-join labels and ranked venues onto the Brooklyn neighborhoods by name.
# Neighborhoods with no venue rows get NaN, which is why the labels show up
# as floats in the merged frame.
brooklyn_merged = brooklyn_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
brooklyn_merged

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Brooklyn,Bay Ridge,40.625801,-74.030621,0.0,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
1,Brooklyn,Bensonhurst,40.611009,-73.995180,2.0,Dance Studio,Yoga Studio,Gymnastics Gym,School,Performing Arts Venue,Office,General Entertainment,Event Space,Country Dance Club,Art Gallery
2,Brooklyn,Sunset Park,40.645103,-74.010316,0.0,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
3,Brooklyn,Greenpoint,40.730201,-73.954241,4.0,Dance Studio,Office,Event Space,Yoga Studio,School,Performing Arts Venue,Gymnastics Gym,General Entertainment,Country Dance Club,Art Gallery
4,Brooklyn,Gravesend,40.595260,-73.973471,0.0,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
5,Brooklyn,Brighton Beach,40.576825,-73.965094,0.0,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
6,Brooklyn,Sheepshead Bay,40.586890,-73.943186,0.0,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
7,Brooklyn,Manhattan Terrace,40.614433,-73.957438,0.0,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
8,Brooklyn,Flatbush,40.636326,-73.958401,0.0,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
9,Brooklyn,Crown Heights,40.670829,-73.943291,0.0,Dance Studio,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery


## Cluster - 1

In [31]:
# Rows labelled 1; show the Neighborhood column (position 1) and every column
# from position 5 onward (the ranked venue categories).
brooklyn_merged[brooklyn_merged['Cluster Labels'] == 1].iloc[:, [1, *range(5, brooklyn_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Cobble Hill,Dance Studio,School,Art Gallery,Yoga Studio,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club
20,Carroll Gardens,Dance Studio,School,Art Gallery,Yoga Studio,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club


## Cluster - 2

In [32]:
# Rows labelled 2; show the Neighborhood column (position 1) and every column
# from position 5 onward (the ranked venue categories).
brooklyn_merged[brooklyn_merged['Cluster Labels'] == 2].iloc[:, [1, *range(5, brooklyn_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Bensonhurst,Dance Studio,Yoga Studio,Gymnastics Gym,School,Performing Arts Venue,Office,General Entertainment,Event Space,Country Dance Club,Art Gallery
33,Bath Beach,Dance Studio,Yoga Studio,Gymnastics Gym,School,Performing Arts Venue,Office,General Entertainment,Event Space,Country Dance Club,Art Gallery


## Cluster - 3

In [33]:
# Rows labelled 3; show the Neighborhood column (position 1) and every column
# from position 5 onward (the ranked venue categories).
brooklyn_merged[brooklyn_merged['Cluster Labels'] == 3].iloc[:, [1, *range(5, brooklyn_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Williamsburg,Dance Studio,General Entertainment,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,Event Space,Country Dance Club,Art Gallery
17,Bedford Stuyvesant,Dance Studio,Country Dance Club,Yoga Studio,School,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Art Gallery
18,Brooklyn Heights,Dance Studio,Performing Arts Venue,Art Gallery,Yoga Studio,School,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club
22,Gowanus,Dance Studio,School,Yoga Studio,Performing Arts Venue,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
40,Downtown,Dance Studio,Performing Arts Venue,Art Gallery,Yoga Studio,School,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club
50,North Side,Dance Studio,Office,General Entertainment,Yoga Studio,School,Performing Arts Venue,Gymnastics Gym,Event Space,Country Dance Club,Art Gallery
51,South Side,Dance Studio,Office,General Entertainment,Yoga Studio,School,Performing Arts Venue,Gymnastics Gym,Event Space,Country Dance Club,Art Gallery
61,Fulton Ferry,Dance Studio,Performing Arts Venue,Yoga Studio,School,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
62,Vinegar Hill,Dance Studio,Performing Arts Venue,Yoga Studio,School,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery
65,Dumbo,Dance Studio,Performing Arts Venue,Yoga Studio,School,Office,Gymnastics Gym,General Entertainment,Event Space,Country Dance Club,Art Gallery


## Cluster - 4

In [34]:
# Rows labelled 4; show the Neighborhood column (position 1) and every column
# from position 5 onward (the ranked venue categories).
brooklyn_merged[brooklyn_merged['Cluster Labels'] == 4].iloc[:, [1, *range(5, brooklyn_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Greenpoint,Dance Studio,Office,Event Space,Yoga Studio,School,Performing Arts Venue,Gymnastics Gym,General Entertainment,Country Dance Club,Art Gallery
