# This is the notebook for the capstone project

In [92]:
import pandas as pd
import numpy as np
import folium
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
import json
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

### Read html of list of postal codes of Canada to DataFrame

In [93]:
data_link = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json'
x = requests.get(data_link)
response = x.json()

In [94]:
features = response['features']
print(features[0])
col_features = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
df = pd.DataFrame(columns = col_features)

{'type': 'Feature', 'id': 'nyu_2451_34572.1', 'geometry': {'type': 'Point', 'coordinates': [-73.84720052054902, 40.89470517661]}, 'geometry_name': 'geom', 'properties': {'name': 'Wakefield', 'stacked': 1, 'annoline1': 'Wakefield', 'annoline2': None, 'annoline3': None, 'annoangle': 0.0, 'borough': 'Bronx', 'bbox': [-73.84720052054902, 40.89470517661, -73.84720052054902, 40.89470517661]}}


In [95]:
for data in features:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    df = df.append({'Borough': borough,
                  'Neighborhood': neighborhood_name,
                  'Latitude': neighborhood_lat,
                  'Longitude': neighborhood_lon}, ignore_index=True)

In [96]:
df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [97]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=10)

# add markers to df
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [98]:
CLIENT_ID = 'KUNJIZC4CDXDWKAWSZAJZHI45WCOSUWR3SCAGUROE5KYVW0Y' # your Foursquare ID
CLIENT_SECRET = 'PL2UCSV2WZHP2PHOR5HCRCZO1LTYNMVNYOPPVIVQODJOTAKY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: KUNJIZC4CDXDWKAWSZAJZHI45WCOSUWR3SCAGUROE5KYVW0Y
CLIENT_SECRET:PL2UCSV2WZHP2PHOR5HCRCZO1LTYNMVNYOPPVIVQODJOTAKY


In [99]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=KUNJIZC4CDXDWKAWSZAJZHI45WCOSUWR3SCAGUROE5KYVW0Y&client_secret=PL2UCSV2WZHP2PHOR5HCRCZO1LTYNMVNYOPPVIVQODJOTAKY&v=20180605&ll=40.89470517661,-73.84720052054902&radius=500&limit=100'

In [100]:
results = requests.get(url).json()
results['meta']

{'code': 200, 'requestId': '5f9b4bb1a97868185a7a1b69'}

In [185]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        print(categories_list[0]['name'])
        return categories_list[0]['name']

In [188]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Dessert Shop
Pharmacy
Ice Cream Shop
Pharmacy
Donut Shop
Gas Station
Sandwich Place
Deli / Bodega
Pizza Place
Laundromat
Pharmacy          2
Deli / Bodega     1
Dessert Shop      1
Gas Station       1
Sandwich Place    1
Ice Cream Shop    1
Laundromat        1
Pizza Place       1
Donut Shop        1
Name: categories, dtype: int64


  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,Lollipops Gelato,Dessert Shop,40.894123,-73.845892
1,Rite Aid,Pharmacy,40.896649,-73.844846
2,Carvel Ice Cream,Ice Cream Shop,40.890487,-73.848568
3,Walgreens,Pharmacy,40.896528,-73.8447
4,Dunkin',Donut Shop,40.890459,-73.849089


In [103]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
                    results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [104]:
df_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [302]:
df_venues['Venue Category'] = df_venues['Venue Category'].apply(lambda x: 'Restaurant' if x.find('Restaurant') != -1 else x)
df_venues['Venue Category'] = df_venues['Venue Category'].apply(lambda x: 'Shop' if x.find('Shop') != -1 else x)
df_venues['Venue Category'] = df_venues['Venue Category'].apply(lambda x: 'Store' if x.find('Store') != -1 else x)
df_venues['Venue Category'] = df_venues['Venue Category'].apply(lambda x: 'Restaurant' if x.find('Sandwich') != -1 else x)
df_venues['Venue Category'] = df_venues['Venue Category'].apply(lambda x: 'Restaurant' if x.find('Pizza') != -1 else x)
df_venues['Venue Category'] = df_venues['Venue Category'].apply(lambda x: 'Gym' if x in ('Gym / Fitness Center', 'Athletics & Sports') else x)


'Gym / Fitness Center', 'Gym', 'Athletics & Sports'

print(df_venues['Venue Category'].value_counts())

Restaurant          3135
Shop                1407
Store                911
Deli / Bodega        271
Gym                  251
                    ... 
Airport Terminal       1
Train                  1
Event Service          1
Go Kart Track          1
Veterinarian           1
Name: Venue Category, Length: 255, dtype: int64


In [303]:
# one hot encoding
df_onehot = pd.get_dummies(df_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
df_onehot['Neighborhood'] = df_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
df_onehot = df_onehot[fixed_columns]

df_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Airport Terminal,Arcade,Art Gallery,Art Museum,Arts & Entertainment,Auditorium,Auto Garage,BBQ Joint,...,Used Bookstore,Varenyky restaurant,Veterinarian,Volleyball Court,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [304]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [305]:
df_grouped = df_onehot.groupby('Neighborhood').mean().reset_index()
df_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Airport Terminal,Arcade,Art Gallery,Art Museum,Arts & Entertainment,Auditorium,Auto Garage,...,Used Bookstore,Varenyky restaurant,Veterinarian,Volleyball Court,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wings Joint
0,Allerton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
1,Annadale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
2,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
3,Arlington,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
4,Arrochar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,Woodhaven,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
298,Woodlawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
299,Woodrow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
300,Woodside,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0


In [306]:
# set number of clusters
kclusters = 10

df_grouped_clustering = df_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 6, 0, 2, 6, 0, 6, 7, 1, 6])

In [307]:
num_top_venues = 15

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = df_grouped['Neighborhood']

for ind in np.arange(df_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,Allerton,Restaurant,Store,Shop,Supermarket,Spa,Deli / Bodega,Check Cashing Service,Bakery,Fried Chicken Joint,Breakfast Spot,Martial Arts School,Playground,Construction & Landscaping,Intersection,Gas Station
1,Annadale,Restaurant,Diner,Food,Dance Studio,Deli / Bodega,Park,Train Station,Farmers Market,Exhibit,Eye Doctor,Factory,Farm,Wings Joint,Field,Event Service
2,Arden Heights,Bus Stop,Deli / Bodega,Pharmacy,Restaurant,Shop,Fish Market,Field,Flea Market,Food,Farmers Market,Farm,Factory,Eye Doctor,Exhibit,Event Space
3,Arlington,Home Service,Shop,Store,Deli / Bodega,Bus Stop,Boat or Ferry,Wings Joint,Eye Doctor,Factory,Farm,Farmers Market,Field,Event Space,Fish Market,Flea Market
4,Arrochar,Restaurant,Bus Stop,Deli / Bodega,Shop,Food Truck,Store,Supermarket,Outdoors & Recreation,Gym,Pharmacy,Beach,Hotel,Dive Bar,Design Studio,Fish Market


In [308]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

df_merged = df

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
df_merged = df_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

df_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,Bronx,Wakefield,40.894705,-73.847201,0,Shop,Pharmacy,Restaurant,Deli / Bodega,Laundromat,Gas Station,Field,Fish Market,Flea Market,Food,Farmers Market,Farm,Factory,Eye Doctor,Exhibit
1,Bronx,Co-op City,40.874294,-73.829939,1,Restaurant,Store,Bus Station,Trail,Pharmacy,Shop,Baseball Field,Bar,Post Office,Park,Factory,Event Service,Eye Doctor,Farm,Exhibit
2,Bronx,Eastchester,40.887556,-73.827806,0,Restaurant,Shop,Bus Station,Deli / Bodega,Diner,Metro Station,Juice Bar,Store,Bus Stop,Bowling Alley,Farmers Market,Factory,Farm,Wings Joint,Field
3,Bronx,Fieldston,40.895437,-73.905643,2,River,Store,Plaza,Farmers Market,Event Space,Exhibit,Eye Doctor,Factory,Farm,Wings Joint,Entertainment Service,Field,Fish Market,Flea Market,Food
4,Bronx,Riverdale,40.890834,-73.912585,7,Park,Food Truck,Plaza,Moving Target,Bank,Home Service,Playground,Bus Station,Gym,Field,Fish Market,Exhibit,Flea Market,Farmers Market,Farm


In [309]:
# create map
map_clusters = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [310]:
df_merged.columns

Index(['Borough', 'Neighborhood', 'Latitude', 'Longitude', 'Cluster Labels',
       '1st Most Common Venue', '2nd Most Common Venue',
       '3rd Most Common Venue', '4th Most Common Venue',
       '5th Most Common Venue', '6th Most Common Venue',
       '7th Most Common Venue', '8th Most Common Venue',
       '9th Most Common Venue', '10th Most Common Venue',
       '11th Most Common Venue', '12th Most Common Venue',
       '13th Most Common Venue', '14th Most Common Venue',
       '15th Most Common Venue'],
      dtype='object')

In [311]:
df_merged['1st Most Common Venue'].value_counts()

Restaurant                    218
Shop                           28
Store                          14
Park                            7
Beach                           4
Bus Stop                        3
Deli / Bodega                   3
Dive Bar                        2
Pool                            2
Hotel                           2
Intersection                    2
Burger Joint                    1
Playground                      1
Massage Studio                  1
Pier                            1
Tennis Court                    1
River                           1
Sports Club                     1
Nightlife Spot                  1
Pharmacy                        1
Café                            1
Irish Pub                       1
Boat or Ferry                   1
Child Care Service              1
Bar                             1
Home Service                    1
Recording Studio                1
Construction & Landscaping      1
Harbor / Marina                 1
Gym           

In [312]:
#df_gym = df_merged[(df_merged['1st Most Common Venue'].isin('Gym / Fitness Center', 'Gym') | (df_merged['2nd Most Common Venue'] == 'Gym / Fitness Center') | (df_merged['3rd Most Common Venue'] == 'Gym / Fitness Center') | (df_merged['4th Most Common Venue'] == 'Gym / Fitness Center') | (df_merged['5th Most Common Venue'] == 'Gym / Fitness Center') | (df_merged['6th Most Common Venue'] == 'Gym / Fitness Center') | (df_merged['7th Most Common Venue'] == 'Gym / Fitness Center') | (df_merged['8th Most Common Venue'] == 'Gym / Fitness Center') | (df_merged['9th Most Common Venue'] == 'Gym / Fitness Center') | (df_merged['10th Most Common Venue'] == 'Gym / Fitness Center') ]
df_gym = df_merged[df_merged['1st Most Common Venue'].isin(['Gym / Fitness Center', 'Gym', 'Athletics & Sports']) | df_merged['2nd Most Common Venue'].isin(['Gym / Fitness Center', 'Gym', 'Athletics & Sports']) | df_merged['3rd Most Common Venue'].isin(['Gym / Fitness Center', 'Gym', 'Athletics & Sports']) | df_merged['4th Most Common Venue'].isin(['Gym / Fitness Center', 'Gym', 'Athletics & Sports'])]
print(f'Neighborhoods with gym/fitness center in 6 top common vanues: {len(df_gym)}')
df_gym['Cluster Labels'].value_counts()
df_gym.head()

Neighborhoods with gym/fitness center in 6 top common vanues: 36


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
6,Manhattan,Marble Hill,40.876551,-73.91066,1,Shop,Store,Restaurant,Gym,Yoga Studio,Deli / Bodega,Diner,Pharmacy,Tennis Stadium,Steakhouse,Hockey Field,Fish Market,Farmers Market,Farm,Factory
16,Bronx,Fordham,40.860997,-73.896427,1,Restaurant,Store,Shop,Gym,Bank,Pharmacy,Intersection,Deli / Bodega,Plaza,Diner,Supermarket,Bakery,Music Venue,Café,Fried Chicken Joint
21,Bronx,Mott Haven,40.806239,-73.9161,6,Restaurant,Shop,Store,Gym,Bookstore,Burger Joint,Storage Facility,Pharmacy,Bakery,Exhibit,Eye Doctor,Factory,Wings Joint,Farm,Farmers Market
29,Bronx,Country Club,40.844246,-73.824099,6,Restaurant,Flea Market,Gym,Playground,Wings Joint,Factory,Event Space,Exhibit,Eye Doctor,Farmers Market,Farm,Entertainment Service,Field,Fish Market,Food
74,Brooklyn,Canarsie,40.635564,-73.902093,6,Restaurant,Store,Food,Gym,Wings Joint,Farm,Event Space,Exhibit,Eye Doctor,Factory,Field,Farmers Market,Entertainment Service,Fish Market,Flea Market


In [313]:
df_merged[['1st Most Common Venue','2nd Most Common Venue','3rd Most Common Venue','4th Most Common Venue','5th Most Common Venue','6th Most Common Venue','7th Most Common Venue','8th Most Common Venue','9th Most Common Venue','10th Most Common Venue']].isin(['Gym / Fitness Center', 'Gym'])

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
301,False,True,False,False,False,False,False,False,False,False
302,False,False,False,False,False,False,False,False,False,True
303,False,False,False,False,False,False,False,False,False,False
304,False,False,False,False,True,False,False,False,False,False


In [314]:
df_gym_grouped = df_gym[['Cluster Labels', 'Neighborhood']].groupby('Cluster Labels', as_index = False).count()
df_merged_grouped = df_merged[['Cluster Labels', 'Neighborhood']].groupby('Cluster Labels', as_index = False).count()

In [315]:
df_gym_grouped = pd.merge(df_gym_grouped, df_merged_grouped, how = 'left', on = 'Cluster Labels')
df_gym_grouped.head()

Unnamed: 0,Cluster Labels,Neighborhood_x,Neighborhood_y
0,0,4,65
1,1,7,70
2,3,1,4
3,6,15,96
4,7,9,49


In [316]:
df_gym_grouped['gym proportion nb'] = df_gym_grouped['Neighborhood_x']/df_gym_grouped['Neighborhood_y']
df_gym_grouped

Unnamed: 0,Cluster Labels,Neighborhood_x,Neighborhood_y,gym proportion nb
0,0,4,65,0.061538
1,1,7,70,0.1
2,3,1,4,0.25
3,6,15,96,0.15625
4,7,9,49,0.183673


In [318]:
df_merged_cluster1 = df_merged[df_merged['Cluster Labels'] == 3]
df_merged_cluster1

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
172,Queens,Breezy Point,40.557401,-73.925512,3,Beach,Monument / Landmark,Trail,Bus Stop,Field,Eye Doctor,Factory,Farm,Farmers Market,Wings Joint,Exhibit,Flea Market,Food,Food Court,Food Stand
179,Queens,Neponsit,40.572037,-73.857547,3,Beach,Wings Joint,Fried Chicken Joint,Food Truck,Food Stand,Food Court,Food,Flea Market,Fish Market,Field,Farmers Market,Farm,Factory,Eye Doctor,Exhibit
204,Staten Island,South Beach,40.580247,-74.079553,3,Pier,Deli / Bodega,Gym,Beach,Wings Joint,Farm,Exhibit,Eye Doctor,Factory,Field,Farmers Market,Fish Market,Flea Market,Food,Food Court
302,Queens,Hammels,40.587338,-73.80553,3,Beach,Fried Chicken Joint,Store,Deli / Bodega,Food Truck,Diner,Bus Station,Restaurant,Bus Stop,Gym,Dog Run,Farmers Market,Farm,Flea Market,Field


In [328]:
neighborhoods_list = []
print(len(df_merged_cluster1.values))
for i in list(df_merged_cluster1.values):
    if 'Gym' not in i:
        neighborhoods_list.append([i][0][0:2])
len(neighborhoods_list)    

4


2

In [329]:
print(neighborhoods_list)

[array(['Queens', 'Breezy Point'], dtype=object), array(['Queens', 'Neponsit'], dtype=object)]
