# Capstone Project: The final challenge

In [1]:
from bs4 import BeautifulSoup
import urllib.request as rq
import pandas as pd
import numpy as np
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import json
from geopy.geocoders import Nominatim
import folium

## New York DataSet preparation

Open New York file with data and convert it to a data frame

In [2]:
with open('newyork.json') as json_data:
    newyork_data = json.load(json_data)

In [3]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [4]:
neighborhoods_data = newyork_data['features']

In [5]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [6]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


Visualize data

In [7]:
address = 'New York City, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of New York City are 40.7308619, -73.9871558.


In [8]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

Let's limit our research to Manhattan to make sure that we fit Foursquare api limits

In [9]:
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


Visualize data around Manhattan

In [10]:
address = 'Manhattan, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Manhattan are 40.7900869, -73.9598295.


In [11]:
# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

## Toronto DataSet preparation

This is a variable that contains a link to the page

In [12]:
wiki_url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

All needed packages for the project.
- BeautifulSoup should be installed to the environment (ex.: pip install beautifulsoup4)

Open url and read body with html

In [13]:
res=rq.urlopen(wiki_url)
body=res.read()

In [14]:
soup = BeautifulSoup(body, 'html.parser')

Our data in the first table. The table contains tr tags for rows, th/td for text. We just need to gran all text from there and convert to a data frame 

In [15]:
data_list=[]
table=soup.find_all('table')[0]
for tr in table.find_all('tr'):
    td=tr.find_all('td')
    if len(td)==3:
        data_list.append([td[0].get_text(),td[1].get_text(),td[2].get_text().replace("\n","")])

In [16]:
th=table.find_all('th')
frame=pd.DataFrame(data=data_list,columns=[th[0].get_text(),th[1].get_text(),th[2].get_text().replace("\n","")])

We remove all Not assigned columns from our dataframe and reset index to make sure that we don't have gaps to complete the next task

In [17]:
frame=frame[frame['Borough']!='Not assigned']
frame=frame.reset_index()[['Postcode','Borough','Neighbourhood']]

We are applying transformation to the second column to make sure that we don't have "Not assigned" values there

In [18]:
index=(frame['Neighbourhood']=='Not assigned')
idxs=index[index==True].index
frame.iloc[np.array(idxs),2]=frame.iloc[np.array(idxs),1]

The following code create groups and apply a function that concatenate third column

In [19]:
def f(x):
     return pd.Series(dict(Postcode = x['Postcode'].max(), 
                        Borough = x['Borough'].max(), 
                        Neighbourhood = "%s" % ', '.join(x['Neighbourhood'])))

In [20]:
grouping=frame.groupby(by="Postcode",as_index=False)

In [21]:
final_frame=grouping.apply(f)

In [22]:
final_frame

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


printing shape

In [23]:
final_frame.shape

(103, 3)

Read geocodes from the file and join them with existing pandas

In [24]:
geo_coord=pd.read_csv("Geospatial_Coordinates.csv")

In [25]:
geo_coord.columns=['Postcode','Latitude','Longitude']

In [26]:
final_frame=final_frame.join(geo_coord.set_index('Postcode'),on="Postcode")

In [27]:
final_frame

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


Show all borough in Toronto

In [28]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [29]:
address = 'Toronto, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [30]:

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(final_frame['Latitude'], final_frame['Longitude'], final_frame['Borough'], final_frame['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Limit our research to Etobicoke

In [31]:
eto_data = final_frame[final_frame['Borough'] == 'Etobicoke'].reset_index(drop=True)
eto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto",43.605647,-79.501321
1,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
2,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
3,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509
4,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999


In [32]:
address = 'Etobicoke, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Etobicoke are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Etobicoke are 43.6435559, -79.5656326.


In [33]:
# create map of Etobicoke using latitude and longitude values
map_etobicoke = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(eto_data['Latitude'], eto_data['Longitude'], eto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_etobicoke)  
    
map_etobicoke

## Applying Foursquare API

In [34]:
del eto_data['Postcode']

In [35]:
eto_data.columns=['Borough','Neighborhood','Latitude','Longitude']
eto_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto",43.605647,-79.501321
1,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
2,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
3,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509
4,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999


In [36]:
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [37]:
mix_frame=pd.concat([eto_data,manhattan_data],ignore_index=True)
print(eto_data.shape)
print(manhattan_data.shape)
print(mix_frame.shape)

(12, 4)
(40, 4)
(52, 4)


In [38]:
CLIENT_ID = '2XXTEC1HOWXXTQXWIXTCOUVU53RTTHMAEK1JKTYJJUMKTE14' # your Foursquare ID
CLIENT_SECRET = 'NFUTKGJREO42B1HTCH25HW402U1F03J2PFMAK3TZKH11PNJK' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 2XXTEC1HOWXXTQXWIXTCOUVU53RTTHMAEK1JKTYJJUMKTE14
CLIENT_SECRET:NFUTKGJREO42B1HTCH25HW402U1F03J2PFMAK3TZKH11PNJK


In [39]:
mix_frame.loc[0, 'Neighborhood']

'Humber Bay Shores, Mimico South, New Toronto'

In [40]:
neighborhood_latitude = mix_frame.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = mix_frame.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = eto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Humber Bay Shores, Mimico South, New Toronto are 43.6056466, -79.50132070000001.


Try to get foursquare data fro one record only to check if it works at all

In [41]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            neighborhood_latitude, 
            neighborhood_longitude, 
            500, 
            50)

In [42]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5bc516284c1f67197dd5826d'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 15,
  'suggestedBounds': {'ne': {'lat': 43.6101466045, 'lng': -79.49511771930959},
   'sw': {'lat': 43.6011465955, 'lng': -79.50752368069043}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b119977f964a520488023e3',
       'name': 'LCBO',
       'location': {'address': '2762 Lake Shore Blvd W',
        'crossStreet': 'btwn 1st & 2nd St',
        'lat': 43.60228082768786,
        'lng': -79.4993016827402,
        'labeledLatLngs': [{'label': 'display',
          'lat':

In [43]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [44]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,LCBO,Liquor Store,43.602281,-79.499302
1,New Toronto Fish & Chips,Restaurant,43.601849,-79.503281
2,Lucky Dice Restaurant,Café,43.601392,-79.503056
3,Delicia Bakery & Pastry,Bakery,43.601403,-79.503012
4,McDonald's,Fast Food Restaurant,43.60247,-79.498963


In [45]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

15 venues were returned by Foursquare.


Work with Foursquare API and parse json

In [46]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            500)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Let's check the resulting dataframe

In [47]:
eto_venues=getNearbyVenues(mix_frame['Neighborhood'],mix_frame['Latitude'],mix_frame['Longitude'])
eto_venues.head()

Humber Bay Shores, Mimico South, New Toronto
Alderwood, Long Branch
The Kingsway, Montgomery Road, Old Mill North
Humber Bay, King's Mill Park, Kingsway Park South East, Mimico NE, Old Mill South, The Queensway East, Royal York South East, Sunnylea
Kingsway Park South West, Mimico NW, The Queensway West, Royal York South West, South of Bloor
Islington Avenue
Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe
Westmount
Kingsview Village, Martin Grove Gardens, Richview Gardens, St. Phillips
Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown
Northwest
Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
Wes

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Humber Bay Shores, Mimico South, New Toronto",43.605647,-79.501321,LCBO,43.602281,-79.499302,Liquor Store
1,"Humber Bay Shores, Mimico South, New Toronto",43.605647,-79.501321,Huevos Gourmet,43.601188,-79.503717,Mexican Restaurant
2,"Humber Bay Shores, Mimico South, New Toronto",43.605647,-79.501321,Sweet Olenka's,43.601099,-79.500325,Dessert Shop
3,"Humber Bay Shores, Mimico South, New Toronto",43.605647,-79.501321,Cellar Door Restaurant,43.600221,-79.507638,Italian Restaurant
4,"Humber Bay Shores, Mimico South, New Toronto",43.605647,-79.501321,Prince Of Wales Park,43.598797,-79.499001,Park


Let's check the size of the resulting dataframe

In [48]:
print(eto_venues.shape)

(4229, 7)


In [49]:
eto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",17,17,17,17,17,17
"Alderwood, Long Branch",26,26,26,26,26,26
Battery Park City,100,100,100,100,100,100
"Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe",15,15,15,15,15,15
Carnegie Hill,100,100,100,100,100,100
Central Harlem,100,100,100,100,100,100
Chelsea,100,100,100,100,100,100
Chinatown,100,100,100,100,100,100
Civic Center,100,100,100,100,100,100
Clinton,100,100,100,100,100,100


In [50]:
print('There are {} uniques categories.'.format(len(eto_venues['Venue Category'].unique())))

There are 314 uniques categories.


Apply one-hot encoding to the category field

In [51]:
# one hot encoding
eto_onehot = pd.get_dummies(eto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
eto_onehot['Neighbourhood'] = eto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [eto_onehot.columns[-1]] + list(eto_onehot.columns[:-1])
eto_onehot = eto_onehot[fixed_columns]

eto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Arcade,Arepa Restaurant,Argentinian Restaurant,...,Video Store,Vietnamese Restaurant,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Humber Bay Shores, Mimico South, New Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Humber Bay Shores, Mimico South, New Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Humber Bay Shores, Mimico South, New Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Humber Bay Shores, Mimico South, New Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Humber Bay Shores, Mimico South, New Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
eto_onehot.shape

(4229, 315)

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [53]:
eto_grouped = eto_onehot.groupby('Neighbourhood').mean().reset_index()
eto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Arcade,Arepa Restaurant,Argentinian Restaurant,...,Video Store,Vietnamese Restaurant,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Battery Park City,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.01,0.0
3,"Bloordale Gardens, Eringate, Markland Wood, Ol...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Carnegie Hill,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,...,0.01,0.01,0.0,0.0,0.0,0.01,0.03,0.0,0.01,0.04
5,Central Harlem,0.0,0.0,0.04,0.03,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02
6,Chelsea,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
7,Chinatown,0.0,0.0,0.0,0.03,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.01
8,Civic Center,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.02,0.01,0.0,0.0,0.02
9,Clinton,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.03,0.0,0.0,0.01


In [54]:
eto_grouped.shape

(52, 315)

Let's print each neighborhood along with the top 5 most common venues

In [55]:
num_top_venues = 5

for hood in eto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = eto_grouped[eto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                 venue  freq
0          Pizza Place  0.18
1        Grocery Store  0.18
2  Fried Chicken Joint  0.06
3       Discount Store  0.06
4          Coffee Shop  0.06


----Alderwood, Long Branch----
               venue  freq
0     Discount Store  0.12
1        Pizza Place  0.08
2      Grocery Store  0.08
3           Pharmacy  0.08
4  Convenience Store  0.04


----Battery Park City----
                 venue  freq
0                 Park  0.08
1          Coffee Shop  0.08
2            Wine Shop  0.04
3  American Restaurant  0.03
4                  Gym  0.03


----Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe----
                venue  freq
0      Farmers Market  0.07
1      Shopping Plaza  0.07
2  College Rec Center  0.07
3          Beer Store  0.07
4         Pizza Place  0.07


----Carnegie Hill----
                venue  freq
0         Pizza Pla

          venue  freq
0  Cocktail Bar  0.07
1           Bar  0.06
2   Pizza Place  0.06
3          Park  0.04
4   Coffee Shop  0.04


----Sutton Place----
                  venue  freq
0    Italian Restaurant  0.05
1   American Restaurant  0.05
2     Indian Restaurant  0.04
3           Coffee Shop  0.04
4  Gym / Fitness Center  0.04


----The Kingsway, Montgomery Road, Old Mill North----
                venue  freq
0         Coffee Shop  0.09
1                Park  0.07
2  Italian Restaurant  0.04
3                 Pub  0.04
4        Dessert Shop  0.04


----Tribeca----
                 venue  freq
0          Coffee Shop  0.05
1                Hotel  0.05
2  American Restaurant  0.05
3                 Park  0.05
4   Italian Restaurant  0.04


----Tudor City----
                 venue  freq
0          Coffee Shop  0.06
1        Grocery Store  0.04
2      Thai Restaurant  0.03
3  Japanese Restaurant  0.03
4   Seafood Restaurant  0.03


----Turtle Bay----
                 venue  freq
0   

Let's put that into a pandas dataframe

First, let's write a function to sort the venues in descending order.

In [56]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [57]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = eto_grouped['Neighbourhood']

for ind in np.arange(eto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(eto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Japanese Restaurant,Sandwich Place,Fast Food Restaurant,Park,Coffee Shop,Pharmacy,Beer Store,Hardware Store
1,"Alderwood, Long Branch",Discount Store,Pharmacy,Grocery Store,Pizza Place,Intersection,Moroccan Restaurant,Garden Center,Gas Station,Park,Gym
2,Battery Park City,Park,Coffee Shop,Wine Shop,Plaza,Hotel,American Restaurant,Gym,Fountain,Bookstore,Burger Joint
3,"Bloordale Gardens, Eringate, Markland Wood, Ol...",Bank,Pizza Place,Coffee Shop,Shopping Plaza,College Rec Center,Beer Store,Pharmacy,Grocery Store,Fish & Chips Shop,Liquor Store
4,Carnegie Hill,Pizza Place,Italian Restaurant,Coffee Shop,Yoga Studio,Gym,Cocktail Bar,Wine Shop,Gym / Fitness Center,Bookstore,Art Museum
5,Central Harlem,Southern / Soul Food Restaurant,Café,African Restaurant,French Restaurant,Sushi Restaurant,American Restaurant,Mexican Restaurant,Seafood Restaurant,Theater,Yoga Studio
6,Chelsea,Art Gallery,Coffee Shop,American Restaurant,Seafood Restaurant,Italian Restaurant,Hotel,Nightclub,Gym / Fitness Center,Asian Restaurant,Café
7,Chinatown,Chinese Restaurant,Cocktail Bar,Ice Cream Shop,Café,Wine Bar,Sandwich Place,American Restaurant,Shoe Store,Optical Shop,Thai Restaurant
8,Civic Center,Coffee Shop,French Restaurant,Bakery,Hotel,Cocktail Bar,Chinese Restaurant,Men's Store,Café,Spa,American Restaurant
9,Clinton,Theater,Italian Restaurant,American Restaurant,Hotel,Burger Joint,Coffee Shop,Bakery,Wine Shop,Indie Theater,Gym / Fitness Center


Run k-means to cluster the neighborhood into 5 clusters.

In [58]:
# set number of clusters
kclusters = 10

eto_grouped_clustering = eto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(eto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([7, 7, 3, 7, 0, 5, 0, 0, 3, 3])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [60]:
eto_merged = mix_frame
eto_merged.columns=['Borough','Neighbourhood','Latitude','Longitude']

# add clustering labels
eto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
eto_merged = eto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

eto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto",43.605647,-79.501321,7,Park,Restaurant,Bakery,Indian Restaurant,Liquor Store,Grocery Store,Mexican Restaurant,Fried Chicken Joint,Fast Food Restaurant,Falafel Restaurant
1,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484,7,Discount Store,Pharmacy,Grocery Store,Pizza Place,Intersection,Moroccan Restaurant,Garden Center,Gas Station,Park,Gym
2,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,3,Coffee Shop,Park,Burger Joint,Pub,French Restaurant,Breakfast Spot,Sushi Restaurant,Italian Restaurant,Dessert Shop,Seafood Restaurant
3,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509,7,Italian Restaurant,Park,Gym / Fitness Center,Shopping Mall,Ice Cream Shop,Eastern European Restaurant,Yoga Studio,Flower Shop,Filipino Restaurant,Fish & Chips Shop
4,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999,0,Burrito Place,Gym / Fitness Center,Coffee Shop,Italian Restaurant,Burger Joint,BBQ Joint,Bakery,Sandwich Place,Yoga Studio,Sushi Restaurant


Finally, let's visualize the resulting clusters

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster.

In [61]:
eto_merged.loc[eto_merged['Cluster Labels'] == 0, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",Burrito Place,Gym / Fitness Center,Coffee Shop,Italian Restaurant,Burger Joint,BBQ Joint,Bakery,Sandwich Place,Yoga Studio,Sushi Restaurant
6,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",Park,Bank,Pizza Place,Hotel,American Restaurant,Burrito Place,Fish & Chips Shop,Pharmacy,Gym,Mobile Phone Shop
7,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol...",Bank,Pizza Place,Coffee Shop,Shopping Plaza,College Rec Center,Beer Store,Pharmacy,Grocery Store,Fish & Chips Shop,Liquor Store
12,Manhattan,Marble Hill,Park,Pizza Place,Spanish Restaurant,Supermarket,Mexican Restaurant,Donut Shop,Café,Sandwich Place,Athletics & Sports,Bar
16,Manhattan,Hamilton Heights,Coffee Shop,Mexican Restaurant,Bar,Café,Sushi Restaurant,Caribbean Restaurant,Scenic Lookout,Chinese Restaurant,Yoga Studio,American Restaurant
25,Manhattan,Lincoln Square,Italian Restaurant,Gym / Fitness Center,French Restaurant,Jazz Club,Bakery,Gym,Yoga Studio,Concert Hall,Indie Movie Theater,Theater
27,Manhattan,Midtown,Theater,Coffee Shop,Sandwich Place,Hotel,Gym,Plaza,Cuban Restaurant,Sporting Goods Shop,Women's Store,Bakery
28,Manhattan,Murray Hill,Japanese Restaurant,Korean Restaurant,Gym / Fitness Center,Gym,Chinese Restaurant,Coffee Shop,Italian Restaurant,Sandwich Place,Gourmet Shop,Pizza Place
36,Manhattan,West Village,Italian Restaurant,Wine Bar,Bakery,American Restaurant,Ice Cream Shop,Jazz Club,Coffee Shop,New American Restaurant,Park,Gastropub
39,Manhattan,Gramercy,American Restaurant,New American Restaurant,Indian Restaurant,Restaurant,Juice Bar,Wine Shop,Gym,Cheese Shop,Mediterranean Restaurant,Cosmetics Shop


In [62]:
eto_merged.loc[eto_merged['Cluster Labels'] == 1, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Manhattan,Lenox Hill,Italian Restaurant,Gym / Fitness Center,French Restaurant,Gym,Women's Store,Bakery,Sporting Goods Shop,Dessert Shop,Sushi Restaurant,Grocery Store


In [63]:
eto_merged.loc[eto_merged['Cluster Labels'] == 2, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,Manhattan,Manhattan Valley,Coffee Shop,Park,Pizza Place,Grocery Store,Ice Cream Shop,Indian Restaurant,Playground,Mexican Restaurant,Bar,Yoga Studio


In [64]:
eto_merged.loc[eto_merged['Cluster Labels'] == 3, eto_merged.columns[[0] + [1]+list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",Coffee Shop,Park,Burger Joint,Pub,French Restaurant,Breakfast Spot,Sushi Restaurant,Italian Restaurant,Dessert Shop,Seafood Restaurant
8,Etobicoke,Westmount,Pizza Place,Sandwich Place,Supermarket,Chinese Restaurant,Discount Store,Intersection,Middle Eastern Restaurant,Coffee Shop,Breakfast Spot,Ice Cream Shop
9,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",Supermarket,Pizza Place,Supplement Shop,Mobile Phone Shop,Chinese Restaurant,Beer Store,Coffee Shop,Pharmacy,Sandwich Place,Bank
13,Manhattan,Chinatown,Chinese Restaurant,Cocktail Bar,Ice Cream Shop,Café,Wine Bar,Sandwich Place,American Restaurant,Shoe Store,Optical Shop,Thai Restaurant
14,Manhattan,Washington Heights,Pizza Place,Latin American Restaurant,Café,Bakery,Grocery Store,Mexican Restaurant,Deli / Bodega,Tapas Restaurant,Bar,Park
15,Manhattan,Inwood,Latin American Restaurant,Mexican Restaurant,Deli / Bodega,Pizza Place,Café,Wine Bar,Lounge,Spanish Restaurant,Chinese Restaurant,Bakery
18,Manhattan,Central Harlem,Southern / Soul Food Restaurant,Café,African Restaurant,French Restaurant,Sushi Restaurant,American Restaurant,Mexican Restaurant,Seafood Restaurant,Theater,Yoga Studio
24,Manhattan,Upper West Side,Italian Restaurant,Park,American Restaurant,Indian Restaurant,Wine Bar,Ice Cream Shop,Bakery,Bar,Coffee Shop,Sushi Restaurant
26,Manhattan,Clinton,Theater,Italian Restaurant,American Restaurant,Hotel,Burger Joint,Coffee Shop,Bakery,Wine Shop,Indie Theater,Gym / Fitness Center
32,Manhattan,Lower East Side,Italian Restaurant,Mexican Restaurant,Boutique,Coffee Shop,Wine Bar,Japanese Restaurant,Ice Cream Shop,Deli / Bodega,Garden,Cocktail Bar


In [65]:
eto_merged.loc[eto_merged['Cluster Labels'] == 4, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,Manhattan,Upper East Side,Exhibit,Italian Restaurant,Coffee Shop,Bakery,Yoga Studio,Hotel,Playground,Cocktail Bar,Seafood Restaurant,Mexican Restaurant


In [66]:
eto_merged.loc[eto_merged['Cluster Labels'] == 5, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Etobicoke,Islington Avenue,Pharmacy,Bakery,Convenience Store,Bank,Skating Rink,Park,Shopping Mall,Grocery Store,Playground,Golf Course
11,Etobicoke,Northwest,Coffee Shop,Lounge,Dog Run,Yoga Studio,Food & Drink Shop,Filipino Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop
17,Manhattan,Manhattanville,Mexican Restaurant,Park,American Restaurant,Italian Restaurant,Tennis Court,Pizza Place,Café,Coffee Shop,Seafood Restaurant,Deli / Bodega
19,Manhattan,East Harlem,Mexican Restaurant,Bakery,Pizza Place,Deli / Bodega,Café,Latin American Restaurant,Plaza,Thai Restaurant,Gym,Cocktail Bar
21,Manhattan,Yorkville,Italian Restaurant,Coffee Shop,Gym,Bar,Ice Cream Shop,Pizza Place,Sushi Restaurant,Wine Shop,Thai Restaurant,Bagel Shop
29,Manhattan,Chelsea,Art Gallery,Coffee Shop,American Restaurant,Seafood Restaurant,Italian Restaurant,Hotel,Nightclub,Gym / Fitness Center,Asian Restaurant,Café
30,Manhattan,Greenwich Village,Italian Restaurant,Coffee Shop,Pizza Place,American Restaurant,Seafood Restaurant,Spa,Indie Movie Theater,Café,Clothing Store,French Restaurant
31,Manhattan,East Village,Cocktail Bar,Ice Cream Shop,Coffee Shop,Wine Bar,Speakeasy,Vegetarian / Vegan Restaurant,Bar,Bagel Shop,Pizza Place,Mexican Restaurant
34,Manhattan,Little Italy,Café,Men's Store,Italian Restaurant,Clothing Store,Chinese Restaurant,Sandwich Place,Coffee Shop,Cocktail Bar,Shoe Store,Yoga Studio
38,Manhattan,Morningside Heights,Coffee Shop,American Restaurant,Italian Restaurant,Park,Bookstore,Bakery,Restaurant,Seafood Restaurant,Mexican Restaurant,Café


In [67]:
eto_merged.loc[eto_merged['Cluster Labels'] == 6, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Manhattan,Roosevelt Island,Sushi Restaurant,Park,Pizza Place,Coffee Shop,Deli / Bodega,Greek Restaurant,Cocktail Bar,Tennis Court,Café,Playground


In [68]:
eto_merged.loc[eto_merged['Cluster Labels'] == 7, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto",Park,Restaurant,Bakery,Indian Restaurant,Liquor Store,Grocery Store,Mexican Restaurant,Fried Chicken Joint,Fast Food Restaurant,Falafel Restaurant
1,Etobicoke,"Alderwood, Long Branch",Discount Store,Pharmacy,Grocery Store,Pizza Place,Intersection,Moroccan Restaurant,Garden Center,Gas Station,Park,Gym
3,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",Italian Restaurant,Park,Gym / Fitness Center,Shopping Mall,Ice Cream Shop,Eastern European Restaurant,Yoga Studio,Flower Shop,Filipino Restaurant,Fish & Chips Shop


In [69]:
eto_merged.loc[eto_merged['Cluster Labels'] == 8, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Japanese Restaurant,Sandwich Place,Fast Food Restaurant,Park,Coffee Shop,Pharmacy,Beer Store,Hardware Store


In [70]:
eto_merged.loc[eto_merged['Cluster Labels'] == 9, eto_merged.columns[[0] +[1]+ list(range(5, eto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
50,Manhattan,Flatiron,Gym,New American Restaurant,Gym / Fitness Center,Cycle Studio,Italian Restaurant,American Restaurant,Bookstore,Park,Vegetarian / Vegan Restaurant,Bakery
