# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

## 1. Scraping Wikipedia

In [2]:
from urllib import request
from bs4 import BeautifulSoup

In [3]:
#import the webpage into soup
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = request.urlopen(url)
soup = BeautifulSoup(page, "lxml")

#Extract the table from the webpage
table=soup.find('table', class_='wikitable sortable')

#split table on rows
table_rows = table.findAll('tr')

#Extract headers from table
headers = []
for header in table_rows[0].findAll('th'):
    headers.append(header.find(text=True)[:-1])

In [4]:
#Create empty numpy arrays that will contain table data per column
info_rows = len(table_rows[1:])
info_cols = len(headers)
data = np.empty([info_rows, info_cols], dtype=object)

#Iterate every row and add the information to the right numpy array
for i, row in enumerate(table_rows[1:]):
    #print(row)
    cells = row.findAll('td')
    for p in range(info_cols):
        data[i, p] = cells[p].find(text=True)[:-1]

#Create a dataframe using previously gathered data
df = pd.DataFrame(data, columns=headers)

#Remove all entries where borough was not assigned
df = df[df['Borough'] != 'Not assigned']

#Clean neighborhood column to replace '/' with a ',' also remove double spaces
df['Neighborhood'] = df['Neighborhood'].str.replace('  ', ' ')
df['Neighborhood'] = df['Neighborhood'].str.replace(' /', ', ')

#If an entry exists where the neighborhood is not assigned while burough is, remove these from the dataframe
df = df[df['Neighborhood'] != '']

In [5]:
#Print shape of dataframe, to display amount of rows still in the dataframe
df.shape

(103, 3)

## 2. Adding latitude and longitude

In [6]:
lat_lng_df = pd.read_csv('Geospatial_Coordinates.csv')
df = df.merge(lat_lng_df, left_on='Postalcode', right_on='Postal Code').drop(columns=['Postal Code'])

In [7]:
df

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, ...",43.636258,-79.498509


## 3. Exploring the dataset

In [8]:
!pip install geopy
from geopy.geocoders import Nominatim
!pip install folium
import folium # map rendering library



In [9]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [10]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [11]:
CLIENT_ID = '2OS4YFSMYGEMPT2KFDEI0OEB5HUCP5R40MITFVN0APGD0GNE'
CLIENT_SECRET = 'VWWZJTT2FA2B00Z3FCZLCEQIG5FCXQBPYNHU4YVNEN0TZZ5K'
VERSION = '20190725' # Foursquare API version

print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

CLIENT_ID: 2OS4YFSMYGEMPT2KFDEI0OEB5HUCP5R40MITFVN0APGD0GNE
CLIENT_SECRET:VWWZJTT2FA2B00Z3FCZLCEQIG5FCXQBPYNHU4YVNEN0TZZ5K


In [12]:
df['Neighborhood'][0]

'Parkwoods'

In [13]:
neigh_latitude = df.loc[0, 'Latitude']
neigh_longitude = df.loc[0, 'Longitude']
neigh_name = df.loc[0, 'Neighborhood']

In [14]:
# set the radius to be 500 m and the maximum number of retrieved items to be 15
radius = 500
LIMIT = 15

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}\
&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, 
                                         neigh_latitude, neigh_longitude, 
                                         radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=2OS4YFSMYGEMPT2KFDEI0OEB5HUCP5R40MITFVN0APGD0GNE&client_secret=VWWZJTT2FA2B00Z3FCZLCEQIG5FCXQBPYNHU4YVNEN0TZZ5K&v=20190725&ll=43.7532586,-79.3296565&radius=500&limit=15'

In [15]:
import requests

search_result = requests.get(url).json()
search_result

{'meta': {'code': 200, 'requestId': '5eb30fc8df2774001b4421d1'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

In [16]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [17]:
import json
venues = search_result['response']['groups'][0]['items']
    
nearby_venues = pd.io.json.json_normalize(venues) 

# craete filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114


In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        search_results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue, venues_list will be a list of lists
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in search_results])
    # using list coprehension to add individual items from venues_list to nearby_venues dataframe
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names = df['Neighborhood'], latitudes = df['Latitude'], longitudes = df['Longitude'])

Parkwoods
Victoria Village
Regent Park,  Harbourfront
Lawrence Manor,  Lawrence Heights
Queen's Park,  Ontario Provincial Government
Islington Avenue
Malvern,  Rouge
Don Mills
Parkview Hill,  Woodbine Gardens
Garden District,  Ryerson
Glencairn
West Deane Park,  Princess Gardens,  Martin Grove,  Islington,  Cloverdale
Rouge Hill,  Port Union,  Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate,  Bloordale Gardens,  Old Burnhamthorpe,  Markland Wood
Guildwood,  Morningside,  West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor,  Wilson Heights,  Downsview North
Thorncliffe Park
Richmond,  Adelaide,  King
Dufferin,  Dovercourt Village
Scarborough Village
Fairview,  Henry Farm,  Oriole
Northwood Park,  York University
East Toronto
Harbourfront East,  Union Station,  Toronto Islands
Little Portugal,  Trinity
Kennedy Park,  Ionview,  East Birchmount Park
Bayview Village
D

In [19]:
print('The Toronto Venue dataframe contains {} rows and {} columns.'.format(toronto_venues.shape[0], toronto_venues.shape[1]))
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

The Toronto Venue dataframe contains 913 rows and 7 columns.
There are 205 uniques categories.


In [20]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 
# switch position of column names
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
toronto_onehot.shape

(913, 206)

In [22]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,Willowdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
89,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
90,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0
91,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
num_top_venues = 5
# loop through all neighbourhoods in toronto_grouped dataframe
for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0                     Lounge  0.25
1             Clothing Store  0.25
2  Latin American Restaurant  0.25
3             Breakfast Spot  0.25
4          Accessories Store  0.00


----Alderwood,  Long Branch----
            venue  freq
0     Pizza Place   0.2
1             Gym   0.1
2    Skating Rink   0.1
3  Sandwich Place   0.1
4             Pub   0.1


----Bathurst Manor,  Wilson Heights,  Downsview North----
              venue  freq
0              Bank  0.13
1       Coffee Shop  0.13
2       Pizza Place  0.07
3  Sushi Restaurant  0.07
4       Bridal Shop  0.07


----Bayview Village----
                 venue  freq
0                 Café  0.25
1   Chinese Restaurant  0.25
2                 Bank  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park,  Lawrence Manor East----
                     venue  freq
0              Coffee Shop  0.13
1  Comfort Food Restaurant  0.07
2        Indian Restaurant  0.07
3

4        Medical Center   0.0


----Milliken,  Agincourt North,  Steeles East,  L'Amoreaux East----
               venue  freq
0               Park   0.5
1         Playground   0.5
2  Accessories Store   0.0
3      Movie Theater   0.0
4     Medical Center   0.0


----Mimico NW,  The Queensway West,  South of Bloor,  Kingsway Park South West,  Royal York South West----
            venue  freq
0             Gym  0.07
1          Bakery  0.07
2  Hardware Store  0.07
3  Discount Store  0.07
4  Sandwich Place  0.07


----Moore Park,  Summerhill East----
          venue  freq
0           Gym  0.25
1    Playground  0.25
2         Trail  0.25
3  Tennis Court  0.25
4         Motel  0.00


----New Toronto,  Mimico South,  Humber Bay Shores----
         venue  freq
0          Gym  0.08
1       Bakery  0.08
2     Pharmacy  0.08
3  Pizza Place  0.08
4   Restaurant  0.08


----North Park,  Maple Leaf Park,  Upwood Park----
                        venue  freq
0  Construction & Landscaping  0.33
1     

Create a funtion to sort the venue categories in descending order and create a dataframe to display the top 10 venue categories for each Neighbourhood

In [24]:
# first create the function
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# then apply the function to find the top 10 venue categories
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Breakfast Spot,Lounge,Clothing Store,Dim Sum Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run
1,"Alderwood, Long Branch",Pizza Place,Gym,Skating Rink,Sandwich Place,Pub,Athletics & Sports,Pool,Coffee Shop,Pharmacy,Construction & Landscaping
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Ice Cream Shop,Deli / Bodega,Bridal Shop,Sushi Restaurant,Restaurant,Fried Chicken Joint,Diner,Pizza Place
3,Bayview Village,Café,Chinese Restaurant,Japanese Restaurant,Bank,Dessert Shop,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Café,Pub,Sushi Restaurant,Indian Restaurant,Thai Restaurant,Italian Restaurant,Restaurant,Butcher,American Restaurant
5,Berczy Park,Seafood Restaurant,Cocktail Bar,Museum,Breakfast Spot,Bistro,Farmers Market,Restaurant,Fountain,French Restaurant,Liquor Store
6,"Birch Cliff, Cliffside West",College Stadium,Skating Rink,General Entertainment,Café,Dessert Shop,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run
7,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Café,Breakfast Spot,Gym,Bakery,Furniture / Home Store,Italian Restaurant,Pet Store,Climbing Gym,Restaurant
8,Business reply mail Processing Centre,Park,Auto Workshop,Skate Park,Light Rail Station,Smoke Shop,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Restaurant
9,"CN Tower, King and Spadina, Railway Lands, ...",Airport Lounge,Airport Service,Rental Car Location,Airport,Airport Food Court,Airport Gate,Airport Terminal,Boat or Ferry,Plane,Harbor / Marina


In [25]:
# define the total number of clusters
kclusters = 5
# drop the 'Neighbourhood' column 
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init = 12).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4])

In [26]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_final = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_final = toronto_final.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighborhood')

toronto_final.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Food & Drink Shop,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,4.0,French Restaurant,Coffee Shop,Portuguese Restaurant,Hockey Arena,Intersection,Department Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4.0,Coffee Shop,Breakfast Spot,Historic Site,Chocolate Shop,Bakery,Pub,Distribution Center,Farmers Market,Restaurant,Spa
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4.0,Clothing Store,Furniture / Home Store,Women's Store,Accessories Store,Boutique,Miscellaneous Shop,Event Space,Gift Shop,Vietnamese Restaurant,Coffee Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4.0,Coffee Shop,Yoga Studio,Hobby Shop,Park,Sushi Restaurant,Italian Restaurant,Burrito Place,Creperie,Mexican Restaurant,Beer Bar


In [27]:
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_final['Latitude'], toronto_final['Longitude'], toronto_final['Neighborhood'], toronto_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        #color=rainbow[cluster-1,0],
        fill=True,
        #fill_color=rainbow[cluster-1,0],
        fill_opacity=0.7).add_to(map_clusters)

### Prt 3. Visualze and examine the clusters

In [28]:
map_clusters

Cluster 1

In [29]:
toronto_final.loc[toronto_final['Cluster Labels'] == 0, toronto_final.columns[[1] + list(range(5, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Scarborough,0.0,Bar,Moving Target,Yoga Studio,Dessert Shop,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center


Cluster 2

In [30]:
toronto_final.loc[toronto_final['Cluster Labels'] == 1, toronto_final.columns[[1] + list(range(5, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,1.0,Park,Food & Drink Shop,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
10,North York,1.0,Park,Pizza Place,Japanese Restaurant,Pub,Deli / Bodega,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
21,York,1.0,Park,Women's Store,Pool,Department Store,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
32,Scarborough,1.0,Playground,Convenience Store,Yoga Studio,Department Store,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
35,East York,1.0,Park,Convenience Store,Department Store,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
49,North York,1.0,Park,Construction & Landscaping,Bakery,Dessert Shop,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
61,Central Toronto,1.0,Park,Swim School,Bus Line,Dessert Shop,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
62,Central Toronto,1.0,Garden,Yoga Studio,Department Store,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
66,North York,1.0,Park,Convenience Store,Bank,Dessert Shop,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
68,Central Toronto,1.0,Park,Jewelry Store,Sushi Restaurant,Trail,Yoga Studio,Department Store,Drugstore,Donut Shop,Dog Run,Distribution Center


Cluster 3

In [31]:
toronto_final.loc[toronto_final['Cluster Labels'] == 2, toronto_final.columns[[1] + list(range(5, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
45,North York,2.0,Cafeteria,Department Store,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner


Cluster 4

In [32]:
toronto_final.loc[toronto_final['Cluster Labels'] == 3, toronto_final.columns[[1] + list(range(5, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
50,North York,3.0,Pizza Place,Yoga Studio,Department Store,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store


Cluster 5

In [33]:
toronto_final.loc[toronto_final['Cluster Labels'] == 4, toronto_final.columns[[1] + list(range(5, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,4.0,French Restaurant,Coffee Shop,Portuguese Restaurant,Hockey Arena,Intersection,Department Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run
2,Downtown Toronto,4.0,Coffee Shop,Breakfast Spot,Historic Site,Chocolate Shop,Bakery,Pub,Distribution Center,Farmers Market,Restaurant,Spa
3,North York,4.0,Clothing Store,Furniture / Home Store,Women's Store,Accessories Store,Boutique,Miscellaneous Shop,Event Space,Gift Shop,Vietnamese Restaurant,Coffee Shop
4,Downtown Toronto,4.0,Coffee Shop,Yoga Studio,Hobby Shop,Park,Sushi Restaurant,Italian Restaurant,Burrito Place,Creperie,Mexican Restaurant,Beer Bar
6,Scarborough,4.0,Fast Food Restaurant,Print Shop,Yoga Studio,Department Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
...,...,...,...,...,...,...,...,...,...,...,...,...
97,Downtown Toronto,4.0,Café,Coffee Shop,Restaurant,Bakery,Gym / Fitness Center,Gastropub,Deli / Bodega,Bookstore,Steakhouse,Gym
99,Downtown Toronto,4.0,Park,Theme Restaurant,Breakfast Spot,Bookstore,Salon / Barbershop,Beer Bar,Italian Restaurant,Restaurant,Ramen Restaurant,Bubble Tea Shop
100,East Toronto,4.0,Park,Auto Workshop,Skate Park,Light Rail Station,Smoke Shop,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Restaurant
101,Etobicoke,4.0,Breakfast Spot,Baseball Field,Home Service,Dim Sum Restaurant,Falafel Restaurant,Event Space,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop


### Analysis

KMeans is used to cluster the neighbourhoods from Downtown Toronto, Central Toronto, West Toronto and East Toronto into 5 distint clusters. Based on the visualization and tabular summary above, it can be noticed that Cluster 2 and Cluster 3 have much less data sample in comparison to ther other 3 clusters. This implies that the top venues in the neighbourhoods of Cluster 2 and 3 are very different from the other 3 clusters. More analysis needs to be done to explore find out what the difference is. We can also see that West Toronto and East Toronto have all been put into either cluster 0 or cluster 1. This implies that top venues these two boroughs are not as common in the other 3 clusters.