# Explore and Cluster Neighbourhoods in Toronto

## 1. Get Neighbourhood Data and Store in a Pandas Dataframe

In [2]:
# import dependencies
import os

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes
import folium

from bs4 import BeautifulSoup
import requests

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge geocoder --yes
import geocoder
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from pandas.io.json import json_normalize

print("Libraries imported")

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.3.1               |             py_0          25 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following NEW packages will

#### Get the url with Toronto neighbourhood data

In [87]:
# download the Wikipedia page that lists the Canadian postal codes starting with "M"
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# fail loudly on an HTTP error instead of parsing an error page as if it held the table
response.raise_for_status()
source = response.text

# create a beautiful soup instance
soup = BeautifulSoup(source, 'html5lib')

In [88]:
# locate the page's content container, then keep the first table found inside it
content_div = soup.find('div', class_='mw-content-ltr')
site_table = content_div.table

#### Store the column names from the website in a list to later make a pandas dataframe

In [89]:
# collect the text of every header cell, with trailing whitespace/newlines removed
col_names = [heading.text.rstrip() for heading in site_table.find_all('th')]

col_names

['Postcode', 'Borough', 'Neighbourhood']

In [90]:
# replace 'Postcode' (the first scraped header, per the output above) with 'PostalCode'
col_names[0] = 'PostalCode'

col_names

['PostalCode', 'Borough', 'Neighbourhood']

#### Store the table entries from the website in a list

In [91]:
# flatten every table cell into one list of strings, dropping trailing whitespace/newlines
data_entries = [cell.text.rstrip() for cell in site_table.find_all('td')]

data_entries[:9]

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods']

#### Create an array for the entries to easily reshape for a pandas dataframe

In [92]:
# every three consecutive cells (postal code, borough, neighbourhood) form one table row
n_rows = len(data_entries) // 3
data_array = np.array(data_entries).reshape(n_rows, 3)

data_array[:5]

array([['M1A', 'Not assigned', 'Not assigned'],
       ['M2A', 'Not assigned', 'Not assigned'],
       ['M3A', 'North York', 'Parkwoods'],
       ['M4A', 'North York', 'Victoria Village'],
       ['M5A', 'Downtown Toronto', 'Harbourfront']],
      dtype='<U49')

#### Create a pandas dataframe with the Toronto neighbourhood data

In [93]:
# build the dataframe from the reshaped cell array, labelling the columns with the scraped header names
df = pd.DataFrame(data_array, columns=col_names)

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [55]:
# view the shape of the dataframe as (rows, columns)
df.shape

(288, 3)

#### Eliminate boroughs with 'not assigned' values

In [98]:
# keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [95]:
# view the shape after removing the rows with an unassigned borough
df.shape

(211, 3)

#### Get the index of the "Not assigned" value in the Neighbourhood column and replace it with the Borough name

In [101]:
# position of the 'Not assigned' label within the Neighbourhood column
# NOTE(review): get_loc yields a single integer only while that label occurs exactly once — confirm if the data changes
pd.Index(df["Neighbourhood"]).get_loc('Not assigned')

6

In [110]:
# replace every "Not assigned" neighbourhood with the name of its borough
# (a boolean mask finds the affected rows, so no row/column positions are hard-coded,
#  and assign() builds a new frame instead of writing into a filtered slice)
unassigned = df['Neighbourhood'] == 'Not assigned'
df = df.assign(Neighbourhood=df['Neighbourhood'].where(~unassigned, df['Borough']))

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [111]:
# view the shape as (rows, columns); the cell above replaces a value and removes no rows
df.shape

(211, 3)

#### Join neighbourhoods with the same postal code into the same row

In [120]:
# collapse rows sharing a postal code: keep the first borough and join the
# neighbourhood names into one comma-separated string (original row order kept)
by_postal_code = df.groupby(df['PostalCode'], sort=False)
merged_rows = by_postal_code.aggregate({'Borough': 'first', 'Neighbourhood': lambda names: ', '.join(names)})
tor_neigh_df = merged_rows.reset_index()

tor_neigh_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [121]:
# view the shape of the dataframe: one row per postal code after grouping
tor_neigh_df.shape

(103, 3)

## 2. Get the latitude and longitude of the neighbourhoods and add them to the dataframe

In [124]:
# import the geocoder library
# NOTE(review): the first cell of this notebook already runs this same conda install and import,
# so this cell repeats that setup.
!conda install -c conda-forge geocoder --yes
import geocoder

print('geocoder ready')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ratelim-0.1.6              |           py35_0           5 KB  conda-forge
    geocoder-1.38.1            |             py_0          52 KB  conda-forge
    orderedset-2.0             |           py35_0         685 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         742 KB

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0-py35_0   conda-forge
    ratelim:    0.1.6-py35_0 conda-forge


Downloading and Extracting Packages
ratelim-0.1.6        | 5 KB      | ##################################### | 100% 
geocoder-1.38.1      | 52 KB     | #######################

In [136]:
# maximum number of geocoding requests per postal code before giving up
MAX_GEOCODE_ATTEMPTS = 10

# create lists to hold latitude and longitude values
latitude = []
longitude = []

# loop through the postal codes
for p_code in tor_neigh_df.PostalCode:
    lat_lng = None
    # latlng can come back as None, so retry — but only a bounded number of times
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        coords = geocoder.arcgis('{}, Toronto, Ontario'.format(p_code))
        lat_lng = coords.latlng
        if lat_lng is not None:
            break
    if lat_lng is None:
        # stop with a clear message instead of looping forever when no result ever arrives
        raise RuntimeError(
            'Could not geocode postal code {} after {} attempts'.format(p_code, MAX_GEOCODE_ATTEMPTS))
    latitude.append(lat_lng[0])
    longitude.append(lat_lng[1])

In [141]:
# attach the geocoded coordinates to the dataframe as two new columns
for column_name, values in (('Latitude', latitude), ('Longitude', longitude)):
    tor_neigh_df[column_name] = values

tor_neigh_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75244,-79.329271
1,M4A,North York,Victoria Village,43.730421,-79.31332
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65512,-79.36264
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.723125,-79.451589
4,M7A,Queen's Park,Queen's Park,43.661102,-79.391035
5,M9A,Etobicoke,Islington Avenue,43.662242,-79.528379
6,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
7,M3B,North York,Don Mills North,43.749195,-79.361905
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.707535,-79.311773
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657363,-79.37818


## 3. Explore and Cluster Neighbourhoods in Toronto

In [142]:
# view the distinct Borough names present in the dataframe
tor_neigh_df.Borough.unique()

array(['North York', 'Downtown Toronto', "Queen's Park", 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

#### Explore the Boroughs of Toronto with the name "Toronto" in them

In [157]:
# create a new dataframe holding only the Boroughs whose name ends with "Toronto"
is_toronto_borough = tor_neigh_df['Borough'].str.endswith("Toronto")
toronto_data = tor_neigh_df[is_toronto_borough]

toronto_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65512,-79.36264
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657363,-79.37818
15,M5C,Downtown Toronto,St. James Town,43.65121,-79.375481
19,M4E,East Toronto,The Beaches,43.676845,-79.295225
20,M5E,Downtown Toronto,Berczy Park,43.64516,-79.373675
24,M5G,Downtown Toronto,Central Bay Street,43.656091,-79.38493
25,M6G,Downtown Toronto,Christie,43.668781,-79.42071
30,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.649515,-79.382503
31,M6H,West Toronto,"Dovercourt Village, Dufferin",43.665087,-79.438705
36,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.62347,-79.391507


#### Make sure the dataframe has the correct Boroughs: Downtown Toronto, East Toronto, West Toronto and Central Toronto

In [158]:
# list the distinct boroughs that survived the filter
toronto_data['Borough'].unique()

array(['Downtown Toronto', 'East Toronto', 'West Toronto',
       'Central Toronto'], dtype=object)

#### Use geopy to get the latitude and longitude of Toronto

In [159]:
address = 'Toronto, ONT'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; stop here with a readable message
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))

# NOTE: these scalars rebind the names of the latitude/longitude lists built earlier in the notebook
# NOTE(review): the recorded longitude (-79.63) lies west of every neighbourhood longitude
# displayed above — confirm this address resolves to the intended map centre
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6781255, -79.6321235335026.


#### Create a map of Toronto with neighbourhoods superimposed on top 

In [161]:
# create a map of Toronto centred on the latitude and longitude values found above
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per neighbourhood, with a "neighbourhood, borough" popup
marker_rows = zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Borough'],
    toronto_data['Neighbourhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

#### Define Foursquare credentials and versions

In [163]:
# Foursquare credentials are read from the environment so that no secret is stored in the notebook.
# NOTE(review): the keys that used to be hard-coded here should be treated as exposed and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20190605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    print('Foursquare credentials not found: set FOURSQUARE_CLIENT_ID and '
          'FOURSQUARE_CLIENT_SECRET before calling the API.')

#### Create a function to get the top 100 venues in the chosen boroughs of Toronto

In [164]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names : iterable
        Label of each location; copied into the 'Neighbourhood' column.
    latitudes, longitudes : iterable
        Coordinates of each location, aligned element-wise with ``names``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    LIMIT : int, default 100
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is returned at all.
        'Venue Category' is None for a venue that lists no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # surface quota/credential problems here rather than as a KeyError further down
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories no longer raises IndexError
                categories[0]['name'] if categories else None,
            ))

    # passing the column names to the constructor also works when venue_rows is empty
    return pd.DataFrame(venue_rows, columns=column_names)


#### Call the function above to create a new dataframe called toronto_venues 

In [165]:
# fetch the venues around every neighbourhood in toronto_data, using the
# function's default radius and limit; each neighbourhood name is printed as it is queried
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                 latitudes=toronto_data['Latitude'],
                                 longitudes=toronto_data['Longitude']
                                )

Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
The Danforth West, Riverdale
Design Exchange, Toronto Dominion Centre
Brockton, Exhibition Place, Parkdale Village
The Beaches West, India Bazaar
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North, Forest Hill West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
Harbord, University of Toronto
Runnymede, Swansea
Moore Park, Summerhill East
Chinatown, Grange Park, Kensington Market
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown, St. James Town
Fir

#### View the size of the dataframe and the first 5 entries

In [166]:
# (rows, columns) of the venue table, followed by its first rows
print(toronto_venues.shape)

toronto_venues.head()

(1737, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront, Regent Park",43.65512,-79.36264,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65512,-79.36264,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65512,-79.36264,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,"Harbourfront, Regent Park",43.65512,-79.36264,Cocina Economica,43.654959,-79.365657,Mexican Restaurant
4,"Harbourfront, Regent Park",43.65512,-79.36264,Body Blitz Spa East,43.654735,-79.359874,Spa


#### Check how many venues were returned for each neighbourhood

In [167]:
# count the non-null entries of every column per neighbourhood (i.e. venues returned for each one)
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,63,63,63,63,63,63
"Brockton, Exhibition Place, Parkdale Village",67,67,67,67,67,67
Business Reply Mail Processing Centre 969 Eastern,100,100,100,100,100,100
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",68,68,68,68,68,68
"Cabbagetown, St. James Town",43,43,43,43,43,43
Central Bay Street,100,100,100,100,100,100
"Chinatown, Grange Park, Kensington Market",94,94,94,94,94,94
Christie,9,9,9,9,9,9
Church and Wellesley,82,82,82,82,82,82


#### View the number of unique categories

In [168]:
# number of distinct values in the 'Venue Category' column
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 212 uniques categories.


#### One hot encode the venues and store in a dataframe to analyze the neighbourhoods

In [169]:
# one hot encoding: one indicator column per venue category, named after the category itself
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# append the neighbourhood of each venue as an extra (last) column
category_dummies['Neighbourhood'] = toronto_venues['Neighbourhood']

# reorder so that the last column comes first and all other columns follow
leading_column = category_dummies.columns[-1]
remaining_columns = list(category_dummies.columns[:-1])
toronto_onehot = category_dummies[[leading_column] + remaining_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [170]:
# preview the one-hot frame; the former self-assignment
# (toronto_onehot['Neighbourhood'] = toronto_onehot['Neighbourhood']) changed nothing and was removed
toronto_onehot.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Group the rows by neighbourhood, taking the mean of the frequency of occurrence of each category for analysis purposes

In [171]:
# average each indicator column per neighbourhood, then turn the Neighbourhood index back into a column
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.014925,0.0,0.014925,0.0,0.0,...,0.0,0.0,0.0,0.029851,0.0,0.014925,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.03,0.0,0.01,0.0,0.01,0.02,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014706,0.0,...,0.0,0.0,0.014706,0.014706,0.0,0.0,0.0,0.0,0.0,0.014706


#### View the size of the dataframe

In [172]:
# (neighbourhoods, Neighbourhood column + one column per venue category)
toronto_grouped.shape

(37, 213)

#### Print each neighborhood along with the top 5 most common venues

In [174]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # transpose the neighbourhood's row into (venue, freq) pairs
    hood_freqs = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    hood_freqs.columns = ['venue','freq']
    # the first pair is the Neighbourhood label itself, so it is skipped
    top_venues = (
        hood_freqs.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print(top_venues)
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0                 Café  0.07
1                Hotel  0.07
2          Coffee Shop  0.06
3            Gastropub  0.03
4  American Restaurant  0.03


----Berczy Park----
                venue  freq
0         Coffee Shop  0.11
1        Cocktail Bar  0.05
2          Restaurant  0.05
3              Bakery  0.03
4  Seafood Restaurant  0.03


----Brockton, Exhibition Place, Parkdale Village----
                    venue  freq
0             Coffee Shop  0.09
1                    Café  0.06
2  Furniture / Home Store  0.06
3          Sandwich Place  0.04
4                     Bar  0.04


----Business Reply Mail Processing Centre 969 Eastern----
         venue  freq
0  Coffee Shop  0.07
1         Café  0.04
2          Bar  0.04
3   Steakhouse  0.04
4        Hotel  0.04


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
                venue  freq
0         Coffee Shop  0.

#### Create a function to sort the venues in descending order.

In [175]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    ``row`` is a pandas Series whose first element is not a frequency (the
    callers in this notebook pass a row whose first entry is the neighbourhood
    name). The remaining entries are sorted in descending order and the index
    labels of the first ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

#### Create a new dataframe and display the top 10 venues for each neighborhood.

In [177]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd' and every later rank takes 'th'; an explicit
    # bounds check replaces the bare except, which would also have hidden unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill each neighbourhood's row with its most common venue categories, most frequent first
for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Hotel,Café,Coffee Shop,Japanese Restaurant,Restaurant,Burger Joint,Breakfast Spot,Steakhouse,Deli / Bodega,Bakery
1,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Italian Restaurant,Steakhouse,Café,Beer Bar,Seafood Restaurant,Cheese Shop,Bakery
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Furniture / Home Store,Café,Restaurant,Sandwich Place,Bar,Hotel,Bakery,Italian Restaurant,Supermarket
3,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Hotel,Bar,Steakhouse,Café,American Restaurant,Italian Restaurant,Pub,Pizza Place,Japanese Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Coffee Shop,Italian Restaurant,Bar,Restaurant,Gym / Fitness Center,Café,Sandwich Place,Speakeasy,Park,Pub


### Cluster the neighbourhoods

#### Run k-means to cluster the neighbourhoods into 4 clusters

In [178]:
# set number of clusters
kclusters = 4

# drop the neighbourhood label so only the category frequencies are clustered
# (columns= replaces the positional axis argument, which newer pandas versions reject)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

#### Check that there are 4 different cluster groups

In [185]:
# list the distinct labels assigned by k-means
print("The unique cluster group labels: ", np.unique(kmeans.labels_))

The unique cluster group labels:  [0 1 2 3]


#### Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [186]:
# add clustering labels as the first column; a column left behind by a previous run
# is dropped first, because DataFrame.insert raises when the column already exists
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop(columns='Cluster Labels')
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data


toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65512,-79.36264
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657363,-79.37818
15,M5C,Downtown Toronto,St. James Town,43.65121,-79.375481
19,M4E,East Toronto,The Beaches,43.676845,-79.295225
20,M5E,Downtown Toronto,Berczy Park,43.64516,-79.373675


#### Join the cluster labels and top 10 venues (neighbourhoods_venues_sorted) onto toronto_data, which holds the latitude/longitude of each neighbourhood

In [189]:
# join from toronto_data (not from toronto_merged itself) so that re-running this cell
# does not try to add the same cluster/venue columns a second time
toronto_merged = toronto_data.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65512,-79.36264,1.0,Coffee Shop,Restaurant,Breakfast Spot,Yoga Studio,Theater,Pub,Electronics Store,Event Space,Mexican Restaurant,Food Truck
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657363,-79.37818,1.0,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Tea Room,Furniture / Home Store,Fast Food Restaurant,Italian Restaurant,Japanese Restaurant
15,M5C,Downtown Toronto,St. James Town,43.65121,-79.375481,1.0,Café,Hotel,Restaurant,Coffee Shop,Bakery,Cosmetics Shop,Seafood Restaurant,Cocktail Bar,Breakfast Spot,Gastropub
19,M4E,East Toronto,The Beaches,43.676845,-79.295225,1.0,Health Food Store,Other Great Outdoors,Trail,Pub,Neighborhood,Yoga Studio,Ethiopian Restaurant,Food,Flower Shop,Fish Market
20,M5E,Downtown Toronto,Berczy Park,43.64516,-79.373675,1.0,Coffee Shop,Cocktail Bar,Restaurant,Italian Restaurant,Steakhouse,Café,Beer Bar,Seafood Restaurant,Cheese Shop,Bakery


#### Check for missing values

In [200]:
# boolean frame marking, cell by cell, where values are missing
# NOTE(review): this displays every row; a per-column summary would be easier to scan
toronto_merged.isnull()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
15,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
19,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
20,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
24,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
25,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
30,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
31,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
36,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


#### Drop the row at index 62 as it has null values

In [210]:
# drop the rows that received no cluster label from the join, without hard-coding an
# index label; unlike drop(62, inplace=True) this can be re-run without raising
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])

#### Change the cluster labels to int

In [219]:
# store the cluster labels as 32-bit integers
toronto_merged = toronto_merged.astype({'Cluster Labels': 'int32'})

#### Visualize the clusters

In [221]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one colour per cluster, evenly spaced on the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels are 0-based, so the palette is indexed directly; cluster-1 only worked
    # for label 0 because index -1 wraps around to the last colour
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examine the Clusters

#### Cluster 1

In [222]:
# rows labelled 0 by k-means; the selector keeps column 1 plus columns 4 onward
# NOTE(review): per the table below this shows Borough and Longitude but not Neighbourhood or Latitude — confirm intended
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
36,Downtown Toronto,-79.391507,0,Harbor / Marina,American Restaurant,Park,Athletics & Sports,Music Venue,Yoga Studio,Falafel Restaurant,Food Court,Food & Drink Shop,Food
41,East Toronto,-79.35512,0,Bus Line,Discount Store,Park,Grocery Store,Fast Food Restaurant,Yoga Studio,Falafel Restaurant,Food Court,Food & Drink Shop,Food
69,West Toronto,-79.462874,0,Sandwich Place,Park,Convenience Store,Event Space,Food Court,Food & Drink Shop,Food,Flower Shop,Fish Market,Fish & Chips Shop
73,Central Toronto,-79.40696,0,Playground,Park,Garden,Gym Pool,Event Space,Food & Drink Shop,Food,Flower Shop,Fish Market,Fish & Chips Shop
91,Downtown Toronto,-79.378474,0,Playground,Park,Gym / Fitness Center,Bank,Dessert Shop,Diner,Food Court,Food & Drink Shop,Food,Flower Shop


Cluster 1 above may be identified by its playgrounds and parks, which dominate the top 3 common venues.

#### Cluster 2

In [223]:
# rows labelled 1 by k-means; the selector keeps column 1 plus columns 4 onward
# NOTE(review): per the table below this shows Borough and Longitude but not Neighbourhood or Latitude — confirm intended
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,-79.36264,1,Coffee Shop,Restaurant,Breakfast Spot,Yoga Studio,Theater,Pub,Electronics Store,Event Space,Mexican Restaurant,Food Truck
9,Downtown Toronto,-79.37818,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Tea Room,Furniture / Home Store,Fast Food Restaurant,Italian Restaurant,Japanese Restaurant
15,Downtown Toronto,-79.375481,1,Café,Hotel,Restaurant,Coffee Shop,Bakery,Cosmetics Shop,Seafood Restaurant,Cocktail Bar,Breakfast Spot,Gastropub
19,East Toronto,-79.295225,1,Health Food Store,Other Great Outdoors,Trail,Pub,Neighborhood,Yoga Studio,Ethiopian Restaurant,Food,Flower Shop,Fish Market
20,Downtown Toronto,-79.373675,1,Coffee Shop,Cocktail Bar,Restaurant,Italian Restaurant,Steakhouse,Café,Beer Bar,Seafood Restaurant,Cheese Shop,Bakery
24,Downtown Toronto,-79.38493,1,Coffee Shop,Clothing Store,Cosmetics Shop,Sushi Restaurant,Middle Eastern Restaurant,Tea Room,Bakery,Bubble Tea Shop,Fast Food Restaurant,Italian Restaurant
25,Downtown Toronto,-79.42071,1,Café,Grocery Store,Playground,Italian Restaurant,Baby Store,Coffee Shop,Falafel Restaurant,Food Court,Food & Drink Shop,Food
30,Downtown Toronto,-79.382503,1,Hotel,Café,Coffee Shop,Japanese Restaurant,Restaurant,Burger Joint,Breakfast Spot,Steakhouse,Deli / Bodega,Bakery
31,West Toronto,-79.438705,1,Park,Bakery,Smoke Shop,Middle Eastern Restaurant,Café,Fast Food Restaurant,Liquor Store,Furniture / Home Store,Brazilian Restaurant,Supermarket
37,West Toronto,-79.417742,1,Bar,Coffee Shop,Asian Restaurant,Cocktail Bar,Restaurant,Vietnamese Restaurant,New American Restaurant,Pizza Place,Bakery,Men's Store


Cluster 2 has a predominance of coffee shops, restaurants, bars and hotels. It is likely popular for tourists and for eating out. 

#### Cluster 3

In [224]:
# rows labelled 2 by k-means; the selector keeps column 1 plus columns 4 onward
# NOTE(review): per the table below this shows Borough and Longitude but not Neighbourhood or Latitude — confirm intended
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
61,Central Toronto,-79.387085,2,Bus Line,Swim School,Yoga Studio,Falafel Restaurant,Food Court,Food & Drink Shop,Food,Flower Shop,Fish Market,Fish & Chips Shop


Cluster 3 has only one neighbourhood. With a bus line and activities (swim school, yoga studio) it may be more suburban.

#### Cluster 4

In [225]:
# rows labelled 3 by k-means; the selector keeps column 1 plus columns 4 onward
# NOTE(review): per the table below this shows Borough and Longitude but not Neighbourhood or Latitude — confirm intended
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
68,Central Toronto,-79.414405,3,Park,Yoga Studio,Event Space,Food Court,Food & Drink Shop,Food,Flower Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant


Cluster 4 also has but one neighbourhood. It appears to be event driven, with a park, yoga studio and event space in the top 3.