In [54]:
# Importing required libraries
import numpy as np
import pandas as pd
import requests  # library to handle requests
from bs4 import BeautifulSoup

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Getting data from wikipedia
* Requests library will be used

In [55]:
postal_page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(postal_page.text, 'html.parser')

table_data = []
table = soup.find_all('table')[0]
table_body = table.find('tbody')
rows = table_body.find_all('tr')

for row in rows:
    cols = [col.text.strip() for col in row.find_all('td')]
    table_data.append(cols)

postal_df = pd.DataFrame(table_data, columns=['PostalCode','Borough','Neighborhood'])
print(postal_df.head())

  PostalCode       Borough      Neighborhood
0       None          None              None
1        M1A  Not assigned      Not assigned
2        M2A  Not assigned      Not assigned
3        M3A    North York         Parkwoods
4        M4A    North York  Victoria Village


# Cleaning and processing dataframe

In [56]:
# Dropping empty cells using "PostalCode"
postal_df = postal_df.dropna(subset=['PostalCode'])

# Ignoring cells with a borough that is Not assigned.
postal_df = postal_df[~postal_df['Borough'].str.contains('Not assigned')]

# Grouping the Neighborhoods into one row
postal_df = postal_df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()

# Assigning borough as neighborhood, if a cell has a borough but a Not assigned neighborhood
postal_df['Neighborhood'] = np.where(postal_df['Neighborhood'].str.contains('Not assigned'), postal_df['Borough'], postal_df['Neighborhood'])

print(postal_df.head())
print(postal_df.shape)


  PostalCode      Borough                          Neighborhood
0        M1B  Scarborough                         Rouge,Malvern
1        M1C  Scarborough  Highland Creek,Rouge Hill,Port Union
2        M1E  Scarborough       Guildwood,Morningside,West Hill
3        M1G  Scarborough                                Woburn
4        M1H  Scarborough                             Cedarbrae
(103, 3)


## Adding Geospatial coordinates

In [57]:
geo_df = pd.read_csv('Geospatial_Coordinates.csv')
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})
print(geo_df.head())

  PostalCode   Latitude  Longitude
0        M1B  43.806686 -79.194353
1        M1C  43.784535 -79.160497
2        M1E  43.763573 -79.188711
3        M1G  43.770992 -79.216917
4        M1H  43.773136 -79.239476


In [58]:
# Merging postal data and geospatial coordinates 
toronto_df = postal_df.merge(geo_df, on='PostalCode', how='left')

print(toronto_df.head())

  PostalCode      Borough                          Neighborhood   Latitude  \
0        M1B  Scarborough                         Rouge,Malvern  43.806686   
1        M1C  Scarborough  Highland Creek,Rouge Hill,Port Union  43.784535   
2        M1E  Scarborough       Guildwood,Morningside,West Hill  43.763573   
3        M1G  Scarborough                                Woburn  43.770992   
4        M1H  Scarborough                             Cedarbrae  43.773136   

   Longitude  
0 -79.194353  
1 -79.160497  
2 -79.188711  
3 -79.216917  
4 -79.239476  


# Clustering

#### Checking the number of boroughs and neighborhouds

Filtering only boroughs that contain the word Toronto

In [59]:
toronto_df = toronto_df[~toronto_df['Borough'].str.contains('Toronto')]
print(toronto_df.head())

  PostalCode      Borough                          Neighborhood   Latitude  \
0        M1B  Scarborough                         Rouge,Malvern  43.806686   
1        M1C  Scarborough  Highland Creek,Rouge Hill,Port Union  43.784535   
2        M1E  Scarborough       Guildwood,Morningside,West Hill  43.763573   
3        M1G  Scarborough                                Woburn  43.770992   
4        M1H  Scarborough                             Cedarbrae  43.773136   

   Longitude  
0 -79.194353  
1 -79.160497  
2 -79.188711  
3 -79.216917  
4 -79.239476  


In [60]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_df['Borough'].unique()), toronto_df.shape[0]
    )
)

The dataframe has 7 boroughs and 65 neighborhoods.


#### Use geopy library to get the latitude and longitude values of Toronto

In [61]:
import certifi
import ssl
import geopy.geocoders

In [62]:
# Creating a SSL context, if not, geopy produces an error with SSL certificate
ctx = ssl.create_default_context(cafile=certifi.where())
geopy.geocoders.options.default_ssl_context = ctx

address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


#### Let's visualizate Toronto the neighborhoods in it.

In [63]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, label in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto


#### Now, I'm going to start utilizing the Foursquare API to explore the neighborhoods and segment them

In [64]:
CLIENT_ID = 'GRYPGFTPHX2GFL2L0GLN3W3EXBM3FORRA52M3YBGETW1NSDH' # your Foursquare ID
CLIENT_SECRET = 'BL0U24MID0ALWEZVA3VYFJQT4GD2DMMIYII2XEO3T0ZIDQSE' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: GRYPGFTPHX2GFL2L0GLN3W3EXBM3FORRA52M3YBGETW1NSDH
CLIENT_SECRET:BL0U24MID0ALWEZVA3VYFJQT4GD2DMMIYII2XEO3T0ZIDQSE


#### Let's explore the first neighborhood in our dataframe

In [65]:
neighborhood_latitude = toronto_df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_df.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = toronto_df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Rouge,Malvern are 43.806686299999996, -79.19435340000001.


#### Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.

In [66]:
# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=GRYPGFTPHX2GFL2L0GLN3W3EXBM3FORRA52M3YBGETW1NSDH&client_secret=BL0U24MID0ALWEZVA3VYFJQT4GD2DMMIYII2XEO3T0ZIDQSE&v=20180604&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100'

In [67]:
results = requests.get(url).json()

Using **get_category_type** function from the Foursquare lab.

In [68]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Cleaning the json and structure it into a *pandas* dataframe.

In [69]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.807448,-79.199056


In [70]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

1 venues were returned by Foursquare.


## 2. Explore Neighborhoods in Toronto

#### Repeating the same process to all the neighborhoods in Toronto

In [73]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [74]:
toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                 latitudes=toronto_df['Latitude'],
                                 longitudes=toronto_df['Longitude']
                                )

Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Clarks Corners,Sullivan,Tam O'Shanter
Agincourt North,L'Amoreaux East,Milliken,Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview,Henry Farm,Oriole
Bayview Village
Silver Hills,York Mills
Newtonbrook,Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
Leaside
Thorncliffe Park
East Toronto
Bedford Park,Lawrence Manor East
Lawrence Heights,Lawrence Ma

#### Exploring the resulting dataframe

In [75]:
print(toronto_venues.shape)
toronto_venues.head()

(537, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge,Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Guildwood,Morningside,West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Guildwood,Morningside,West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


Let's check how many venues were returned for each neighborhood

In [76]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Agincourt North,L'Amoreaux East,Milliken,Steeles East",2,2,2,2,2,2
"Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown",12,12,12,12,12,12
"Alderwood,Long Branch",10,10,10,10,10,10
"Bathurst Manor,Downsview North,Wilson Heights",18,18,18,18,18,18
Bayview Village,4,4,4,4,4,4
"Bedford Park,Lawrence Manor East",24,24,24,24,24,24
"Birch Cliff,Cliffside West",4,4,4,4,4,4
"Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe",6,6,6,6,6,6
"CFB Toronto,Downsview East",3,3,3,3,3,3


#### Let's find out how many unique categories can be curated from all the returned venues

In [28]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 156 uniques categories.


## 3. Analyzing Each Neighborhood

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [29]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bagel Shop,Bakery,...,Thrift / Vintage Store,Toy / Game Store,Trail,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wings Joint,Women's Store,Yoga Studio
0,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek,Rouge Hill,Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Highland Creek,Rouge Hill,Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood,Morningside,West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood,Morningside,West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [31]:
toronto_onehot.shape

(537, 157)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [33]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bagel Shop,Bakery,...,Thrift / Vintage Store,Toy / Game Store,Trail,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Alderwood,Long Branch",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Bathurst Manor,Downsview North,Wilson Heights",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000
5,Bayview Village,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,"Bedford Park,Lawrence Manor East",0.000000,0.000000,0.041667,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,"Birch Cliff,Cliffside West",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,"Bloordale Gardens,Eringate,Markland Wood,Old B...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,"CFB Toronto,Downsview East",0.000000,0.333333,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [35]:
toronto_grouped.shape

(62, 157)

#### Let's print each neighborhood along with the top 5 most common venues

In [83]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
            venue  freq
0  Sandwich Place  0.25
1  Breakfast Spot  0.25
2          Lounge  0.25
3  Clothing Store  0.25
4     Music Venue  0.00


----Agincourt North,L'Amoreaux East,Milliken,Steeles East----
                      venue  freq
0                Playground   0.5
1                      Park   0.5
2         Accessories Store   0.0
3                     Motel   0.0
4  Mediterranean Restaurant   0.0


----Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown----
                 venue  freq
0        Grocery Store  0.17
1       Sandwich Place  0.08
2          Pizza Place  0.08
3             Pharmacy  0.08
4  Japanese Restaurant  0.08


----Alderwood,Long Branch----
          venue  freq
0   Pizza Place   0.2
1  Dance Studio   0.1
2          Pool   0.1
3           Pub   0.1
4      Pharmacy   0.1


----Bathurst Manor,Downsview North,Wilson Heights----
                 venue  freq
0          Coffee Shop  0.11
1    

                  venue  freq
0    Chinese Restaurant  0.17
1  Fast Food Restaurant  0.17
2           Coffee Shop  0.08
3           Pizza Place  0.08
4              Pharmacy  0.08


----Lawrence Heights,Lawrence Manor----
                    venue  freq
0          Clothing Store  0.31
1             Coffee Shop  0.08
2      Miscellaneous Shop  0.08
3  Furniture / Home Store  0.08
4              Shoe Store  0.08


----Leaside----
                 venue  freq
0          Coffee Shop  0.10
1  Sporting Goods Shop  0.10
2         Burger Joint  0.06
3       Breakfast Spot  0.03
4       Clothing Store  0.03


----Maryvale,Wexford----
            venue  freq
0  Sandwich Place   0.2
1      Smoke Shop   0.2
2     Auto Garage   0.2
3  Breakfast Spot   0.2
4          Bakery   0.2


----Northwest----
                 venue  freq
0  Rental Car Location   0.5
1            Drugstore   0.5
2    Accessories Store   0.0
3        Movie Theater   0.0
4        Metro Station   0.0


----Northwood Park,York Uni

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [84]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [115]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Breakfast Spot,Clothing Store,Sandwich Place,Yoga Studio,Discount Store,Event Space,Empanada Restaurant,Electronics Store,Drugstore
1,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Playground,Park,Diner,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store
2,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Pizza Place,Beer Store,Fast Food Restaurant,Pharmacy,Discount Store,Coffee Shop,Liquor Store,Sandwich Place,Japanese Restaurant
3,"Alderwood,Long Branch",Pizza Place,Pharmacy,Gym,Skating Rink,Coffee Shop,Pool,Pub,Dance Studio,Sandwich Place,Yoga Studio
4,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Pizza Place,Fried Chicken Joint,Sandwich Place,Diner,Bridal Shop,Fast Food Restaurant,Restaurant,Supermarket,Deli / Bodega


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [116]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 3, 1, 1, 1, 1, 1, 1, 1, 2], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [117]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,1.0,Fast Food Restaurant,Yoga Studio,Diner,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store,Dim Sum Restaurant
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,1.0,History Museum,Bar,Flower Shop,Field,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,1.0,Pizza Place,Spa,Electronics Store,Breakfast Spot,Medical Center,Rental Car Location,Intersection,Mexican Restaurant,Diner,Drugstore
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1.0,Coffee Shop,Korean Restaurant,Yoga Studio,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1.0,Thai Restaurant,Caribbean Restaurant,Hakka Restaurant,Athletics & Sports,Fried Chicken Joint,Bakery,Bank,Event Space,Empanada Restaurant,Electronics Store


Finally, let's visualize the resulting clusters

In [118]:
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])

In [119]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

#### Cluster 1

In [121]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0.0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,North York,0.0,Pool,Mediterranean Restaurant,Athletics & Sports,Golf Course,Dog Run,Yoga Studio,Diner,Event Space,Empanada Restaurant,Electronics Store


#### Cluster 2

In [122]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1.0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,1.0,Fast Food Restaurant,Yoga Studio,Diner,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store,Dim Sum Restaurant
1,Scarborough,1.0,History Museum,Bar,Flower Shop,Field,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
2,Scarborough,1.0,Pizza Place,Spa,Electronics Store,Breakfast Spot,Medical Center,Rental Car Location,Intersection,Mexican Restaurant,Diner,Drugstore
3,Scarborough,1.0,Coffee Shop,Korean Restaurant,Yoga Studio,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store
4,Scarborough,1.0,Thai Restaurant,Caribbean Restaurant,Hakka Restaurant,Athletics & Sports,Fried Chicken Joint,Bakery,Bank,Event Space,Empanada Restaurant,Electronics Store
6,Scarborough,1.0,Discount Store,Hobby Shop,Bus Station,Coffee Shop,Department Store,Chinese Restaurant,Dog Run,Fast Food Restaurant,Event Space,Empanada Restaurant
7,Scarborough,1.0,Bus Line,Bakery,Soccer Field,Park,Bus Station,Fast Food Restaurant,Metro Station,Dog Run,Event Space,Empanada Restaurant
8,Scarborough,1.0,Motel,American Restaurant,Diner,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store
9,Scarborough,1.0,General Entertainment,Café,College Stadium,Skating Rink,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store,Diner
10,Scarborough,1.0,Indian Restaurant,Pet Store,Vietnamese Restaurant,Chinese Restaurant,Light Rail Station,Latin American Restaurant,Yoga Studio,Diner,Empanada Restaurant,Electronics Store


#### Cluster 3

In [123]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2.0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,North York,2.0,Fast Food Restaurant,Food & Drink Shop,Park,Bus Stop,Discount Store,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
30,North York,2.0,Airport,Park,Bus Stop,Yoga Studio,Diner,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
40,East York,2.0,Park,Convenience Store,Coffee Shop,Diner,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
72,North York,2.0,Japanese Restaurant,Park,Pub,Bakery,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
74,York,2.0,Park,Fast Food Restaurant,Women's Store,Pharmacy,Market,Diner,Event Space,Empanada Restaurant,Electronics Store,Drugstore
79,North York,2.0,Construction & Landscaping,Park,Bakery,Basketball Court,Discount Store,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore
90,Etobicoke,2.0,Park,River,Yoga Studio,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store
98,York,2.0,Park,Convenience Store,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store,Diner
100,Etobicoke,2.0,Pizza Place,Park,Bus Line,Mobile Phone Shop,Creperie,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop


#### Cluster 4

In [124]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2.0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,North York,2.0,Fast Food Restaurant,Food & Drink Shop,Park,Bus Stop,Discount Store,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
30,North York,2.0,Airport,Park,Bus Stop,Yoga Studio,Diner,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
40,East York,2.0,Park,Convenience Store,Coffee Shop,Diner,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
72,North York,2.0,Japanese Restaurant,Park,Pub,Bakery,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
74,York,2.0,Park,Fast Food Restaurant,Women's Store,Pharmacy,Market,Diner,Event Space,Empanada Restaurant,Electronics Store,Drugstore
79,North York,2.0,Construction & Landscaping,Park,Bakery,Basketball Court,Discount Store,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore
90,Etobicoke,2.0,Park,River,Yoga Studio,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store
98,York,2.0,Park,Convenience Store,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store,Diner
100,Etobicoke,2.0,Pizza Place,Park,Bus Line,Mobile Phone Shop,Creperie,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop


#### Cluster 5

In [125]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2.0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,North York,2.0,Fast Food Restaurant,Food & Drink Shop,Park,Bus Stop,Discount Store,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
30,North York,2.0,Airport,Park,Bus Stop,Yoga Studio,Diner,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
40,East York,2.0,Park,Convenience Store,Coffee Shop,Diner,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
72,North York,2.0,Japanese Restaurant,Park,Pub,Bakery,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run
74,York,2.0,Park,Fast Food Restaurant,Women's Store,Pharmacy,Market,Diner,Event Space,Empanada Restaurant,Electronics Store,Drugstore
79,North York,2.0,Construction & Landscaping,Park,Bakery,Basketball Court,Discount Store,Fast Food Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore
90,Etobicoke,2.0,Park,River,Yoga Studio,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store
98,York,2.0,Park,Convenience Store,Dim Sum Restaurant,Event Space,Empanada Restaurant,Electronics Store,Drugstore,Dog Run,Discount Store,Diner
100,Etobicoke,2.0,Pizza Place,Park,Bus Line,Mobile Phone Shop,Creperie,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
