##  Clustering Neighborhoods in Toronto

In [1]:
import sys
!{sys.executable} -m pip install geocoder
!{sys.executable} -m pip install folium



In [2]:
pip install BeautifulSoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
from bs4 import BeautifulSoup 

In [4]:

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import geocoder # import geocoder
import requests 
from bs4 import BeautifulSoup 
import xml
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

from IPython.display import display_html

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

### Extract data form wiki

In [26]:
source = requests.get('http://zims-en.kiwix.campusafrica.gos.orange.com/wikipedia_en_all_nopic/A/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source,'lxml')
print(soup.title)
from IPython.display import display_html
tab = str(soup.table)
display_html(tab,raw=True)

<title>List of postal codes of Canada: M</title>


Postcode,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor
M7A,Queen's Park,Not assigned
M8A,Not assigned,Not assigned
M9A,Downtown Toronto,Queen's Park


##### View of data in dataframe

In [28]:
dfs = pd.read_html(tab)
df=dfs[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Depurating data

In [29]:
# Dropping the rows where Borough is 'Not assigned'
df1 = df[df.Borough != 'Not assigned']

# Combining the neighbourhoods with same Postalcode
df2 = df1.groupby(['Postcode','Borough'], sort=False).agg(', '.join)
df2.reset_index(inplace=True)

# Replacing the name of the neighbourhoods which are 'Not assigned' with names of Borough
df2['Neighbourhood'] = np.where(df2['Neighbourhood'] == 'Not assigned',df2['Borough'], df2['Neighbourhood'])

df2


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


# 1st Data Frame with 103 rows, 3 columns

In [30]:
# Shape of data frame
df2.shape

(103, 3)

In [31]:
#Importing the csv file conatining the latitudes and longitudes for various neighbourhoods in Canada
lat_lon = pd.read_csv('https://cocl.us/Geospatial_data')
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Creating a Dataframe adding latitude and longitude

In [114]:
lat_lon.rename(columns={'Postal Code':'Postcode'},inplace=True)
df3 = pd.merge(df2,lat_lon,on='Postcode')
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494



Toronto Map

In [33]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[43.653963, -79.387207], zoom_start=10)
map_toronto

In [36]:
for lat, lng, borough, neighborhood in zip(
        df_toronto['Latitude'], 
        df_toronto['Longitude'], 
        df_toronto['Borough'], 
        df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

### We are going to work only with Tronto Borough

In [44]:
# "denc" = [D]owntown Toronto, [E]ast Toronto, [N]orth Toronto, [C]entral Toronto
df_toronto_denc = df_toronto[df_toronto['Borough'].str.contains("Toronto")].reset_index(drop=True)
df_toronto_denc

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
9,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259


In [38]:
df_toronto_denc.shape

(39, 5)

In [40]:
map_toronto_denc = folium.Map(location=[43.653963, -79.387207], zoom_start=12)
for lat, lng, borough, neighborhood in zip(
        df_toronto_denc['Latitude'], 
        df_toronto_denc['Longitude'], 
        df_toronto_denc['Borough'], 
        df_toronto_denc['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_denc)  

map_toronto_denc

### Explore Richmond, Adelaide, King Neigborhood

In [41]:
CLIENT_ID = 'JWOZC2AQDRUIOV213C2N0JDIJZG1FHVBOZF52NVACQMDD22E'
CLIENT_SECRET = '0CDHNDSOMPLV5QJISLJCTWT0WPLDNS51EL4UTO3P1CPC5XXB'
VERSION = '20180605'

In [45]:
neighborhood_name = df_toronto_denc.loc[8, 'Neighbourhood']
print(f" neighborhood's choose is '{neighborhood_name}'.")

 neighborhood's choose is 'Adelaide, King, Richmond'.


In [46]:
neighborhood_latitude = df_toronto_denc.loc[8, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_toronto_denc.loc[8, 'Longitude'] # neighborhood longitude value

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Adelaide, King, Richmond are 43.65057120000001, -79.3845675.


### Obtaining the top 100 venues of Adelaide, King and Richmond

In [48]:

VERSION = '20180605' # Foursquare API version

neighborhood_name = df_toronto_denc.loc[18, 'Neighbourhood'] # neighborhood name
neighborhood_latitude = df_toronto_denc.loc[18, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_toronto_denc.loc[18, 'Longitude'] # neighborhood longitude value

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

radius = 500 # define radius
LIMIT = 100 # limit of number of venues returned by Foursquare API

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
     CLIENT_ID, 
     CLIENT_SECRET, 
     VERSION, 
     43.65057120000001, 
     -79.3845675, 
     radius, 
     LIMIT)

#url = 'https://api.foursquare.com/v2/venues/explore?&client_id=JWOZC2AQDRUIOV213C2N0JDIJZG1FHVBOZF52NVACQMDD22E&client_secret=0CDHNDSOMPLV5QJISLJCTWT0WPLDNS51EL4UTO3P1CPC5XXB&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

results = requests.get(url).json()
results

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


{'meta': {'code': 200, 'requestId': '604e2289c078f966fe5d32be'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Financial District',
  'headerFullLocation': 'Financial District, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 95,
  'suggestedBounds': {'ne': {'lat': 43.65507120450001,
    'lng': -79.37835988143604},
   'sw': {'lat': 43.64607119550001, 'lng': -79.39077511856397}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad4c05ef964a520a6f620e3',
       'name': 'Nathan Phillips Square',
       'location': {'address': '100 Queen St W',
        'crossStreet': 'at Bay St',
        'lat': 43.65227047322295,
        'lng': -79.38351631164551,
        'l

## 1st Clustering in Toronto Downtown, east, north and central

In [54]:
k=5
toronto_clustering = df_toronto_denc.drop(['Postcode','Borough','Neighbourhood'],1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_clustering)
kmeans.labels_
df_toronto_denc.insert(0, 'Cluster Labels', kmeans.labels_)

In [55]:
df_toronto_denc

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,4,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
2,3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,2,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,0,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,3,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
9,0,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259


In [58]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(df_toronto_denc['Latitude'], df_toronto_denc['Longitude'], df_toronto_denc['Neighbourhood'], df_toronto_denc['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Viewing detail for each Cluster

In [63]:
# CLUSTER 1

df_toronto_denc.loc[df_toronto_denc['Cluster Labels'] == 0, df_toronto_denc.columns[[3] + list(range(0, df_toronto_denc.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,Postcode,Borough,Neighbourhood.1,Latitude,Longitude
7,Christie,0,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,"Dovercourt Village, Dufferin",0,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
11,"Little Portugal, Trinity",0,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975
14,"Brockton, Exhibition Place, Parkdale Village",0,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
22,"High Park, The Junction South",0,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
25,"Parkdale, Roncesvalles",0,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325


In [64]:
# CLUSTER 2

df_toronto_denc.loc[df_toronto_denc['Cluster Labels'] == 1, df_toronto_denc.columns[[3] + list(range(0, df_toronto_denc.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,Postcode,Borough,Neighbourhood.1,Latitude,Longitude
18,Lawrence Park,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
19,Roselawn,1,M5N,Central Toronto,Roselawn,43.711695,-79.416936
20,Davisville North,1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
21,"Forest Hill North, Forest Hill West",1,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307
23,North Toronto West,1,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
26,Davisville,1,M4S,Central Toronto,Davisville,43.704324,-79.38879
29,"Moore Park, Summerhill East",1,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
31,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",1,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [65]:
# CLUSTER 3

df_toronto_denc.loc[df_toronto_denc['Cluster Labels'] == 2, df_toronto_denc.columns[[3] + list(range(0, df_toronto_denc.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,Postcode,Borough,Neighbourhood.1,Latitude,Longitude
4,The Beaches,2,M4E,East Toronto,The Beaches,43.676357,-79.293031
12,"The Danforth West, Riverdale",2,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
15,"The Beaches West, India Bazaar",2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
17,Studio District,2,M4M,East Toronto,Studio District,43.659526,-79.340923
38,Business Reply Mail Processing Centre 969 Eastern,2,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558


In [66]:
# CLUSTER 4

df_toronto_denc.loc[df_toronto_denc['Cluster Labels'] == 3, df_toronto_denc.columns[[3] + list(range(0, df_toronto_denc.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,Postcode,Borough,Neighbourhood.1,Latitude,Longitude
0,Harbourfront,3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
2,"Ryerson, Garden District",3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,St. James Town,3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
5,Berczy Park,3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,Central Bay Street,3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,"Adelaide, King, Richmond",3,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
10,"Harbourfront East, Toronto Islands, Union Station",3,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752
13,"Design Exchange, Toronto Dominion Centre",3,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",43.647177,-79.381576
16,"Commerce Court, Victoria Hotel",3,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817
24,"The Annex, North Midtown, Yorkville",3,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


In [67]:
# CLUSTER 5

df_toronto_denc.loc[df_toronto_denc['Cluster Labels'] == 4, df_toronto_denc.columns[[3] + list(range(0, df_toronto_denc.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,Postcode,Borough,Neighbourhood.1,Latitude,Longitude
1,Queen's Park,4,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
28,"Runnymede, Swansea",4,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445


## Obtaining information from Foursquare

In [68]:
# From the Foursquare lab in the previous module, we know that all the information is in the items key. 
# Before we proceed, let's borrow the get_category_type function from the Foursquare lab.

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [69]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

  


Unnamed: 0,name,categories,lat,lng
0,Nathan Phillips Square,Plaza,43.652270,-79.383516
1,The Keg Steakhouse + Bar - York Street,Restaurant,43.649987,-79.384103
2,Four Seasons Centre for the Performing Arts,Concert Hall,43.650592,-79.385806
3,Shangri-La Toronto,Hotel,43.649129,-79.386557
4,Rosalinda,Vegetarian / Vegan Restaurant,43.650252,-79.385156
...,...,...,...,...
90,Fat Bastard Burrito Co.,Burrito Place,43.648147,-79.389219
91,McEwan Foods,Deli / Bodega,43.647160,-79.381044
92,Soup Nutsy,Soup Place,43.647858,-79.380533
93,Starbucks,Coffee Shop,43.646891,-79.381871


In [70]:
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  """


Unnamed: 0,name,categories,lat,lng
0,Nathan Phillips Square,Plaza,43.65227,-79.383516
1,The Keg Steakhouse + Bar - York Street,Restaurant,43.649987,-79.384103
2,Four Seasons Centre for the Performing Arts,Concert Hall,43.650592,-79.385806
3,Shangri-La Toronto,Hotel,43.649129,-79.386557
4,Rosalinda,Vegetarian / Vegan Restaurant,43.650252,-79.385156


In [71]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

95 venues were returned by Foursquare.


### Now, explore and cluster all neighbours in Toronto

In [74]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [76]:
toronto_denc_venues = getNearbyVenues(names=df_toronto_denc['Neighbourhood'],
                                   latitudes=df_toronto_denc['Latitude'],
                                   longitudes=df_toronto_denc['Longitude']
                                  )

In [77]:
toronto_denc_venues

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.654260,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.654260,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.654260,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,Harbourfront,43.654260,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.654260,-79.360636,Impact Kitchen,43.656369,-79.356980,Restaurant
...,...,...,...,...,...,...,...
1559,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,East End Garden Centre & Hardware,43.664564,-79.324471,Garden Center
1560,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,The Ashbridge Estate,43.664691,-79.321805,Garden
1561,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,TTC Russell Division,43.664908,-79.322560,Light Rail Station
1562,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,Jonathan Ashbridge Park,43.664702,-79.319898,Park


In [79]:
toronto_denc_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",95,95,95,95,95,95
Berczy Park,59,59,59,59,59,59
"Brockton, Exhibition Place, Parkdale Village",23,23,23,23,23,23
Business Reply Mail Processing Centre 969 Eastern,13,13,13,13,13,13
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",17,17,17,17,17,17
"Cabbagetown, St. James Town",44,44,44,44,44,44
Central Bay Street,60,60,60,60,60,60
"Chinatown, Grange Park, Kensington Market",61,61,61,61,61,61
Christie,15,15,15,15,15,15
Church and Wellesley,78,78,78,78,78,78


In [80]:

# find out how many unique categories can be curated from all the returned venues

print('There are {} uniques categories.'.format(len(toronto_denc_venues['Venue Category'].unique())))

There are 231 uniques categories.


In [82]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_denc_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_denc_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
# examine the new dataframe size.

toronto_onehot.shape

(1564, 232)

In [84]:

# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021053,0.0,...,0.0,0.0,0.0,0.0,0.010526,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.058824,0.058824,0.058824,0.117647,0.117647,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.016667,0.0,0.016667
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.04918,0.0,0.04918,0.016393,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025641


In [85]:

# confirm the new size

toronto_grouped.shape

(38, 232)

In [86]:
# print each neighborhood along with the top 5 most common venues

num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
            venue  freq
0     Coffee Shop  0.11
1            Café  0.05
2  Clothing Store  0.04
3      Restaurant  0.04
4           Hotel  0.03


----Berczy Park----
            venue  freq
0     Coffee Shop  0.08
1          Bakery  0.05
2    Cocktail Bar  0.05
3      Restaurant  0.03
4  Farmers Market  0.03


----Brockton, Exhibition Place, Parkdale Village----
                venue  freq
0                Café  0.13
1      Breakfast Spot  0.09
2         Coffee Shop  0.09
3                 Bar  0.04
4  Italian Restaurant  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                  venue  freq
0           Pizza Place  0.08
1            Restaurant  0.08
2         Burrito Place  0.08
3  Fast Food Restaurant  0.08
4            Skate Park  0.08


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12

In [87]:
# sort the venues in descending order.

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [89]:

num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Bakery
2,"Brockton, Exhibition Place, Parkdale Village",Café,Coffee Shop,Breakfast Spot
3,Business Reply Mail Processing Centre 969 Eastern,Pizza Place,Brewery,Garden Center
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Airport Terminal


### Running Kmeans with all neigbourhood in toronto

In [95]:

# Run k-means to cluster the neighborhood into 5 clusters.

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1) 

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:35]



array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3,
       1, 3, 0, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3])

In [99]:

# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_denc_venues

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery,3,Coffee Shop,Bakery,Park
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop,3,Coffee Shop,Bakery,Park
2,Harbourfront,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center,3,Coffee Shop,Bakery,Park
3,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa,3,Coffee Shop,Bakery,Park
4,Harbourfront,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant,3,Coffee Shop,Bakery,Park


In [107]:

# Finally, let's visualize the resulting clusters

# create map
map_clusters = folium.Map(location=[43.67635739999999, -79.2930312], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Neighborhood Latitude'], toronto_merged['Neighborhood Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### View of details for each cluster

In [108]:
# CLUSTER 1

toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
1132,43.689574,-79.386841,Trail,0,Gym,Trail,Yoga Studio
1133,43.689574,-79.383449,Gym,0,Gym,Trail,Yoga Studio


In [109]:
# CLUSTER 2

toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
936,43.72802,-79.394382,Park,1,Park,Bus Line,Swim School
937,43.72802,-79.38286,Swim School,1,Park,Bus Line,Swim School
938,43.72802,-79.382805,Bus Line,1,Park,Bus Line,Swim School
949,43.696948,-79.406873,Trail,1,Trail,Jewelry Store,Bus Line
950,43.696948,-79.410274,Bus Line,1,Trail,Jewelry Store,Bus Line
951,43.696948,-79.407957,Sushi Restaurant,1,Trail,Jewelry Store,Bus Line
952,43.696948,-79.407644,Jewelry Store,1,Trail,Jewelry Store,Bus Line


In [110]:
# CLUSTER 3

toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
1226,43.679563,-79.378934,Playground,2,Park,Playground,Trail
1227,43.679563,-79.373788,Park,2,Park,Playground,Trail
1228,43.679563,-79.382773,Park,2,Park,Playground,Trail
1229,43.679563,-79.373842,Trail,2,Park,Playground,Trail


In [111]:
# CLUSTER 4

toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,43.654260,-79.362017,Bakery,3,Coffee Shop,Bakery,Park
1,43.654260,-79.361809,Coffee Shop,3,Coffee Shop,Bakery,Park
2,43.654260,-79.358008,Distribution Center,3,Coffee Shop,Bakery,Park
3,43.654260,-79.359874,Spa,3,Coffee Shop,Bakery,Park
4,43.654260,-79.356980,Restaurant,3,Coffee Shop,Bakery,Park
...,...,...,...,...,...,...,...
1559,43.662744,-79.324471,Garden Center,3,Pizza Place,Brewery,Garden Center
1560,43.662744,-79.321805,Garden,3,Pizza Place,Brewery,Garden Center
1561,43.662744,-79.322560,Light Rail Station,3,Pizza Place,Brewery,Garden Center
1562,43.662744,-79.319898,Park,3,Pizza Place,Brewery,Garden Center


In [112]:
# CLUSTER 5

toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
939,43.711695,-79.411978,Garden,4,Garden,Pool,Yoga Studio
940,43.711695,-79.412127,Pool,4,Garden,Pool,Yoga Studio
