# Segmenting and Clustering Neighbourhoods in Toronto

### Part 1 - Scraping the data

In [78]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import xml
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [22]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(url,'html')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XlKXTwpAMEoAACy8UKEAAADM","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

In [15]:
table = soup.find('table',{'class':'wikitable sortable'})
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North

In [16]:
links = table.find_all('td')
links

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Manor" title="Lawrence Manor">Lawrence Manor</a>
 </td>, <td>M7A</td>, <td>

In [17]:
#Scrape
postecode = []
borough = []
neighborhood = []

In [23]:
for i in range(0, len(links), 3):
    postecode.append(links[i].find(text=True))
    borough.append(links[i+1].find(text=True))
    neighborhood.append(links[i+2].find(text=True).rstrip())
    
df = pd.DataFrame(data=[postecode, borough, neighborhood]).transpose()
df.columns = ['Postecode', 'Borough', 'Neighborhood']
df.head(10)

Unnamed: 0,Postecode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


#### Ignore cells with Borough 'Not assigned'

In [24]:
df = df[df.Borough != 'Not assigned']
df.head(10)

Unnamed: 0,Postecode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


#### Combine similar Neighborhoods with postcodes using comma

In [26]:
df_pc = df.groupby(['Postecode','Borough'])['Neighborhood'].apply(','.join).reset_index()

df_pc.columns=['Postecode','Borough','Neighborhood']
df_pc.head(10)

Unnamed: 0,Postecode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern,Rouge,Malvern,Rouge,Malvern,Roug..."
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union,Highland ..."
2,M1E,Scarborough,"Guildwood,Morningside,West Hill,Guildwood,Morn..."
3,M1G,Scarborough,"Woburn,Woburn,Woburn,Woburn,Woburn"
4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae,Cedarbrae,Cedarbrae"
5,M1J,Scarborough,"Scarborough Village,Scarborough Village,Scarbo..."
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park,East..."
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge,Clairlea,Golden ..."
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West,..."
9,M1N,Scarborough,"Birch Cliff,Cliffside West,Birch Cliff,Cliffsi..."


#### Neighborhoods 'Not assigned' will be same as Borough

In [28]:
df_pc['Neighborhood'].replace('Not assigned', 'Borough', inplace=True)
df_pc.head(10)

Unnamed: 0,Postecode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern,Rouge,Malvern,Rouge,Malvern,Roug..."
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union,Highland ..."
2,M1E,Scarborough,"Guildwood,Morningside,West Hill,Guildwood,Morn..."
3,M1G,Scarborough,"Woburn,Woburn,Woburn,Woburn,Woburn"
4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae,Cedarbrae,Cedarbrae"
5,M1J,Scarborough,"Scarborough Village,Scarborough Village,Scarbo..."
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park,East..."
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge,Clairlea,Golden ..."
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West,..."
9,M1N,Scarborough,"Birch Cliff,Cliffside West,Birch Cliff,Cliffsi..."


In [29]:
df.shape

(1050, 3)

### Part 2 - Foursquare Location Data

#### Get geographical coordinates of the neighborhoods

In [30]:
url1 = "https://cocl.us/Geospatial_data"
geo_data = pd.read_csv(url1)
geo_data.columns = ['Postecode', 'Latitude', 'Longitude']
geo_data.head(10)

Unnamed: 0,Postecode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


#### Merge the two data frames on 'Postcode'

In [32]:
toronto_df = pd.merge(df_pc, geo_data, on='Postecode')
toronto_df.head(10)

Unnamed: 0,Postecode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern,Rouge,Malvern,Rouge,Malvern,Roug...",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union,Highland ...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill,Guildwood,Morn...",43.763573,-79.188711
3,M1G,Scarborough,"Woburn,Woburn,Woburn,Woburn,Woburn",43.770992,-79.216917
4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae,Cedarbrae,Cedarbrae",43.773136,-79.239476
5,M1J,Scarborough,"Scarborough Village,Scarborough Village,Scarbo...",43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park,East...",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge,Clairlea,Golden ...",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West,...",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West,Birch Cliff,Cliffsi...",43.692657,-79.264848


### Part 3 - Explore and cluster the neighborhoods in Toronto

#### Install Folium

In [33]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.4.0               |             py_0          26 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be 

#### Get the geographical coordiantes of Toronto

In [36]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


#### Dropping all other neighborhoods except Toronto

In [37]:
toronto_data = toronto_df[toronto_df.Borough.str.contains("Toronto")].reset_index(drop = True)
toronto_data.head(10)

Unnamed: 0,Postecode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,"The Beaches,The Beaches,The Beaches,The Beache...",43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale,The Danforth West,...",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar,The Beaches West...",43.668999,-79.315572
3,M4M,East Toronto,"Studio District,Studio District,Studio Distric...",43.659526,-79.340923
4,M4N,Central Toronto,"Lawrence Park,Lawrence Park,Lawrence Park,Lawr...",43.72802,-79.38879
5,M4P,Central Toronto,"Davisville North,Davisville North,Davisville N...",43.712751,-79.390197
6,M4R,Central Toronto,"North Toronto West,North Toronto West,North To...",43.715383,-79.405678
7,M4S,Central Toronto,"Davisville,Davisville,Davisville,Davisville,Da...",43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East,Moore Park,Summerhi...",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


#### Create map of Toronto using Latitude and Longitude values

In [39]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare Credentials and Version

In [43]:
CLIENT_ID = '2PKR3OO53ZLSCZYTCDF1FQ12XN0HQS2LO21BMGPEGYKCMIYO' # your Foursquare ID
CLIENT_SECRET = 'MCE5RBGJJY3UZVHB1IB2F5JJSMWZLUNGXY0RXOU5URK3WWNO' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 2PKR3OO53ZLSCZYTCDF1FQ12XN0HQS2LO21BMGPEGYKCMIYO
CLIENT_SECRET:MCE5RBGJJY3UZVHB1IB2F5JJSMWZLUNGXY0RXOU5URK3WWNO


#### Exploring the first neighborhood in our dataframe

In [48]:
toronto_data.loc[0, 'Neighborhood']

'The Beaches,The Beaches,The Beaches,The Beaches,The Beaches'

#### Get the neighborhoods Latitude and Longitude values

In [49]:
neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches,The Beaches,The Beaches,The Beaches,The Beaches are 43.67635739999999, -79.2930312.


#### Now, let's get the top 100 venues that are in The Beaches within a radius of 500 meters

In [50]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

In [51]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e52c7f41a4b0a001b705cd0'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 5,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'labe

In [52]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### Now we are ready to clean the json and structure it into a pandas dataframe

In [55]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,Glen Stewart Ravine,Other Great Outdoors,43.6763,-79.294784
2,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
3,Grover Pub and Grub,Pub,43.679181,-79.297215
4,Upper Beaches,Neighborhood,43.680563,-79.292869


In [56]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [58]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called toronto_venues

In [59]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

The Beaches,The Beaches,The Beaches,The Beaches,The Beaches
The Danforth West,Riverdale,The Danforth West,Riverdale,The Danforth West,Riverdale,The Danforth West,Riverdale,The Danforth West,Riverdale
The Beaches West,India Bazaar,The Beaches West,India Bazaar,The Beaches West,India Bazaar,The Beaches West,India Bazaar,The Beaches West,India Bazaar
Studio District,Studio District,Studio District,Studio District,Studio District
Lawrence Park,Lawrence Park,Lawrence Park,Lawrence Park,Lawrence Park
Davisville North,Davisville North,Davisville North,Davisville North,Davisville North
North Toronto West,North Toronto West,North Toronto West,North Toronto West,North Toronto West
Davisville,Davisville,Davisville,Davisville,Davisville
Moore Park,Summerhill East,Moore Park,Summerhill East,Moore Park,Summerhill East,Moore Park,Summerhill East,Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West,Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West,Deer P

In [60]:
#Size of data frame
print(toronto_venues.shape)
toronto_venues.head()

(1720, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"The Beaches,The Beaches,The Beaches,The Beache...",43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,"The Beaches,The Beaches,The Beaches,The Beache...",43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
2,"The Beaches,The Beaches,The Beaches,The Beache...",43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
3,"The Beaches,The Beaches,The Beaches,The Beache...",43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
4,"The Beaches,The Beaches,The Beaches,The Beache...",43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


#### Let's check how many venues were returned for each neighborhood

In [61]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond,Adelaide,King,Richmond,Adelaide,King,Richmond,Adelaide,King,Richmond,Adelaide,King,Richmond",100,100,100,100,100,100
"Berczy Park,Berczy Park,Berczy Park,Berczy Park,Berczy Park",56,56,56,56,56,56
"Brockton,Exhibition Place,Parkdale Village,Brockton,Exhibition Place,Parkdale Village,Brockton,Exhibition Place,Parkdale Village,Brockton,Exhibition Place,Parkdale Village,Brockton,Exhibition Place,Parkdale Village",22,22,22,22,22,22
"Business Reply Mail Processing Centre 969 Eastern,Business Reply Mail Processing Centre 969 Eastern,Business Reply Mail Processing Centre 969 Eastern,Business Reply Mail Processing Centre 969 Eastern,Business Reply Mail Processing Centre 969 Eastern",15,15,15,15,15,15
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara,CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara,CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara,CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara,CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",17,17,17,17,17,17
"Cabbagetown,St. James Town,Cabbagetown,St. James Town,Cabbagetown,St. James Town,Cabbagetown,St. James Town,Cabbagetown,St. James Town",49,49,49,49,49,49
"Central Bay Street,Central Bay Street,Central Bay Street,Central Bay Street,Central Bay Street",82,82,82,82,82,82
"Chinatown,Grange Park,Kensington Market,Chinatown,Grange Park,Kensington Market,Chinatown,Grange Park,Kensington Market,Chinatown,Grange Park,Kensington Market,Chinatown,Grange Park,Kensington Market",84,84,84,84,84,84
"Christie,Christie,Christie,Christie,Christie",18,18,18,18,18,18
"Church and Wellesley,Church and Wellesley,Church and Wellesley,Church and Wellesley,Church and Wellesley",83,83,83,83,83,83


#### Let's find out how many unique categories can be curated from all the returned venues

In [62]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 240 uniques categories.


### Analyse each Neighborhood

In [63]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
toronto_onehot.shape

(1720, 240)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [65]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,"Adelaide,King,Richmond,Adelaide,King,Richmond,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.01
1,"Berczy Park,Berczy Park,Berczy Park,Berczy Par...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village,Bro...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 East...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown,St. James Town,Cabbagetown,St. Jam...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020408,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Central Bay Street,Central Bay Street,Central ...",0.012195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012195,...,0.0,0.0,0.0,0.012195,0.0,0.0,0.012195,0.0,0.0,0.0
7,"Chinatown,Grange Park,Kensington Market,Chinat...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.035714,0.0,0.059524,0.011905,0.0,0.0,0.0
8,"Christie,Christie,Christie,Christie,Christie",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Church and Wellesley,Church and Wellesley,Chur...",0.012048,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,...,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0


In [67]:
#New size
toronto_grouped.shape

(39, 240)

#### Let's print each neighborhood along with the top 5 most common venues

In [68]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond,Adelaide,King,Richmond,Adelaide,King,Richmond,Adelaide,King,Richmond,Adelaide,King,Richmond----
             venue  freq
0      Coffee Shop  0.06
1  Thai Restaurant  0.04
2              Bar  0.04
3             Café  0.04
4       Restaurant  0.03


----Berczy Park,Berczy Park,Berczy Park,Berczy Park,Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.04
2  Seafood Restaurant  0.04
3                Café  0.04
4         Cheese Shop  0.04


----Brockton,Exhibition Place,Parkdale Village,Brockton,Exhibition Place,Parkdale Village,Brockton,Exhibition Place,Parkdale Village,Brockton,Exhibition Place,Parkdale Village,Brockton,Exhibition Place,Parkdale Village----
                venue  freq
0                Café  0.14
1      Breakfast Spot  0.09
2         Coffee Shop  0.09
3              Bakery  0.05
4  Italian Restaurant  0.05


----Business Reply Mail Processing Centre 969 Eastern,Business Reply Mail Processing Centre 96

#### Put it into a data frame. First, let's write a function to sort the venues in descending order

In [69]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Now let's create the new dataframe and display the top 10 venues for each neighborhood

In [71]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond,Adelaide,King,Richmond,...",Coffee Shop,Café,Thai Restaurant,Bar,Burger Joint,Steakhouse,Bakery,Sushi Restaurant,Cosmetics Shop,Restaurant
1,"Berczy Park,Berczy Park,Berczy Park,Berczy Par...",Coffee Shop,Farmers Market,Cocktail Bar,Café,French Restaurant,Cheese Shop,Seafood Restaurant,Bakery,Steakhouse,Beer Bar
2,"Brockton,Exhibition Place,Parkdale Village,Bro...",Café,Coffee Shop,Breakfast Spot,Furniture / Home Store,Italian Restaurant,Stadium,Bar,Intersection,Bakery,Climbing Gym
3,Business Reply Mail Processing Centre 969 East...,Pizza Place,Auto Workshop,Comic Shop,Restaurant,Butcher,Burrito Place,Brewery,Skate Park,Spa,Farmers Market
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Boat or Ferry,Coffee Shop,Rental Car Location,Sculpture Garden,Boutique,Plane


### Cluster the Neighborhoods

In [75]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

#### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [76]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns

Unnamed: 0,Postecode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,"The Beaches,The Beaches,The Beaches,The Beache...",43.676357,-79.293031,1,Trail,Pub,Health Food Store,Other Great Outdoors,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Women's Store
1,M4K,East Toronto,"The Danforth West,Riverdale,The Danforth West,...",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Ice Cream Shop,Dessert Shop,Diner,Pub,Caribbean Restaurant
2,M4L,East Toronto,"The Beaches West,India Bazaar,The Beaches West...",43.668999,-79.315572,1,Sandwich Place,Pizza Place,Burrito Place,Sushi Restaurant,Fish & Chips Shop,Brewery,Ice Cream Shop,Pub,Movie Theater,Italian Restaurant
3,M4M,East Toronto,"Studio District,Studio District,Studio Distric...",43.659526,-79.340923,1,Café,Coffee Shop,American Restaurant,Gastropub,Italian Restaurant,Brewery,Bakery,Bar,Stationery Store,Fish Market
4,M4N,Central Toronto,"Lawrence Park,Lawrence Park,Lawrence Park,Lawr...",43.72802,-79.38879,3,Photography Studio,Park,Construction & Landscaping,Swim School,Bus Line,Dim Sum Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


#### Visualise the resulting Clusters

In [79]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine the Clusters and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster

#### Cluster 1

In [80]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Central Toronto,0,Department Store,Park,Food & Drink Shop,Hotel,Sandwich Place,Breakfast Spot,Gym,Electronics Store,Eastern European Restaurant,Dim Sum Restaurant
8,Central Toronto,0,Playground,Tennis Court,Park,Restaurant,Women's Store,Discount Store,Dessert Shop,Dim Sum Restaurant,Diner,Doner Restaurant
10,Downtown Toronto,0,Park,Trail,Playground,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Women's Store


#### Cluster 2

In [81]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,1,Trail,Pub,Health Food Store,Other Great Outdoors,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Women's Store
1,East Toronto,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Ice Cream Shop,Dessert Shop,Diner,Pub,Caribbean Restaurant
2,East Toronto,1,Sandwich Place,Pizza Place,Burrito Place,Sushi Restaurant,Fish & Chips Shop,Brewery,Ice Cream Shop,Pub,Movie Theater,Italian Restaurant
3,East Toronto,1,Café,Coffee Shop,American Restaurant,Gastropub,Italian Restaurant,Brewery,Bakery,Bar,Stationery Store,Fish Market
6,Central Toronto,1,Clothing Store,Coffee Shop,Yoga Studio,Spa,Shoe Store,Salon / Barbershop,Café,Restaurant,Chinese Restaurant,Pet Store
7,Central Toronto,1,Dessert Shop,Sandwich Place,Pizza Place,Café,Gym,Italian Restaurant,Sushi Restaurant,Coffee Shop,Deli / Bodega,Indoor Play Area
9,Central Toronto,1,Coffee Shop,Pub,Light Rail Station,American Restaurant,Restaurant,Liquor Store,Fried Chicken Joint,Sports Bar,Supermarket,Sushi Restaurant
11,Downtown Toronto,1,Coffee Shop,Park,Café,Restaurant,Italian Restaurant,Bakery,Pizza Place,Pub,Gastropub,Diner
12,Downtown Toronto,1,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Café,Fast Food Restaurant,Pub,Gym,Hotel
13,Downtown Toronto,1,Coffee Shop,Pub,Park,Bakery,Theater,Café,Breakfast Spot,Restaurant,Mexican Restaurant,Ice Cream Shop


#### Cluster 3

In [82]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Central Toronto,2,Mexican Restaurant,Trail,Jewelry Store,Sushi Restaurant,Women's Store,Dim Sum Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


#### Cluster 4

In [83]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,3,Photography Studio,Park,Construction & Landscaping,Swim School,Bus Line,Dim Sum Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


#### Cluster 5

In [84]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,4,Pool,Garden,Women's Store,Department Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
