# Segmenting and Clustering Neighborhoods in Toronto

### Import Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

!conda install -c conda-forge folium=0.5.0 --yes
import folium

from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  37.12 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  28.81 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  30.31 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  38.76 MB/s


### Create Toronto province dataframe

In [2]:
wiki_page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(wiki_page.content, 'lxml')
tables = soup.find_all('table')
df_toronto_raw = pd.read_html(str(tables), header=0)[0]

df_toronto_raw.drop(df_toronto_raw.index[df_toronto_raw.Borough=='Not assigned'], inplace=True)
df_toronto_raw.Neighbourhood = ', ' + df_toronto_raw.Neighbourhood
df_Toronto = df_toronto_raw.groupby(['Postcode','Borough'], sort=False).sum()
df_Toronto.reset_index(inplace=True)
df_Toronto.Neighbourhood = df_Toronto.Neighbourhood.apply(lambda x: x[2:])

for x in df_Toronto.index:
    if df_Toronto.Neighbourhood[x] == 'Not assigned':
        df_Toronto.Neighbourhood[x] = df_Toronto.Borough[x]

df_Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### Add Latitude / Longitude to Toronto dataframe

In [4]:
# The code was removed by Watson Studio for sharing.

--2018-12-24 23:37:26--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2018-12-24 23:37:26--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.27.197
Connecting to ibm.box.com (ibm.box.com)|107.152.27.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2018-12-24 23:37:27--  https://ibm.ent.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.ent.box.com (ibm.ent.box.com)... 107.152.27.211
Connecting to ibm.ent.box.com (ibm.ent.box.com)|107.152.27.211|:443... connected.
HTTP request sent, awaiting response... 302 Found

In [5]:
df_geo = pd.read_csv('Geospatial_Coordinates.csv')
df_geo.head()

lat = pd.Series(np.zeros(len(df_Toronto)))
lon = pd.Series(np.zeros(len(df_Toronto)))

for n in df_Toronto.index:
    for g in df_geo.index:
        if df_Toronto.Postcode[n] == df_geo['Postal Code'][g]:
            lat[n] = df_geo.Latitude[g]
            lon[n] = df_geo.Longitude[g]
            
df_Toronto['Latitude'] = lat
df_Toronto['Longitude'] = lon
df_Toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


### Filter for Toronto Boroughs

In [6]:
df_neighborhoods = df_Toronto[df_Toronto.Borough.str.find('Toronto')>=0]
df_neighborhoods.reset_index(inplace=True, drop=True)
print(len(df_neighborhoods), 'neighborhoods in Toronto Borough.')
df_neighborhoods.head(10)

38 neighborhoods in Toronto Borough.


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
8,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752


In [8]:
df_boroughs = df_neighborhoods.groupby(['Borough'], sort=False).median()
df_boroughs


Unnamed: 0_level_0,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Downtown Toronto,43.65235,-79.381664
East Toronto,43.668999,-79.321558
West Toronto,43.650265,-79.449292
Central Toronto,43.704324,-79.400049


### Map the Boroughs & Neighborhoods

In [9]:
# map the boroughs & neighborhoods
Tor_Lat=df_neighborhoods.Latitude.median(axis=0)
Tor_Lon=df_neighborhoods.Longitude.median(axis=0)
c_array = ['red','green','blue','cyan']

toronto_map = folium.Map(location=[Tor_Lat,Tor_Lon], zoom_start=12)
for lat, lon, label in zip(df_boroughs.Latitude, df_boroughs.Longitude, df_boroughs.index):
    folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat,lon],
                       radius=7,
                       popup=label,
                       color=c_array[np.where(label == df_boroughs.index)[0][0]],
                       fill=False,
                       parse_html=False).add_to(toronto_map)
    

for boro, label, lat, lon in zip(df_neighborhoods.Borough, df_neighborhoods.Neighbourhood, df_neighborhoods.Latitude, df_neighborhoods.Longitude):
    folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat,lon],
                       radius=5,
                       popup=label,
                       color=c_array[np.where(boro == df_boroughs.index)[0][0]],
                       fill=True,
                       fill_opacity=0.6,
                       parse_html=False).add_to(toronto_map)

toronto_map

#### Hmmm. Downtown neighborhoods are packed pretty tight together.
There's likely to be a lot off overlap in venues.  They'll probably be extremely similar in clustering.  Radius and # of venues chosen might have a major effect on results.

In [10]:
# The code was removed by Watson Studio for sharing.

In [11]:
#from Foursquare Lab
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=800):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
                    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    print('Found {} venues in {} neighborhoods.'.format(nearby_venues.shape[0], len(venues_list)))
    return(nearby_venues)

### Pull in venues.  1/2 km radius, up to 100 venues per neighborhood

In [13]:
LIMIT = 100
venues_toronto = getNearbyVenues(names=df_neighborhoods.Neighbourhood,
                                latitudes=df_neighborhoods.Latitude,
                                longitudes=df_neighborhoods.Longitude,
                                radius=500)


Found 1691 venues in 38 neighborhoods.


In [14]:
print(venues_toronto.shape)
venues_toronto.head()

(1691, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Harbourfront, Regent Park",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [12]:
print('There are {} distinct venues in {} categories.'.format(
    len(venues_toronto['Venue'].unique()),len(venues_toronto['Venue Category'].unique())))

There are 1098 distinct venues in 231 categories.


In [15]:
onehot_toronto = venues_toronto[['Neighborhood']]
onehot_toronto = pd.concat([onehot_toronto, pd.get_dummies(venues_toronto[['Venue Category']], prefix='')], axis=1)
print(onehot_toronto.shape)
onehot_toronto.head()

(1691, 232)


Unnamed: 0,Neighborhood,_Adult Boutique,_Afghan Restaurant,_Airport,_Airport Food Court,_Airport Gate,_Airport Lounge,_Airport Service,_Airport Terminal,_American Restaurant,...,_Toy / Game Store,_Trail,_Train Station,_Vegetarian / Vegan Restaurant,_Video Game Store,_Vietnamese Restaurant,_Wine Bar,_Wings Joint,_Women's Store,_Yoga Studio
0,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
grouped_toronto = onehot_toronto.groupby('Neighborhood').mean().reset_index()
print(grouped_toronto.shape)
grouped_toronto

(38, 232)


Unnamed: 0,Neighborhood,_Adult Boutique,_Afghan Restaurant,_Airport,_Airport Food Court,_Airport Gate,_Airport Lounge,_Airport Service,_Airport Terminal,_American Restaurant,...,_Toy / Game Store,_Trail,_Train Station,_Vegetarian / Vegan Restaurant,_Video Game Store,_Vietnamese Restaurant,_Wine Bar,_Wings Joint,_Women's Store,_Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business reply mail Processing Centre969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.076923,0.076923,0.076923,0.153846,0.153846,0.153846,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,...,0.0,0.0,0.0,0.012346,0.0,0.0,0.012346,0.0,0.0,0.012346
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.052083,0.0,0.052083,0.010417,0.0,0.010417,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011765,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.011765,...,0.0,0.0,0.0,0.0,0.011765,0.011765,0.0,0.011765,0.0,0.011765


In [17]:
# From NY clustering lab
def return_most_common_venues(row, num_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_venues]

In [18]:
# From NY clustering lab

num_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
sorted_neighborhood_venues = pd.DataFrame(columns=columns)
sorted_neighborhood_venues['Neighborhood'] = grouped_toronto['Neighborhood']

for ind in np.arange(grouped_toronto.shape[0]):
    sorted_neighborhood_venues.iloc[ind, 1:] = return_most_common_venues(grouped_toronto.iloc[ind, :], num_venues)

sorted_neighborhood_venues

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",_Coffee Shop,_Café,_Thai Restaurant,_Steakhouse,_American Restaurant,_Hotel,_Clothing Store,_Bar,_Bakery,_Gym
1,Berczy Park,_Coffee Shop,_Restaurant,_Cocktail Bar,_Pub,_Café,_Cheese Shop,_Farmers Market,_Italian Restaurant,_Steakhouse,_Bakery
2,"Brockton, Exhibition Place, Parkdale Village",_Coffee Shop,_Café,_Breakfast Spot,_Grocery Store,_Italian Restaurant,_Pet Store,_Convenience Store,_Gym,_Climbing Gym,_Caribbean Restaurant
3,Business reply mail Processing Centre969 Eastern,_Yoga Studio,_Auto Workshop,_Garden Center,_Garden,_Light Rail Station,_Fast Food Restaurant,_Farmers Market,_Comic Shop,_Park,_Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",_Airport Lounge,_Airport Service,_Airport Terminal,_Sculpture Garden,_Boat or Ferry,_Airport,_Airport Food Court,_Airport Gate,_Harbor / Marina,_Boutique
5,"Cabbagetown, St. James Town",_Coffee Shop,_Restaurant,_Pizza Place,_Indian Restaurant,_Italian Restaurant,_Bakery,_Café,_Pub,_Park,_Breakfast Spot
6,Central Bay Street,_Coffee Shop,_Sandwich Place,_Italian Restaurant,_Bubble Tea Shop,_Bar,_Ice Cream Shop,_Burger Joint,_Café,_Thai Restaurant,_Salad Place
7,"Chinatown, Grange Park, Kensington Market",_Café,_Bar,_Vietnamese Restaurant,_Vegetarian / Vegan Restaurant,_Coffee Shop,_Bakery,_Mexican Restaurant,_Chinese Restaurant,_Gaming Cafe,_Dessert Shop
8,Christie,_Grocery Store,_Café,_Park,_Convenience Store,_Diner,_Italian Restaurant,_Restaurant,_Nightclub,_Coffee Shop,_Baby Store
9,Church and Wellesley,_Japanese Restaurant,_Sushi Restaurant,_Coffee Shop,_Gay Bar,_Restaurant,_Burger Joint,_Café,_Pub,_Men's Store,_Mediterranean Restaurant


In [19]:
from sklearn.cluster import KMeans

In [20]:
clustering_toronto = grouped_toronto.drop('Neighborhood',1)

#### How many clusters should we use?

In [21]:
print('Ratio of neighborhoods:')
print('')
for k in range(10):
    print('With',k+1,'clusters:')
    KMns = KMeans(n_clusters=k+1, random_state=2).fit(clustering_toronto)
    clustercount=pd.Series(KMns.labels_).value_counts()
    clustercountn=pd.Series(KMns.labels_).value_counts(normalize=True)
    print(pd.concat([clustercount,clustercountn], axis=1))
    print('')




Ratio of neighborhoods:

With 1 clusters:
    0    1
0  38  1.0

With 2 clusters:
    0         1
0  37  0.973684
1   1  0.026316

With 3 clusters:
    0         1
1  34  0.894737
0   3  0.078947
2   1  0.026316

With 4 clusters:
    0         1
0  34  0.894737
1   2  0.052632
3   1  0.026316
2   1  0.026316

With 5 clusters:
    0         1
1  30  0.789474
4   4  0.105263
0   2  0.052632
3   1  0.026316
2   1  0.026316

With 6 clusters:
    0         1
4  20  0.526316
0  13  0.342105
5   2  0.052632
3   1  0.026316
2   1  0.026316
1   1  0.026316

With 7 clusters:
    0         1
0  31  0.815789
2   2  0.052632
6   1  0.026316
5   1  0.026316
4   1  0.026316
3   1  0.026316
1   1  0.026316

With 8 clusters:
    0         1
0  31  0.815789
7   1  0.026316
6   1  0.026316
5   1  0.026316
4   1  0.026316
3   1  0.026316
2   1  0.026316
1   1  0.026316

With 9 clusters:
    0         1
1  23  0.605263
2   8  0.210526
8   1  0.026316
7   1  0.026316
6   1  0.026316
5   1  0.026316
4   1  0

## SOOOOOOO many of the clusters have only 1 neighborhood.
I guess we'll go with 3 clusters

In [22]:
df_neighborhoods.Borough.value_counts()

Downtown Toronto    18
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

In [23]:
sorted_neighborhood_venues.columns

Index(['Neighborhood', '1st Most Common Venue', '2nd Most Common Venue',
       '3rd Most Common Venue', '4th Most Common Venue',
       '5th Most Common Venue', '6th Most Common Venue',
       '7th Most Common Venue', '8th Most Common Venue',
       '9th Most Common Venue', '10th Most Common Venue'],
      dtype='object')

In [24]:
KMns = KMeans(n_clusters=3, random_state=2).fit(clustering_toronto)
clustered_toronto = df_neighborhoods[['Borough','Neighbourhood','Latitude','Longitude']].sort_values(by='Neighbourhood')
clustered_toronto.rename(columns={'Neighbourhood':'Neighborhood'}, inplace=True)
clustered_toronto['Cluster_label'] = KMns.labels_
clustered_toronto = clustered_toronto.join(sorted_neighborhood_venues.set_index('Neighborhood'), on='Neighborhood')
clustered_toronto


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster_label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,1,_Coffee Shop,_Café,_Thai Restaurant,_Steakhouse,_American Restaurant,_Hotel,_Clothing Store,_Bar,_Bakery,_Gym
4,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,_Coffee Shop,_Restaurant,_Cocktail Bar,_Pub,_Café,_Cheese Shop,_Farmers Market,_Italian Restaurant,_Steakhouse,_Bakery
13,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,1,_Coffee Shop,_Café,_Breakfast Spot,_Grocery Store,_Italian Restaurant,_Pet Store,_Convenience Store,_Gym,_Climbing Gym,_Caribbean Restaurant
37,East Toronto,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558,1,_Yoga Studio,_Auto Workshop,_Garden Center,_Garden,_Light Rail Station,_Fast Food Restaurant,_Farmers Market,_Comic Shop,_Park,_Restaurant
31,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,1,_Airport Lounge,_Airport Service,_Airport Terminal,_Sculpture Garden,_Boat or Ferry,_Airport,_Airport Food Court,_Airport Gate,_Harbor / Marina,_Boutique
34,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,1,_Coffee Shop,_Restaurant,_Pizza Place,_Indian Restaurant,_Italian Restaurant,_Bakery,_Café,_Pub,_Park,_Breakfast Spot
5,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1,_Coffee Shop,_Sandwich Place,_Italian Restaurant,_Bubble Tea Shop,_Bar,_Ice Cream Shop,_Burger Joint,_Café,_Thai Restaurant,_Salad Place
29,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,1,_Café,_Bar,_Vietnamese Restaurant,_Vegetarian / Vegan Restaurant,_Coffee Shop,_Bakery,_Mexican Restaurant,_Chinese Restaurant,_Gaming Cafe,_Dessert Shop
6,Downtown Toronto,Christie,43.669542,-79.422564,1,_Grocery Store,_Café,_Park,_Convenience Store,_Diner,_Italian Restaurant,_Restaurant,_Nightclub,_Coffee Shop,_Baby Store
36,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1,_Japanese Restaurant,_Sushi Restaurant,_Coffee Shop,_Gay Bar,_Restaurant,_Burger Joint,_Café,_Pub,_Men's Store,_Mediterranean Restaurant


In [25]:
print('Neigborhoods:',len(df_neighborhoods))
print('Neighborhoods w/clusters:',len(grouped_toronto))

Neigborhoods: 38
Neighborhoods w/clusters: 38


In [26]:
KMns = KMeans(n_clusters=3, random_state=2).fit(clustering_toronto)
clustered_toronto = df_neighborhoods[['Borough','Neighbourhood','Latitude','Longitude']].sort_values(by='Neighbourhood')
clustered_toronto.rename(columns={'Neighbourhood':'Neighborhood'}, inplace=True)
clustered_toronto.drop(clustered_toronto[clustered_toronto.Neighborhood.isin(grouped_toronto.Neighborhood)==False].index, inplace=True)
clustered_toronto['Cluster_label'] = KMns.labels_
clustered_toronto = clustered_toronto.join(sorted_neighborhood_venues.set_index('Neighborhood'), on='Neighborhood')
clustered_toronto



Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster_label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,1,_Coffee Shop,_Café,_Thai Restaurant,_Steakhouse,_American Restaurant,_Hotel,_Clothing Store,_Bar,_Bakery,_Gym
4,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,_Coffee Shop,_Restaurant,_Cocktail Bar,_Pub,_Café,_Cheese Shop,_Farmers Market,_Italian Restaurant,_Steakhouse,_Bakery
13,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,1,_Coffee Shop,_Café,_Breakfast Spot,_Grocery Store,_Italian Restaurant,_Pet Store,_Convenience Store,_Gym,_Climbing Gym,_Caribbean Restaurant
37,East Toronto,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558,1,_Yoga Studio,_Auto Workshop,_Garden Center,_Garden,_Light Rail Station,_Fast Food Restaurant,_Farmers Market,_Comic Shop,_Park,_Restaurant
31,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,1,_Airport Lounge,_Airport Service,_Airport Terminal,_Sculpture Garden,_Boat or Ferry,_Airport,_Airport Food Court,_Airport Gate,_Harbor / Marina,_Boutique
34,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,1,_Coffee Shop,_Restaurant,_Pizza Place,_Indian Restaurant,_Italian Restaurant,_Bakery,_Café,_Pub,_Park,_Breakfast Spot
5,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1,_Coffee Shop,_Sandwich Place,_Italian Restaurant,_Bubble Tea Shop,_Bar,_Ice Cream Shop,_Burger Joint,_Café,_Thai Restaurant,_Salad Place
29,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,1,_Café,_Bar,_Vietnamese Restaurant,_Vegetarian / Vegan Restaurant,_Coffee Shop,_Bakery,_Mexican Restaurant,_Chinese Restaurant,_Gaming Cafe,_Dessert Shop
6,Downtown Toronto,Christie,43.669542,-79.422564,1,_Grocery Store,_Café,_Park,_Convenience Store,_Diner,_Italian Restaurant,_Restaurant,_Nightclub,_Coffee Shop,_Baby Store
36,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1,_Japanese Restaurant,_Sushi Restaurant,_Coffee Shop,_Gay Bar,_Restaurant,_Burger Joint,_Café,_Pub,_Men's Store,_Mediterranean Restaurant


### Splitting the clusters into their own dataframes.

In [27]:
cluster_1 = clustered_toronto.loc[clustered_toronto['Cluster_label']==0, clustered_toronto.columns[[0,1] + list(range(4,clustered_toronto.shape[1]))]].reset_index(drop=True)
cluster_2 = clustered_toronto.loc[clustered_toronto['Cluster_label']==1, clustered_toronto.columns[[0,1] + list(range(4,clustered_toronto.shape[1]))]].reset_index(drop=True)
cluster_3 = clustered_toronto.loc[clustered_toronto['Cluster_label']==2, clustered_toronto.columns[[0,1] + list(range(4,clustered_toronto.shape[1]))]].reset_index(drop=True)
#cluster_4 = clustered_toronto.loc[clustered_toronto['Cluster_label']==3, clustered_toronto.columns[[0,1] + list(range(4,clustered_toronto.shape[1]))]].reset_index(drop=True)
#cluster_5 = clustered_toronto.loc[clustered_toronto['Cluster_label']==4, clustered_toronto.columns[[0,1] + list(range(4,clustered_toronto.shape[1]))]].reset_index(drop=True)

### Scoring the venue categories
Scoring 10 points for most common category down to 1 for the 10th most common.  
Score is normalized by dividing the score by the # of neighborhoods in the cluster.  Essentially a weighted average.  Didn't use mean (average) because, for example, a category that is the most common in one neighborhood but is not in the top 10 in any other neighborhood will have an average score of 10.

#### Cluster 1

In [30]:
ven_cat = pd.Series()
ven_score = pd.Series()

for x in range(3,13):
    ven_cat = pd.concat([ven_cat,cluster_1[cluster_1.columns[x]]])
    ven_score = pd.concat([ven_score,pd.Series(np.zeros(cluster_1.shape[0])+(13-x)/len(cluster_1))])
    
cluster_score = pd.concat([ven_cat,ven_score],axis=1)
cluster_score.rename(columns={0:'Category',1:'Score'}, inplace=True)
print('CLUSTER 1 -- ',len(cluster_1), 'neighborhoods')
cluster_score.groupby('Category').sum().sort_values(by='Score', ascending=False)[:10]

CLUSTER 1 --  3 neighborhoods


Unnamed: 0_level_0,Score
Category,Unnamed: 1_level_1
_Park,9.333333
_Trail,5.666667
_Falafel Restaurant,5.333333
_Event Space,4.333333
_Bus Line,3.333333
_Ethiopian Restaurant,3.333333
_Playground,3.0
_Dim Sum Restaurant,3.0
_Jewelry Store,2.666667
_Electronics Store,2.333333


#### Let's call cluster 1 'The Park'.

#### Cluster 2

In [31]:
ven_cat = pd.Series()
ven_score = pd.Series()

for x in range(3,13):
    ven_cat = pd.concat([ven_cat,cluster_2[cluster_2.columns[x]]])
    ven_score = pd.concat([ven_score,pd.Series(np.zeros(cluster_2.shape[0])+(13-x)/len(cluster_2))])
    
cluster_score = pd.concat([ven_cat,ven_score],axis=1)
cluster_score.rename(columns={0:'Category',1:'Score'}, inplace=True)
print('CLUSTER2 -- ',len(cluster_2),'neighborhoods')
cluster_score.groupby('Category').sum().sort_values(by='Score', ascending=False)[:10]

CLUSTER2 --  34 neighborhoods


Unnamed: 0_level_0,Score
Category,Unnamed: 1_level_1
_Coffee Shop,6.735294
_Café,5.176471
_Restaurant,3.176471
_Italian Restaurant,2.470588
_Bakery,1.735294
_Hotel,1.529412
_Pub,1.441176
_Sandwich Place,1.411765
_Bar,1.382353
_Park,1.235294


#### Cluster 2 can be 'The Coffee house'.

#### Cluster 3

In [32]:
ven_cat = pd.Series()
ven_score = pd.Series()

for x in range(3,13):
    ven_cat = pd.concat([ven_cat,cluster_3[cluster_3.columns[x]]])
    ven_score = pd.concat([ven_score,pd.Series(np.zeros(cluster_3.shape[0])+(13-x)/len(cluster_3))])
    
cluster_score = pd.concat([ven_cat,ven_score],axis=1)
cluster_score.rename(columns={0:'Category',1:'Score'}, inplace=True)
print('CLUSTER 3 -- ', len(cluster_3), 'neighborhoods')
cluster_score.groupby('Category').mean().sort_values(by='Score', ascending=False)[:10]

CLUSTER 3 --  1 neighborhoods


Unnamed: 0_level_0,Score
Category,Unnamed: 1_level_1
_Garden,10.0
_Dim Sum Restaurant,9.0
_Farmers Market,8.0
_Falafel Restaurant,7.0
_Event Space,6.0
_Ethiopian Restaurant,5.0
_Electronics Store,4.0
_Eastern European Restaurant,3.0
_Dumpling Restaurant,2.0
_Donut Shop,1.0


#### and cluster 3 is 'Community Garden'

In [33]:
cluster_name = ['The Park','The Coffee house','Community Garden']

In [34]:
# map the neighborhoods & clusters
Tor_Lat=df_neighborhoods.Latitude.median(axis=0)
Tor_Lon=df_neighborhoods.Longitude.median(axis=0)
c_array = ['red','green','blue','cyan','magenta','yellow','white','black']

cluster_toronto_map = folium.Map(location=[Tor_Lat,Tor_Lon], zoom_start=12)
#for lat, lon, label in zip(df_boroughs.Latitude, df_boroughs.Longitude, df_boroughs.index):
#    folium.Popup(label,parse_html=True)
#    folium.CircleMarker([lat,lon],
#                       radius=7,
#                       popup=label,
#                       color=c_array[np.where(label == df_boroughs.index)[0][0]],
#                       fill=False,
#                       parse_html=False).add_to(toronto_map)
    

for label, cluster, lat, lon in zip(clustered_toronto.Neighborhood, clustered_toronto.Cluster_label, clustered_toronto.Latitude, clustered_toronto.Longitude):
    folium.Popup(label+' ('+cluster_name[cluster]+')',parse_html=True)
    folium.CircleMarker([lat,lon],
                       radius=5,
                       popup=label+' ('+cluster_name[cluster]+')',
                       color=c_array[cluster],
                       fill=True,
                       fill_opacity=0.6,
                       parse_html=False).add_to(cluster_toronto_map)


cluster_toronto_map

### Let's change the parameters...
Expand the radius to 1 km and limit the results to 50 venues per neighborhood.

In [35]:
LIMIT = 50
venues_toronto1 = getNearbyVenues(names=df_neighborhoods.Neighbourhood,
                                latitudes=df_neighborhoods.Latitude,
                                longitudes=df_neighborhoods.Longitude,
                                radius=1000)


Found 1752 venues in 38 neighborhoods.


In [36]:
onehot_toronto1 = venues_toronto1[['Neighborhood']]
onehot_toronto1 = pd.concat([onehot_toronto1, pd.get_dummies(venues_toronto1[['Venue Category']], prefix='')], axis=1)
print('One Hot shape:',onehot_toronto1.shape)
#onehot_toronto.head()

grouped_toronto1 = onehot_toronto1.groupby('Neighborhood').mean().reset_index()
print('Grouped shape:',grouped_toronto1.shape)
#grouped_toronto

num_venues = 10
indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

sorted_neighborhood_venues1 = pd.DataFrame(columns=columns)
sorted_neighborhood_venues1['Neighborhood'] = grouped_toronto1['Neighborhood']

for ind in np.arange(grouped_toronto1.shape[0]):
    sorted_neighborhood_venues1.iloc[ind, 1:] = return_most_common_venues(grouped_toronto1.iloc[ind, :], num_venues)

#sorted_neighborhood_venues

clustering_toronto1 = grouped_toronto1.drop('Neighborhood',1)

print('Ratio of neighborhoods:')
print('')
for k in range(10):
    print('With',k+1,'clusters:')
    KMns = KMeans(n_clusters=k+1, random_state=2).fit(clustering_toronto1)
    clustercount=pd.Series(KMns.labels_).value_counts()
    clustercountn=pd.Series(KMns.labels_).value_counts(normalize=True)
    print(pd.concat([clustercount,clustercountn], axis=1))
    #print(clustercountn)
    print('')




One Hot shape: (1752, 235)
Grouped shape: (38, 235)
Ratio of neighborhoods:

With 1 clusters:
    0    1
0  38  1.0

With 2 clusters:
    0         1
1  36  0.947368
0   2  0.052632

With 3 clusters:
    0         1
2  17  0.447368
1  15  0.394737
0   6  0.157895

With 4 clusters:
    0         1
1  16  0.421053
2  12  0.315789
3   9  0.236842
0   1  0.026316

With 5 clusters:
    0         1
2  12  0.315789
3  10  0.263158
1  10  0.263158
0   5  0.131579
4   1  0.026316

With 6 clusters:
    0         1
0  11  0.289474
1  10  0.263158
3   9  0.236842
5   6  0.157895
4   1  0.026316
2   1  0.026316

With 7 clusters:
    0         1
1  15  0.394737
4   9  0.236842
3   9  0.236842
0   2  0.052632
6   1  0.026316
5   1  0.026316
2   1  0.026316

With 8 clusters:
    0         1
2  10  0.263158
1  10  0.263158
0   6  0.157895
3   5  0.131579
7   4  0.105263
6   1  0.026316
5   1  0.026316
4   1  0.026316

With 9 clusters:
    0         1
2  10  0.263158
4   8  0.210526
5   7  0.184211
8   

## OK that looks better.
Staying with 3 clusters though.

In [37]:
KMns = KMeans(n_clusters=3, random_state=2).fit(clustering_toronto1)
clustered_toronto1 = df_neighborhoods[['Borough','Neighbourhood','Latitude','Longitude']].sort_values(by='Neighbourhood')
clustered_toronto1.rename(columns={'Neighbourhood':'Neighborhood'}, inplace=True)
clustered_toronto1['Cluster_label'] = KMns.labels_
clustered_toronto1 = clustered_toronto1.join(sorted_neighborhood_venues1.set_index('Neighborhood'), on='Neighborhood')
clustered_toronto1

cluster_1 = clustered_toronto1.loc[clustered_toronto1['Cluster_label']==0, clustered_toronto1.columns[[0,1] + list(range(4,clustered_toronto1.shape[1]))]].reset_index(drop=True)
cluster_2 = clustered_toronto1.loc[clustered_toronto1['Cluster_label']==1, clustered_toronto1.columns[[0,1] + list(range(4,clustered_toronto1.shape[1]))]].reset_index(drop=True)
cluster_3 = clustered_toronto1.loc[clustered_toronto1['Cluster_label']==2, clustered_toronto1.columns[[0,1] + list(range(4,clustered_toronto1.shape[1]))]].reset_index(drop=True)
#cluster_4 = clustered_toronto1.loc[clustered_toronto1['Cluster_label']==3, clustered_toronto1.columns[[0,1] + list(range(4,clustered_toronto1.shape[1]))]].reset_index(drop=True)
#cluster_5 = clustered_toronto1.loc[clustered_toronto1['Cluster_label']==4, clustered_toronto1.columns[[0,1] + list(range(4,clustered_toronto1.shape[1]))]].reset_index(drop=True)


In [38]:
ven_cat = pd.Series()
ven_score = pd.Series()

for x in range(3,13):
    ven_cat = pd.concat([ven_cat,cluster_1[cluster_1.columns[x]]])
    ven_score = pd.concat([ven_score,pd.Series(np.zeros(cluster_1.shape[0])+(13-x)/len(cluster_1))])
    
cluster_score = pd.concat([ven_cat,ven_score],axis=1)
cluster_score.rename(columns={0:'Category',1:'Score'}, inplace=True)
print('CLUSTER 1 --',len(cluster_1), 'neighborhoods')
cluster_score.groupby('Category').sum().sort_values(by='Score', ascending=False)[:10]


CLUSTER 1 -- 6 neighborhoods


Unnamed: 0_level_0,Score
Category,Unnamed: 1_level_1
_Café,9.5
_Coffee Shop,7.333333
_Park,3.333333
_Gym / Fitness Center,2.666667
_Italian Restaurant,2.333333
_Bar,2.166667
_Trail,2.0
_Bakery,1.833333
_Harbor / Marina,1.666667
_Mexican Restaurant,1.5


In [39]:
ven_cat = pd.Series()
ven_score = pd.Series()

for x in range(3,13):
    ven_cat = pd.concat([ven_cat,cluster_2[cluster_2.columns[x]]])
    ven_score = pd.concat([ven_score,pd.Series(np.zeros(cluster_2.shape[0])+(13-x)/len(cluster_2))])
    
cluster_score = pd.concat([ven_cat,ven_score],axis=1)
cluster_score.rename(columns={0:'Category',1:'Score'}, inplace=True)
print('CLUSTER 2 --',len(cluster_2), 'neighborhoods')
cluster_score.groupby('Category').sum().sort_values(by='Score', ascending=False)[:10]


CLUSTER 2 -- 15 neighborhoods


Unnamed: 0_level_0,Score
Category,Unnamed: 1_level_1
_Café,7.666667
_Coffee Shop,5.533333
_Hotel,4.133333
_Restaurant,4.0
_Bakery,2.333333
_Gastropub,1.933333
_American Restaurant,1.866667
_Italian Restaurant,1.8
_Steakhouse,1.4
_Cocktail Bar,1.266667


In [40]:
ven_cat = pd.Series()
ven_score = pd.Series()

for x in range(3,13):
    ven_cat = pd.concat([ven_cat,cluster_3[cluster_3.columns[x]]])
    ven_score = pd.concat([ven_score,pd.Series(np.zeros(cluster_3.shape[0])+(13-x)/len(cluster_2))])
    
cluster_score = pd.concat([ven_cat,ven_score],axis=1)
cluster_score.rename(columns={0:'Category',1:'Score'}, inplace=True)
print('CLUSTER 3 --',len(cluster_3), 'neighborhoods')
cluster_score.groupby('Category').sum().sort_values(by='Score', ascending=False)[:10]


CLUSTER 3 -- 17 neighborhoods


Unnamed: 0_level_0,Score
Category,Unnamed: 1_level_1
_Coffee Shop,10.0
_Italian Restaurant,5.933333
_Café,5.533333
_Park,3.866667
_Pizza Place,2.8
_Bakery,2.733333
_Sushi Restaurant,2.666667
_Gym,1.733333
_Indian Restaurant,1.666667
_Pub,1.666667


In [41]:
pd.options.display.max_rows = None
onehot_toronto1.sum()

Neighborhood                      Harbourfront, Regent ParkHarbourfront, Regent ...
_Adult Boutique                                                                   1
_Airport                                                                          1
_Airport Lounge                                                                   1
_American Restaurant                                                             25
_Amphitheater                                                                     1
_Animal Shelter                                                                   1
_Antique Shop                                                                     2
_Aquarium                                                                         2
_Art Gallery                                                                     11
_Art Museum                                                                       1
_Arts & Crafts Store                                                        

#### Man, Coffee Shops & Cafes dominate all 3 clusters.
Moving down a level,  
Cluster 1 becomes 'Parks',   
Cluster 2 is now 'Hotels', and  
Cluster 3 is 'Italian Restaurants'

In [42]:
pd.reset_option('display.max_rows')

In [43]:
cluster_name1 = ['Parks','Hotels','Italian Restaurants']
# map the neighborhoods & clusters
Tor_Lat=df_neighborhoods.Latitude.median(axis=0)
Tor_Lon=df_neighborhoods.Longitude.median(axis=0)
c_array = ['red','green','blue','cyan','magenta','yellow','white','black']

cluster_toronto_map1 = folium.Map(location=[Tor_Lat,Tor_Lon], zoom_start=12)
#for lat, lon, label in zip(df_boroughs.Latitude, df_boroughs.Longitude, df_boroughs.index):
#    folium.Popup(label,parse_html=True)
#    folium.CircleMarker([lat,lon],
#                       radius=7,
#                       popup=label,
#                       color=c_array[np.where(label == df_boroughs.index)[0][0]],
#                       fill=False,
#                       parse_html=False).add_to(toronto_map)
    

for label, cluster, lat, lon in zip(clustered_toronto1.Neighborhood, clustered_toronto1.Cluster_label, clustered_toronto1.Latitude, clustered_toronto1.Longitude):
    folium.Popup(label+' ('+cluster_name1[cluster]+')',parse_html=True)
    folium.CircleMarker([lat,lon],
                       radius=5,
                       popup=label+' ('+cluster_name1[cluster]+')',
                       color=c_array[cluster],
                       fill=True,
                       fill_opacity=0.6,
                       parse_html=False).add_to(cluster_toronto_map1)


cluster_toronto_map1