# IBM Data Science Capstone Project

Note: this notebook is dedicated for the IBM Data Science certificate capstone project

### Week 3 - Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import requests
import lxml

import pandas as pd
import numpy as np

#### Webscrapping - wikipedia

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(url)
html = r.text
bs = BeautifulSoup(html)
#print(bs.prettify())  Check out the prettified html

In [3]:
result =[]
for item in bs.find_all('td'):
    result.append(item.get_text())

#### Data cleaning

In [4]:
#remove unwanted items
result=result[:-33]

In [5]:
postal_code=result[::3]
postal_code = [x[:-1] for x in postal_code]

In [6]:
borough = result[1::3]
borough = [x[:-1] for x in borough]

In [7]:
neighborhood = result[2::3]
neighborhood = [x[:-1] for x in neighborhood]

In [8]:
data = {'PostalCode': postal_code, 'borough': borough, 'neighborhood': neighborhood}
df = pd.DataFrame(data, columns=data.keys())

In [9]:
df.head()

Unnamed: 0,PostalCode,borough,neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
#ignore rows without an assigned borough
df = df[df.borough!='Not assigned']
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,PostalCode,borough,neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [11]:
#check if there are any 'Not assigned' neighborhood
sum(df.neighborhood=='Not assigned')

0

In [12]:
df.shape

(103, 3)

#### Get lat and long coordinates of each neighborhood

In [13]:
coords = pd.read_csv('Geospatial_Coordinates.csv', names=['PostalCode','Latitude','Longitude'], header=0)

In [14]:
coords.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
df = pd.merge(df, coords, on=['PostalCode'])
df.head(11)

Unnamed: 0,PostalCode,borough,neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


#### Cluster the neighborhoods in Toronto

Create a map of Toronto with neighborhoods superimposed

In [16]:
import folium

latitude = df.Latitude.mean()
longitude = df.Longitude.mean()
map_toronto = folium.Map(location=[latitude,longitude], zoom_start=10)

#add markers to map
for lat, long, borough, neighborhood in zip(df.Latitude, df.Longitude, df.borough, df.neighborhood):
    label = '{}.{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

Explore the first neighborhood in Toronto via Foursquares API

In [17]:
CLIENT_ID = 'CLIENT_ID_HIDDEN'
CLIENT_SECRET = 'CLIENT_SECRET_HIDDEN'
VERSION = '20180605'
LIMIT = 100
RADIUS = 500

In [18]:
# get the name of the first neighborhood on the list
df.loc[0,'neighborhood']

'Parkwoods'

In [19]:
# get the lat and long of this neighorbood
neighborhood_latitude = df.loc[0,'Latitude']
neighborhood_longitude = df.loc[0,'Longitude']

In [20]:
neighborhood_longitude

-79.3296565

In [21]:
# get top 100 venues in Parkwoods

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET,neighborhood_latitude, neighborhood_longitude, VERSION, RADIUS, LIMIT)

In [22]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f0df55761e04d7e7c55e685'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

In [23]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [24]:
# all the information is in the items key
venues = results['response']['groups'][0]['items']

In [25]:
# flatten JSON
nearby_venues = pd.json_normalize(venues)
nearby_venues

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,venue.location.distance,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups
0,e-0-4e8d9dcdd5fbbbb6b3003c7b-0,0,"[{'summary': 'This spot is popular', 'type': '...",4e8d9dcdd5fbbbb6b3003c7b,Brookbanks Park,Toronto,43.751976,-79.33214,"[{'label': 'display', 'lat': 43.75197604605557...",245,CA,Toronto,ON,Canada,"[Toronto, Toronto ON, Canada]","[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",0,[]
1,e-0-4e2f203a7d8bf59d2913aadf-1,0,"[{'summary': 'This spot is popular', 'type': '...",4e2f203a7d8bf59d2913aadf,Brookbanks Pool,15 Brookbanks dr,43.751389,-79.332184,"[{'label': 'display', 'lat': 43.75138897139317...",290,CA,North York,ON,Canada,"[15 Brookbanks dr, North York ON, Canada]","[{'id': '4bf58dd8d48988d15e941735', 'name': 'P...",0,[]
2,e-0-4cb11e2075ebb60cd1c4caad-2,0,"[{'summary': 'This spot is popular', 'type': '...",4cb11e2075ebb60cd1c4caad,Variety Store,29 Valley Woods Road,43.751974,-79.333114,"[{'label': 'display', 'lat': 43.75197441585782...",312,CA,Toronto,ON,Canada,"[29 Valley Woods Road, Toronto ON, Canada]","[{'id': '4bf58dd8d48988d1f9941735', 'name': 'F...",0,[]


In [26]:
filtered_col = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:,filtered_col]

In [27]:
nearby_venues.head()

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Brookbanks Park,"[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",43.751976,-79.33214
1,Brookbanks Pool,"[{'id': '4bf58dd8d48988d15e941735', 'name': 'P...",43.751389,-79.332184
2,Variety Store,"[{'id': '4bf58dd8d48988d1f9941735', 'name': 'F...",43.751974,-79.333114


In [28]:
# filter the category row, apply function defined
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

In [29]:
# clean column headings
nearby_venues.columns = [col.split('.')[-1] for col in nearby_venues.columns]

In [30]:
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Brookbanks Pool,Pool,43.751389,-79.332184
2,Variety Store,Food & Drink Shop,43.751974,-79.333114


In [31]:
print('There are {} venues in {} returned by Foursquare'.format(nearby_venues.shape[0], df.loc[0,'neighborhood']))

There are 3 venues in Parkwoods returned by Foursquare


Explore all neighborhoods in Toronto via Foursquares API

In [32]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    #set up an empty venue_list
    venues_list=[]
    
    #create a list of (name, lat, long), and print name
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
    
        #create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, lat, lng, VERSION, RADIUS, LIMIT)
    
        #make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
    
        #retrive venues from Foursquare
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [33]:
toronto_venues = getNearbyVenues(names=df['neighborhood'],
                                latitudes=df['Latitude'],
                                longitudes=df['Longitude'])

In [34]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Brookbanks Pool,43.751389,-79.332184,Pool
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [35]:
print(toronto_venues.shape)

(2125, 7)


In [36]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Alderwood, Long Branch",7,7,7,7,7,7
"Bathurst Manor, Wilson Heights, Downsview North",22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",26,26,26,26,26,26
...,...,...,...,...,...,...
"Willowdale, Willowdale West",6,6,6,6,6,6
Woburn,4,4,4,4,4,4
Woodbine Heights,8,8,8,8,8,8
York Mills West,2,2,2,2,2,2


In [37]:
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 267 unique categories.


Analyze each neighborhood

In [38]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot.head()

Unnamed: 0,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
#add neighborhood col to the dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']
toronto_onehot.head()

Unnamed: 0,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
toronto_onehot.columns[185]

'Music Venue'

In [41]:
# move Neighborhood as the first col
fixed_columns = [toronto_onehot.columns[185]] + list(toronto_onehot.columns[:185]) + list(toronto_onehot.columns[186:])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,Music Venue,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [43]:
toronto_grouped.shape

(95, 267)

Create a dataframe with top venues in each neighborhood

In [44]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

In [45]:
num_top_venues = 10

indicators = ['st','nd','rd']

#create columns accoding to the number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

#create a new dataframe
neighborhood_venues_sorted = pd.DataFrame(columns=columns)
neighborhood_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhood_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind,:], num_top_venues)

neighborhood_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Clothing Store,Skating Rink,Breakfast Spot,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
1,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Skating Rink,Sandwich Place,Pub,Dim Sum Restaurant,Dance Studio,Deli / Bodega,Department Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Shopping Mall,Frozen Yogurt Shop,Supermarket,Sushi Restaurant,Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Deli / Bodega
3,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Yoga Studio,Deli / Bodega,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Restaurant,Sandwich Place,Juice Bar,Pharmacy,Indian Restaurant,Café,Pub,Sushi Restaurant


### Clustering Neighborhoods

In [46]:
from sklearn.cluster import KMeans

k = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood',1)
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [47]:
#add clustering labels to the neighborhood_venues_sorted df

neighborhood_venues_sorted.insert(0,'Cluster Label',kmeans.labels_)


In [48]:
neighborhood_venues_sorted.head()

Unnamed: 0,Cluster Label,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Agincourt,Lounge,Latin American Restaurant,Clothing Store,Skating Rink,Breakfast Spot,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
1,0,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Skating Rink,Sandwich Place,Pub,Dim Sum Restaurant,Dance Studio,Deli / Bodega,Department Store
2,0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Shopping Mall,Frozen Yogurt Shop,Supermarket,Sushi Restaurant,Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Deli / Bodega
3,0,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Yoga Studio,Deli / Bodega,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
4,0,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Restaurant,Sandwich Place,Juice Bar,Pharmacy,Indian Restaurant,Café,Pub,Sushi Restaurant


In [49]:
toronto_data = df.drop('PostalCode',1)
toronto_data.rename(columns={'borough':'Borough','neighborhood':'Neighborhood'}, inplace=True)

In [50]:
toronto_merged = pd.merge(toronto_data, neighborhood_venues_sorted, on='Neighborhood')

In [51]:
toronto_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,Parkwoods,43.753259,-79.329656,7,Park,Pool,Food & Drink Shop,Yoga Studio,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
1,North York,Victoria Village,43.725882,-79.315572,0,Hockey Arena,Coffee Shop,Intersection,Portuguese Restaurant,French Restaurant,Yoga Studio,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
2,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Bakery,Park,Breakfast Spot,Café,Pub,Theater,Yoga Studio,Spa,Shoe Store
3,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Furniture / Home Store,Clothing Store,Event Space,Accessories Store,Miscellaneous Shop,Boutique,Vietnamese Restaurant,Coffee Shop,Discount Store,Dessert Shop
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Diner,Yoga Studio,Sandwich Place,Park,Mexican Restaurant,Italian Restaurant,Hobby Shop,General Entertainment,Fried Chicken Joint


In [52]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

Cluster 1

In [57]:
cluster1=toronto_merged.loc[toronto_merged['Cluster Label'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster1.head(5)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Victoria Village,Hockey Arena,Coffee Shop,Intersection,Portuguese Restaurant,French Restaurant,Yoga Studio,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
2,"Regent Park, Harbourfront",Coffee Shop,Bakery,Park,Breakfast Spot,Café,Pub,Theater,Yoga Studio,Spa,Shoe Store
3,"Lawrence Manor, Lawrence Heights",Furniture / Home Store,Clothing Store,Event Space,Accessories Store,Miscellaneous Shop,Boutique,Vietnamese Restaurant,Coffee Shop,Discount Store,Dessert Shop
4,"Queen's Park, Ontario Provincial Government",Coffee Shop,Diner,Yoga Studio,Sandwich Place,Park,Mexican Restaurant,Italian Restaurant,Hobby Shop,General Entertainment,Fried Chicken Joint
6,Don Mills,Gym,Beer Store,Japanese Restaurant,Restaurant,Coffee Shop,Clothing Store,Italian Restaurant,Supermarket,Discount Store,Caribbean Restaurant


In [75]:
#Most common venues from each col

common_categories=[]

n = len(cluster1.columns)+1
for col in cluster1.columns[1:n]:
    common_categories.append(cluster1[col].value_counts().idxmax())
    
common_categories

['Coffee Shop',
 'Coffee Shop',
 'Restaurant',
 'Bakery',
 'Restaurant',
 'Food Truck',
 'Italian Restaurant',
 'Diner',
 'Discount Store',
 'Dessert Shop']

In [78]:
#Top 3 most common venues from cluster 1

cluster1['1st Most Common Venue'].value_counts()[:3]

Coffee Shop      17
Café              8
Grocery Store     7
Name: 1st Most Common Venue, dtype: int64

In [104]:
def find_most_common_venues(df):
    for i in range(3,0,-1):
        while True:
            try:
                venues = df['1st Most Common Venue'].value_counts()[:i].index.tolist()
                print(venues)
                break
            except:
                venues = df['1st Most Common Venue'].value_counts()[:i-1].index.tolist()

In [109]:
cluster2=toronto_merged.loc[toronto_merged['Cluster Label'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster3=toronto_merged.loc[toronto_merged['Cluster Label'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster4=toronto_merged.loc[toronto_merged['Cluster Label'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster5=toronto_merged.loc[toronto_merged['Cluster Label'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster6=toronto_merged.loc[toronto_merged['Cluster Label'] == 5, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster7=toronto_merged.loc[toronto_merged['Cluster Label'] == 6, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster8=toronto_merged.loc[toronto_merged['Cluster Label'] == 7, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster9=toronto_merged.loc[toronto_merged['Cluster Label'] == 8, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster10=toronto_merged.loc[toronto_merged['Cluster Label'] == 9, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [110]:
find_most_common_venues(cluster2)

['Playground']
['Playground']
['Playground']


In [111]:
find_most_common_venues(cluster3)

['Paper / Office Supplies Store', 'Baseball Field']
['Paper / Office Supplies Store', 'Baseball Field']
['Paper / Office Supplies Store']


In [112]:
find_most_common_venues(cluster4)

['Basketball Court', 'Park']
['Basketball Court', 'Park']
['Basketball Court']


In [113]:
find_most_common_venues(cluster5)

['River']
['River']
['River']


In [114]:
find_most_common_venues(cluster6)

['Cafeteria']
['Cafeteria']
['Cafeteria']


In [115]:
find_most_common_venues(cluster7)

['Golf Course']
['Golf Course']
['Golf Course']


In [116]:
find_most_common_venues(cluster8)

['Park', 'Trail']
['Park', 'Trail']
['Park']


In [117]:
find_most_common_venues(cluster9)

['Fast Food Restaurant']
['Fast Food Restaurant']
['Fast Food Restaurant']


In [118]:
find_most_common_venues(cluster10)

['Convenience Store', 'Park']
['Convenience Store', 'Park']
['Convenience Store']
