# IBM Applied Data Science Capstone project

In [1]:
import pandas as pd
import numpy as np

In [2]:
import requests
import lxml
from bs4 import BeautifulSoup

In [3]:
from geopy.geocoders import Nominatim 

In [101]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

## Table of Contents

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Part 1. Create DataFrama</a>

2. <a href="#item2">Part 2. Add coordinates</a>

3. <a href="#item3">Part 3. Explore and cluster</a>

</font>
</div>

# Part1. Create DataFrame with Postal Codes <a class="anchor" id="item1"></a>

### Get data into data frame

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
page_soup = BeautifulSoup(page.content, 'html.parser')     

In [5]:
table_raw = page_soup.table

In [6]:
tbl_header = []
for l in table_raw.find_all('th'):
    tbl_header.append(l.string.strip('\n'))

In [7]:
tbl_header

['Postal code', 'Borough', 'Neighborhood']

In [8]:
tbl_content = []
for l in table_raw.find_all('td'):
    tbl_content.append(l.string.strip('\n'))

In [9]:
len(tbl_content)

540

In [10]:
n_cols = len(tbl_header)
tbl_content_split = [tbl_content[x:x+n_cols] for x in range(0, len(tbl_content), n_cols)]

In [11]:
toronto_postal_codes_raw = pd.DataFrame(tbl_content_split, columns=tbl_header)

### Cleanup data

In [12]:
new_df = toronto_postal_codes_raw[toronto_postal_codes_raw['Borough'] != 'Not assigned']

In [13]:
# There are no duplicates unlike have been told in the exercise
new_df[new_df.duplicated('Postal code')]

Unnamed: 0,Postal code,Borough,Neighborhood


In [14]:
# There are no Neighborhood with 'Not assigned' or empty 
new_df[(new_df['Neighborhood'] == 'Not assigned') | (new_df['Neighborhood'] == '')]

Unnamed: 0,Postal code,Borough,Neighborhood


In [15]:
# Replace / in Neighborhood with ,
new_df['Neighborhood'].replace(' /', ',', regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [16]:
toronto_postal_codes = new_df.reset_index(drop=True)

In [17]:
toronto_postal_codes.rename(columns={'Postal code': 'Postal Code'}, inplace=True)

In [18]:
toronto_postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [19]:
toronto_postal_codes.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [20]:
toronto_postal_codes.shape

(103, 3)

# Part2. Add Latitude and Longitude to Postal Codes <a class="anchor" id="item2"></a>

In [None]:
import geocoder

In [None]:
geocoder.google('{}, Toronto, Ontario'.format('M8X'))

In [179]:
print(g.latlng)

None


In [23]:
def get_location(postal_code):

    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude, longitude

Ok, lets simply add csv with coordinates from Coursera

In [24]:
url = 'https://cocl.us/Geospatial_data'

In [25]:
df_lat_lon = pd.read_csv(url)

In [26]:
df_lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [27]:
df_lat_lon.shape

(103, 3)

In [28]:
toronto_postal_codes_w_coords = pd.merge(
    toronto_postal_codes,
    df_lat_lon,
    on='Postal Code')

In [30]:
toronto_postal_codes_w_coords.head(11)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Part3. Explore neighborhoods and cluster <a class="anchor" id="item3"></a>

In [31]:
import folium

### Create map

In [32]:
latitude, longitude = 43.753259, -79.329656
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(
    toronto_postal_codes_w_coords['Latitude'],
    toronto_postal_codes_w_coords['Longitude'], 
    toronto_postal_codes_w_coords['Borough'], 
    toronto_postal_codes_w_coords['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Lets explore and cluster East York area

In [33]:
east_york = toronto_postal_codes_w_coords[toronto_postal_codes_w_coords['Borough'] == 'East York']
east_york.reset_index(inplace=True, drop=True)

In [34]:
east_york.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M4G,East York,Leaside,43.70906,-79.363452
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372
4,M4J,East York,East Toronto,43.685347,-79.338106


In [63]:
east_york.shape

(5, 5)

Only 5 neighborhoods, not that much to cluster, lets see some other boroughs

In [69]:
toronto_postal_codes_w_coords['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

**North York has top number of neighborhoods, lets cluster this one**

In [73]:
borough_df = toronto_postal_codes_w_coords[toronto_postal_codes_w_coords['Borough'] == 'North York']
borough_df.reset_index(inplace=True, drop=True)
borough_df.shape

(24, 5)

In [74]:
# Lets get Borough coordinates
address = 'North York, Toronto'

geolocator = Nominatim(user_agent="new_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of North York, Toronto are 43.7543263, -79.44911696639593.


In [75]:
# We visualize it
# create map of borough using latitude and longitude values
map_borough = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(
    borough_df['Latitude'], 
    borough_df['Longitude'], 
    borough_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_borough)  
    
map_borough

### Clustering

In [39]:
# The code was removed by Watson Studio for sharing.

In [72]:
# Some definitions
LIMIT = 100

Helper function

In [40]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [76]:
borough_venues = getNearbyVenues(names=borough_df['Neighborhood'],
                                   latitudes=borough_df['Latitude'],
                                   longitudes=borough_df['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale
Downsview
York Mills West
Willowdale


In [77]:
print(borough_venues.shape)
borough_venues.head()

(635, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,Parkwoods,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant
4,Parkwoods,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.32463,Grocery Store


In [78]:
# How many venues per neighborhood
borough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Wilson Heights, Downsview North",30,30,30,30,30,30
Bayview Village,15,15,15,15,15,15
"Bedford Park, Lawrence Manor East",42,42,42,42,42,42
Don Mills,76,76,76,76,76,76
Downsview,66,66,66,66,66,66
"Fairview, Henry Farm, Oriole",44,44,44,44,44,44
Glencairn,31,31,31,31,31,31
Hillcrest Village,20,20,20,20,20,20
Humber Summit,11,11,11,11,11,11
"Humberlea, Emery",8,8,8,8,8,8


In [79]:
# How many uniq categories
print('There are {} uniques categories.'.format(len(borough_venues['Venue Category'].unique())))

There are 152 uniques categories.


### Analyze each neighborhood

In [80]:
# one hot encoding
borough_onehot = pd.get_dummies(borough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
borough_onehot['Neighborhood'] = borough_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [borough_onehot.columns[-1]] + list(borough_onehot.columns[:-1])
borough_onehot = borough_onehot[fixed_columns]

borough_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Baby Store,Bagel Shop,Bakery,...,Theater,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
borough_onehot.shape

(635, 153)

In [82]:
# Lets regroup by frequency of occuring categories
borough_grouped = borough_onehot.groupby('Neighborhood').mean().reset_index()
borough_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Baby Store,Bagel Shop,Bakery,...,Theater,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.033333,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.02381,0.0,0.0,0.0,0.02381,0.02381,0.02381,...,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.02381,0.0,0.0
3,Don Mills,0.0,0.0,0.013158,0.0,0.039474,0.013158,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013158,0.0
4,Downsview,0.0,0.015152,0.015152,0.0,0.0,0.030303,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.030303,0.0,0.075758,0.0,0.0,0.0
5,"Fairview, Henry Farm, Oriole",0.0,0.0,0.022727,0.0,0.022727,0.0,0.0,0.0,0.045455,...,0.022727,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Glencairn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Humber Summit,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Humberlea, Emery",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
borough_grouped.shape

(19, 153)

In [58]:
# Print each neighborhood with 5 top venues
num_top_venues = 5

for hood in borough_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = borough_grouped[borough_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----East Toronto----
                  venue  freq
0      Greek Restaurant  0.08
1                  Café  0.08
2           Coffee Shop  0.08
3           Pizza Place  0.03
4  Fast Food Restaurant  0.03


----Leaside----
                    venue  freq
0     Sporting Goods Shop  0.06
1             Coffee Shop  0.06
2  Furniture / Home Store  0.05
3       Electronics Store  0.05
4           Grocery Store  0.05


----Parkview Hill, Woodbine Gardens----
                  venue  freq
0  Fast Food Restaurant  0.09
1  Gym / Fitness Center  0.09
2           Coffee Shop  0.09
3                Bakery  0.09
4           Pizza Place  0.09


----Thorncliffe Park----
               venue  freq
0        Coffee Shop  0.10
1  Indian Restaurant  0.06
2      Grocery Store  0.06
3               Bank  0.04
4            Brewery  0.04


----Woodbine Heights----
                venue  freq
0                Park  0.13
1         Coffee Shop  0.10
2  Athletics & Sports  0.07
3     Thai Restaurant  0.07
4      Sand

Helper function

In [84]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [98]:
# Let's crete new dataframe with 10 top venues
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = borough_grouped['Neighborhood']

for ind in np.arange(borough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(borough_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Pizza Place,Bank,Mediterranean Restaurant,Shopping Mall,Sandwich Place,Dog Run,Diner,Restaurant,Men's Store
1,Bayview Village,Gas Station,Bank,Japanese Restaurant,Intersection,Café,Chinese Restaurant,Restaurant,Skating Rink,Shopping Mall,Trail
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Pizza Place,Sushi Restaurant,Sandwich Place,Fast Food Restaurant,Restaurant,Bank,Butcher,Café
3,Don Mills,Restaurant,Coffee Shop,Japanese Restaurant,Gym,Bank,Supermarket,Asian Restaurant,Burger Joint,Mobile Phone Shop,Pizza Place
4,Downsview,Vietnamese Restaurant,Coffee Shop,Hotel,Pizza Place,Grocery Store,Gas Station,Park,Sandwich Place,Fast Food Restaurant,Liquor Store


In [86]:
neighborhoods_venues_sorted.shape

(19, 11)

### Cluster

In [88]:
from sklearn.cluster import KMeans

In [96]:
# set number of clusters
kclusters = 5

borough_grouped_clustering = borough_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(borough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 4, 3], dtype=int32)

In [99]:
# New dataframe with cluster and top venues
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

borough_merged = borough_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
borough_merged = borough_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

borough_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1,Park,Convenience Store,Pharmacy,Bus Stop,Shopping Mall,Fast Food Restaurant,Pizza Place,Chinese Restaurant,Caribbean Restaurant,Café
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Park,Coffee Shop,Portuguese Restaurant,Boxing Gym,Gym / Fitness Center,Grocery Store,Golf Course,Lounge,Men's Store,Sporting Goods Shop
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1,Clothing Store,Restaurant,Fast Food Restaurant,Coffee Shop,Sushi Restaurant,Vietnamese Restaurant,Women's Store,Furniture / Home Store,Fried Chicken Joint,Dessert Shop
3,M3B,North York,Don Mills,43.745906,-79.352188,1,Restaurant,Coffee Shop,Japanese Restaurant,Gym,Bank,Supermarket,Asian Restaurant,Burger Joint,Mobile Phone Shop,Pizza Place
4,M6B,North York,Glencairn,43.709577,-79.445073,1,Grocery Store,Gym,Fast Food Restaurant,Gas Station,Coffee Shop,Park,Pizza Place,Italian Restaurant,Mediterranean Restaurant,Photography Lab


In [102]:
# Put on the map

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(
    borough_merged['Latitude'], 
    borough_merged['Longitude'], 
    borough_merged['Neighborhood'], 
    borough_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine cluster

In [164]:
for i in range(kclusters):
    print('Cluster {}:'.format(i))
    #print(borough_merged.loc[borough_merged['Cluster Labels'] == i, borough_merged.columns[[2] + list(range(5, borough_merged.shape[1]))]])
    print(borough_merged.loc[borough_merged['Cluster Labels'] == i, borough_merged.columns[[2] + list(range(5, 7))]])

Cluster 0:
        Neighborhood  Cluster Labels 1st Most Common Venue
1   Victoria Village               0                  Park
22   York Mills West               0           Coffee Shop
Cluster 1:
                                       Neighborhood  Cluster Labels  \
0                                         Parkwoods               1   
2                  Lawrence Manor, Lawrence Heights               1   
3                                         Don Mills               1   
4                                         Glencairn               1   
5                                         Don Mills               1   
6                                 Hillcrest Village               1   
7   Bathurst Manor, Wilson Heights, Downsview North               1   
8                      Fairview, Henry Farm, Oriole               1   
9                   Northwood Park, York University               1   
10                                  Bayview Village               1   
11                  

In [169]:
borough_merged.loc[borough_merged['Cluster Labels'] == 0, borough_merged.columns[[2] + list(range(5, borough_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Victoria Village,0,Park,Coffee Shop,Portuguese Restaurant,Boxing Gym,Gym / Fitness Center,Grocery Store,Golf Course,Lounge,Men's Store,Sporting Goods Shop
22,York Mills West,0,Coffee Shop,Park,Restaurant,Tennis Court,Spa,Optical Shop,Bowling Alley,French Restaurant,Dog Run,Business Service


In [170]:
borough_merged.loc[borough_merged['Cluster Labels'] == 1, borough_merged.columns[[2] + list(range(5, borough_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,1,Park,Convenience Store,Pharmacy,Bus Stop,Shopping Mall,Fast Food Restaurant,Pizza Place,Chinese Restaurant,Caribbean Restaurant,Café
2,"Lawrence Manor, Lawrence Heights",1,Clothing Store,Restaurant,Fast Food Restaurant,Coffee Shop,Sushi Restaurant,Vietnamese Restaurant,Women's Store,Furniture / Home Store,Fried Chicken Joint,Dessert Shop
3,Don Mills,1,Restaurant,Coffee Shop,Japanese Restaurant,Gym,Bank,Supermarket,Asian Restaurant,Burger Joint,Mobile Phone Shop,Pizza Place
4,Glencairn,1,Grocery Store,Gym,Fast Food Restaurant,Gas Station,Coffee Shop,Park,Pizza Place,Italian Restaurant,Mediterranean Restaurant,Photography Lab
5,Don Mills,1,Restaurant,Coffee Shop,Japanese Restaurant,Gym,Bank,Supermarket,Asian Restaurant,Burger Joint,Mobile Phone Shop,Pizza Place
6,Hillcrest Village,1,Pharmacy,Coffee Shop,Park,Korean Restaurant,Convenience Store,Pizza Place,Recreation Center,Residential Building (Apartment / Condo),Restaurant,Chinese Restaurant
7,"Bathurst Manor, Wilson Heights, Downsview North",1,Coffee Shop,Pizza Place,Bank,Mediterranean Restaurant,Shopping Mall,Sandwich Place,Dog Run,Diner,Restaurant,Men's Store
8,"Fairview, Henry Farm, Oriole",1,Clothing Store,Coffee Shop,Restaurant,Sandwich Place,Bank,Japanese Restaurant,Bakery,Electronics Store,Burger Joint,Shopping Mall
9,"Northwood Park, York University",1,Pizza Place,Restaurant,Coffee Shop,Furniture / Home Store,Metro Station,Middle Eastern Restaurant,Fast Food Restaurant,Caribbean Restaurant,Sandwich Place,Sports Bar
10,Bayview Village,1,Gas Station,Bank,Japanese Restaurant,Intersection,Café,Chinese Restaurant,Restaurant,Skating Rink,Shopping Mall,Trail


In [171]:
borough_merged.loc[borough_merged['Cluster Labels'] == 2, borough_merged.columns[[2] + list(range(5, borough_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,"York Mills, Silver Hills",2,Park,Pool,Discount Store,Falafel Restaurant,Event Space,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run


In [172]:
borough_merged.loc[borough_merged['Cluster Labels'] == 3, borough_merged.columns[[2] + list(range(5, borough_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,"Humberlea, Emery",3,Convenience Store,Park,Storage Facility,Discount Store,Gas Station,Bakery,Golf Course,Intersection,Dim Sum Restaurant,Dessert Shop


In [173]:
borough_merged.loc[borough_merged['Cluster Labels'] == 4, borough_merged.columns[[2] + list(range(5, borough_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Humber Summit,4,Electronics Store,Pizza Place,Empanada Restaurant,Arts & Crafts Store,Pharmacy,Shopping Mall,Park,Italian Restaurant,Bakery,Bank


It looks like:

Cluster 0 has Parks and Coffee Shops 

Cluster 1 has Restaurants, Groceries and Coffee Shops, also this cluster is the most dense

Cluster 2 has Parks 

Cluster 3 has Conveniece Stores 

Cluster 4 has Electronics Stores 
