# Segmenting and Clustering neighbourhoods of Toronto

### **This Section contains initial process of extracting data from website and preprocessing the dataframe as per requirements mentioned in first part of assignment**

### **Part 1**


In [5]:
#Installing beautifulSoup and xml parser

!pip install beautifulsoup4
!pip install lxml

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/3b/c8/a55eb6ea11cd7e5ac4bacdf92bac4693b90d3ba79268be16527555e186f0/beautifulsoup4-4.8.1-py3-none-any.whl (101kB)
[K     |████████████████████████████████| 102kB 19.0MB/s ta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.1 soupsieve-1.9.5
Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/ec/be/5ab8abdd8663c0386ec2dd595a5bc0e23330a0549b8a91e32f38c20845b6/lxml-4.4.1-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 10.3MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.4.1


In [6]:
#Importing required libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [7]:
#Getting the wikipedia page into soup object html response

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

wikipagecontent = requests.get(url)

wikipagecontent

soup = BeautifulSoup(wikipagecontent.text,'lxml')

#### a.Scrape the wikipedia page to create a dataframe

In [8]:

row_dataframe=[]
table_soup=soup.find("table",class_="wikitable sortable")
#print(table_soup)


#Get table Heading and row contents from soup object
headings=table_soup.tbody.find_all('th')
table_data=table_soup.tbody.find_all('tr')


#extract Columns of dataframe     
columns_dataframe=[th.text.strip() for th in headings]

for tr in table_data:
 #Extract rows of dataframe   
    if tr.find_all('th'):
        continue;
    tds=tr.find_all('td')
    data = [td.text.strip() for td in tds];
    row_dataframe.append(data)
    

#Get the dataframe

postal_df=pd.DataFrame(row_dataframe,columns=columns_dataframe)

postal_df






Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


#### b. Drop data with Borough as unassigned

In [9]:
postal_df.drop(postal_df[postal_df['Borough']=='Not assigned'].index,inplace=True)

postal_df.reset_index(drop=True)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
...,...,...,...
206,M8Z,Etobicoke,Kingsway Park South West
207,M8Z,Etobicoke,Mimico NW
208,M8Z,Etobicoke,The Queensway West
209,M8Z,Etobicoke,Royal York South West


#### c. Merge the data within neighbourhood into comma seperated values for those which belong to same group of postal code

In [10]:
postal_grp=postal_df.groupby(['Postcode','Borough'],as_index=False)['Neighbourhood'].apply(lambda x: ','.join(x)).reset_index()
#postal_grp.rename(columns={"0":"Neighbourhood"},inplace=True)
postal_grp

postal_grp.rename(columns={0:"Neighbourhood"},inplace=True)

postal_grp





Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


#### d. For records which has Borough but unassigned Neighbourhood.Make Neighbourhood same as borough

In [11]:


## Unassigned neighbourhood corresponding to Postcode M7A

postal_grp[postal_grp['Borough'] == 'Not assigned']
unassigned_neighbourhood=postal_grp[postal_grp['Neighbourhood']=='Not assigned']

print(unassigned_neighbourhood)

not_assigned_borough=unassigned_neighbourhood['Borough']
postal_grp['Neighbourhood'].replace(['Not assigned'],not_assigned_borough,inplace=True)

##THere is no unassigned Neighbourhood after replacement.After replacement The neighbourhood corresponding to M7A is assigned to corresponding Borough


postal_grp[postal_grp['Postcode'] == 'M7A']

   Postcode       Borough Neighbourhood
85      M7A  Queen's Park  Not assigned


Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


#### e. shape of dataframe

In [12]:
postal_grp.shape

(103, 3)

### **This Section contains  process of getting coordinates of neighbourhoods and merging it with the dataframe.**

### **Part 2**

In [54]:
cordinatesdf=pd.read_csv('http://cocl.us/Geospatial_data')
cordinatesdf.head(5)
for code in postal_grp['Postcode']:
    
    #print(cordinatesdf[['Latitude','Longitude']].loc[cordinatesdf['Postal Code']==code])

    postal_grp['Latitude'].loc[postal_grp['Postcode']==code] =cordinatesdf['Latitude'].loc[cordinatesdf['Postal Code']==code]
    postal_grp['Longitude'].loc[postal_grp['Postcode']==code] =cordinatesdf['Longitude'].loc[cordinatesdf['Postal Code']==code]

    
    #print(postal_grp['Latitude'])
    
    #postal_grp[['Latitude','Longitude']]

postal_grp



Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


### **This Section describes the clustering exploring of neighbourhood of Downtown Toronto region and displying map of venues around it**

### **Part 3**

In [71]:
toronto_data=postal_grp[postal_grp['Borough'] =='Downtown Toronto'].reset_index(drop=True)
toronto_data=toronto_data[['Neighbourhood','Latitude','Longitude']]
toronto_data

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,Rosedale,43.679563,-79.377529
1,"Cabbagetown,St. James Town",43.667967,-79.367675
2,Church and Wellesley,43.66586,-79.38316
3,"Harbourfront,Regent Park",43.65426,-79.360636
4,"Ryerson,Garden District",43.657162,-79.378937
5,St. James Town,43.651494,-79.375418
6,Berczy Park,43.644771,-79.373306
7,Central Bay Street,43.657952,-79.387383
8,"Adelaide,King,Richmond",43.650571,-79.384568
9,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752


### a.Importing required Libraries

In [59]:
import numpy as np # library to handle data in a vectorized manner


import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          91 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ##

#### Getting the coordinates of Downtown Toronto using geolocator and creating a map

In [62]:
address = 'Downtown Toronto,Canada'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.655115, -79.380219.


In [72]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, Neighbourhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    label = '{}'.format(Neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### c. Explore venues around Manhattan by using Fousquare API

In [73]:
CLIENT_ID = 'QJ3LCZ30QJFWZGOFIDWMN5MTLB0DANIGIWOGT3VE3JXHNGL0' # your Foursquare ID
CLIENT_SECRET = 'FTYTSLO110IBMENPFOCH53BNUHUM53XP4UCTQDVS3YRF4LGR' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: QJ3LCZ30QJFWZGOFIDWMN5MTLB0DANIGIWOGT3VE3JXHNGL0
CLIENT_SECRET:FTYTSLO110IBMENPFOCH53BNUHUM53XP4UCTQDVS3YRF4LGR


#### d.Create a function to get details of 10 venues within radius of 500 meters

In [74]:
def getNearbyVenues(neighbour, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(neighbour, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            10)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [76]:
toronto_venues = getNearbyVenues(neighbour=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )


Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie


In [77]:
toronto_venues.head(5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown,St. James Town",43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant


In [80]:
toronto_venues.groupby('Neighborhood',as_index=True)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f7e4a0d3a58>

#### One hot encoding different categories of venues in neighbourhood

In [88]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#print(toronto_onehot)
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

#toronto_onehot.head()

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()


Unnamed: 0,Vegetarian / Vegan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arts & Crafts Store,Asian Restaurant,BBQ Joint,...,Speakeasy,Sporting Goods Shop,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Get frequency of each category for every neighbourood

In [90]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Vegetarian / Vegan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arts & Crafts Store,Asian Restaurant,...,Speakeasy,Sporting Goods Shop,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail
0,"Adelaide,King,Richmond",0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.1,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.1,0.1,0.2,0.1,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0
5,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0
8,"Commerce Court,Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Design Exchange,Toronto Dominion Centre",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Get top 5 venue categories for each neighbourhood in Toronto

In [95]:
for neighbour in toronto_grouped['Neighborhood']:
    print("----"+neighbour+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == neighbour].T.reset_index()
    temp.columns = ['venue','freq']
    #print(temp)
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(5))
    print('\n')

----Adelaide,King,Richmond----
                           venue  freq
0               Asian Restaurant   0.2
1  Vegetarian / Vegan Restaurant   0.1
2                          Plaza   0.1
3                    Opera House   0.1
4                   Concert Hall   0.1


----Berczy Park----
                           venue  freq
0  Vegetarian / Vegan Restaurant   0.1
1                     Steakhouse   0.1
2                           Park   0.1
3                   Concert Hall   0.1
4                 Breakfast Spot   0.1


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
                venue  freq
0      Airport Lounge   0.2
1    Airport Terminal   0.2
2     Harbor / Marina   0.1
3  Airport Food Court   0.1
4     Airport Service   0.1


----Cabbagetown,St. James Town----
                 venue  freq
0                 Café   0.2
1  Japanese Restaurant   0.1
2           Restaurant   0.1
3                Diner   0.1
4        Jewelry S

#### Function to return sorted top venues


In [96]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [98]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Asian Restaurant,Steakhouse,Pizza Place,Plaza,Hotel,Speakeasy,Concert Hall,Vegetarian / Vegan Restaurant,Opera House,Creperie
1,Berczy Park,Vegetarian / Vegan Restaurant,Concert Hall,French Restaurant,Museum,Breakfast Spot,Liquor Store,Cocktail Bar,Park,Steakhouse,Farmers Market
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Terminal,Airport,Airport Food Court,Airport Service,Coffee Shop,Harbor / Marina,Boutique,Trail,Dance Studio
3,"Cabbagetown,St. James Town",Café,Italian Restaurant,Jewelry Store,General Entertainment,Diner,Indian Restaurant,Bakery,Japanese Restaurant,Restaurant,Clothing Store
4,Central Bay Street,Coffee Shop,Park,Italian Restaurant,Gastropub,Modern European Restaurant,Sushi Restaurant,Spa,Ramen Restaurant,Café,Candy Store


#### Clustering data

In [101]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_[0:10])
toronto_grouped_clustering

[1 1 0 2 3 2 2 1 2 3]


Unnamed: 0,Vegetarian / Vegan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arts & Crafts Store,Asian Restaurant,BBQ Joint,...,Speakeasy,Sporting Goods Shop,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail
0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,...,0.1,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.1,0.1,0.2,0.1,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### MErge the toronto grouped data with neighbourhood sorted venue data

In [104]:
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Rosedale,43.679563,-79.377529,4,Park,Trail,Playground,Comic Shop,Café,Candy Store,Clothing Store,Cocktail Bar,Coffee Shop,College Gym
1,"Cabbagetown,St. James Town",43.667967,-79.367675,2,Café,Italian Restaurant,Jewelry Store,General Entertainment,Diner,Indian Restaurant,Bakery,Japanese Restaurant,Restaurant,Clothing Store
2,Church and Wellesley,43.66586,-79.38316,1,Breakfast Spot,Dance Studio,Bubble Tea Shop,Mexican Restaurant,Ramen Restaurant,Restaurant,Salon / Barbershop,Theme Restaurant,Tea Room,Gastropub
3,"Harbourfront,Regent Park",43.65426,-79.360636,2,Breakfast Spot,Park,Restaurant,Gym / Fitness Center,Historic Site,Coffee Shop,Pub,Spa,Bakery,Candy Store
4,"Ryerson,Garden District",43.657162,-79.378937,1,Comic Shop,Burger Joint,Café,Pizza Place,Clothing Store,Plaza,Ramen Restaurant,Burrito Place,Tea Room,Theater


#### Visulize the clusterdata

In [106]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters