# Web scraping and data preprocessing

In [3]:
# install packages
! pip install beautifulsoup4
! pip install requests



In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# open the url and get the html
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = urlopen(url)
print(type(html))

<class 'http.client.HTTPResponse'>


In [5]:
# use BeautifulSoup to parse the html
soup = BeautifulSoup(html, "html.parser")
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [157]:
# Pick the first <table> whose class attribute is 'wikitable sortable'.
my_table = soup.find(name='table', attrs={'class': 'wikitable sortable'})

## Convert the html table to a Pandas Dataframe

In [13]:
from io import StringIO

import pandas as pd
import numpy as np


# Newer pandas releases deprecate passing literal HTML to read_html, so the
# markup is wrapped in a file-like object. read_html returns a list of
# DataFrames; the first one is the table located above.
toronto = pd.read_html(StringIO(str(my_table)))[0]
toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


## Now drop the 'Not assigned' Boroughs

In [14]:
# Find the row labels whose Borough is 'Not assigned' and drop those rows.
unassigned_rows = toronto['Borough'].eq('Not assigned')
not_assigned_idx = toronto.index[unassigned_rows]
toronto_clean = toronto.drop(index=not_assigned_idx)
toronto_clean.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


Combine the Neighbourhoods with the same postcode, using `groupby`\
Also reset the index

In [15]:
toronto_clean = toronto_clean.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
toronto_clean.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


Check if there is a Not assigned neighborhood

In [16]:
toronto_clean.loc[toronto_clean['Neighbourhood']=='Not assigned']


Unnamed: 0,Postcode,Borough,Neighbourhood


No 'Not assigned' neighborhood, so we can go on.

## Add coordinates to the locations
Use geocoder; the `arcgis` provider gave the best results. `Nominatim` and `google` return some `None` values.

In [17]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 6.3MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [19]:
import geocoder # import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

not_found = 0
print('Updating postcode ',end =" ")

# Coordinates are collected in lists and written to the frame once after the
# loop, rather than growing the two columns cell by cell.
latitudes = []
longitudes = []

for postcode in toronto_clean['Postcode']:
    print('{}...'.format(postcode),end =" ")
    address='{}, Toronto, Ontario'.format(postcode)
    g = geocoder.arcgis(address)
    # NOTE(review): geocoder appears to return a result object even when the
    # lookup fails, so the coordinates themselves are checked (None or empty
    # means "not found") instead of comparing the object to None.
    coords = g.latlng if g is not None else None
    if coords:
        latitudes.append(coords[0])
        longitudes.append(coords[1])
    else:
        # np.nan is the spelling that survives NumPy 2.0 (np.NaN was removed)
        latitudes.append(np.nan)
        longitudes.append(np.nan)
        not_found += 1

toronto_clean['Latitude'] = latitudes
toronto_clean['Longitude'] = longitudes

print('\n Updated all, {} coordinates not found!'.format(not_found))            
    

Updating postcode  M1B... M1C... M1E... M1G... M1H... M1J... M1K... M1L... M1M... M1N... M1P... M1R... M1S... M1T... M1V... M1W... M1X... M2H... M2J... M2K... M2L... M2M... M2N... M2P... M2R... M3A... M3B... M3C... M3H... M3J... M3K... M3L... M3M... M3N... M4A... M4B... M4C... M4E... M4G... M4H... M4J... M4K... M4L... M4M... M4N... M4P... M4R... M4S... M4T... M4V... M4W... M4X... M4Y... M5A... M5B... M5C... M5E... M5G... M5H... M5J... M5K... M5L... M5M... M5N... M5P... M5R... M5S... M5T... M5V... M5W... M5X... M6A... M6B... M6C... M6E... M6G... M6H... M6J... M6K... M6L... M6M... M6N... M6P... M6R... M6S... M7A... M7R... M7Y... M8V... M8W... M8X... M8Y... M8Z... M9A... M9B... M9C... M9L... M9M... M9N... M9P... M9R... M9V... M9W... 
 Updated all, 0 coordinates not found!


In [20]:
# Double-check NaN values
mask = toronto_clean['Latitude'].isna()
toronto_clean[mask]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude


In [21]:
toronto_clean.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
5,M1J,Scarborough,Scarborough Village,43.743125,-79.23175
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.726276,-79.263625
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.713054,-79.285055
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.724235,-79.227925
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.69677,-79.259967


## Create a map of Toronto with neighbourhoods

In [22]:
# Use geopy library to get the latitude and longitude values of Toronto.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="my_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds nothing; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise RuntimeError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [23]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 11.0MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [25]:
import folium 
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
neighbourhoods = toronto_clean

# Appearance shared by every marker drawn on this map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row, with a "<neighbourhoods>, <borough>" popup.
marker_fields = zip(
    neighbourhoods['Latitude'],
    neighbourhoods['Longitude'],
    neighbourhoods['Borough'],
    neighbourhoods['Neighbourhood'],
)
for lat, lng, borough, neighbourhood in marker_fields:
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_toronto)

map_toronto

### Utilizing the Foursquare API to explore the neighborhoods and segment them.


In [158]:
# @hidden_cell
# Define Foursquare Credentials and Version
import os

# The credentials are read from the environment so they never end up in the
# notebook source or its saved outputs. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
try:
    CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
    CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
except KeyError as missing:
    raise RuntimeError(
        'Environment variable {} is not set; it is required for the Foursquare API'.format(missing)
    ) from None
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: xxxxxx')
print('CLIENT_SECRET:xxxxxx' )

Your credentails:
CLIENT_ID: xxxxxx
CLIENT_SECRET:xxxxxx


### use the function from the course to get the nearby venues in all the neighborhood
Use postcode instead of neighbourhood's name

In [159]:
import requests
def getNearbyVenues(postcodes, latitudes, longitudes, radius=500, limit=100):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    postcodes, latitudes, longitudes : iterables
        Parallel sequences; they are zipped, so one request is made per
        (postcode, lat, lng) triple.
    radius : int, default 500
        Sent as the ``radius`` query parameter.
    limit : int, default 100
        Sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postcode',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue
        is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET and VERSION, and prints
    each postcode as a progress indicator.
    """
    columns = ['Postcode',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for postcode, lat, lng in zip(postcodes, latitudes, longitudes):
        print(postcode, end='...')

        # let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,  # avoid hanging the whole run on one stalled request
        )
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                postcode,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category would otherwise raise IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact
    # even when `rows` is empty (assigning .columns afterwards would raise).
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

### Run the above function on each neighborhood and create a new dataframe.

In [49]:
Toronto_venues = getNearbyVenues(postcodes=toronto_clean['Postcode'],
                                   latitudes=toronto_clean['Latitude'],
                                   longitudes=toronto_clean['Longitude'],
                                  limit=100)

M1B...M1C...M1E...M1G...M1H...M1J...M1K...M1L...M1M...M1N...M1P...M1R...M1S...M1T...M1V...M1W...M1X...M2H...M2J...M2K...M2L...M2M...M2N...M2P...M2R...M3A...M3B...M3C...M3H...M3J...M3K...M3L...M3M...M3N...M4A...M4B...M4C...M4E...M4G...M4H...M4J...M4K...M4L...M4M...M4N...M4P...M4R...M4S...M4T...M4V...M4W...M4X...M4Y...M5A...M5B...M5C...M5E...M5G...M5H...M5J...M5K...M5L...M5M...M5N...M5P...M5R...M5S...M5T...M5V...M5W...M5X...M6A...M6B...M6C...M6E...M6G...M6H...M6J...M6K...M6L...M6M...M6N...M6P...M6R...M6S...M7A...M7R...M7Y...M8V...M8W...M8X...M8Y...M8Z...M9A...M9B...M9C...M9L...M9M...M9N...M9P...M9R...M9V...M9W...

Check how many venues were returned for each neighborhood

In [160]:
Toronto_venues[['Postcode','Venue']].groupby('Postcode').count().head(10)

Unnamed: 0_level_0,Venue
Postcode,Unnamed: 1_level_1
M1C,2
M1E,3
M1G,4
M1H,2
M1J,4
M1K,5
M1L,10
M1M,7
M1N,6
M1P,4


In [161]:
Toronto_venues.head()

Unnamed: 0,Postcode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1C,43.785665,-79.158725,Scarborough Historical Society,43.788755,-79.162438,History Museum
1,M1C,43.785665,-79.158725,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1E,43.765815,-79.175193,Homestead Roofing Repair,43.76514,-79.178663,Construction & Landscaping
3,M1E,43.765815,-79.175193,Heron Park Community Centre,43.768867,-79.176958,Gym / Fitness Center
4,M1E,43.765815,-79.175193,Heron Park,43.769327,-79.177201,Park


### Some neighbourhoods have only a small number of venues; let's drop them.

In [163]:
# Postcodes with fewer than n venues are considered too sparse to analyse.
n = 20
venue_counts = Toronto_venues.groupby('Postcode')[['Venue']].count()
venues_mask = venue_counts['Venue'].lt(n)
low_venue = venue_counts.loc[venues_mask]

# Keep only the venue rows whose postcode is not in the sparse list.
sparse_postcodes = low_venue.index.to_list()
postcode_mask = ~Toronto_venues['Postcode'].isin(sparse_postcodes)
toronto_venues_new = Toronto_venues.loc[postcode_mask]

# Show how many postcodes remain and their per-column counts.
remaining_counts = toronto_venues_new.groupby('Postcode').count()
print(remaining_counts.shape)
remaining_counts.head(10)


(33, 6)


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M2J,51,51,51,51,51,51
M2M,22,22,22,22,22,22
M2N,29,29,29,29,29,29
M3N,22,22,22,22,22,22
M4C,20,20,20,20,20,20
M4G,34,34,34,34,34,34
M4M,54,54,54,54,54,54
M4S,27,27,27,27,27,27
M4X,42,42,42,42,42,42
M4Y,84,84,84,84,84,84


Check how many unique categories can be curated from all the returned venues

In [164]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 253 uniques categories.


## Analyze Each Neighborhood

In [165]:
# one hot encoding
# The dtype is pinned to uint8 (the historical default): pandas 2.0 switched
# get_dummies to bool, which changes what describe() reports below.
toronto_onehot = pd.get_dummies(toronto_venues_new[['Venue Category']], prefix="", prefix_sep="", dtype=np.uint8)

# add Postcode column back to dataframe
toronto_onehot['Postcode'] = toronto_venues_new['Postcode'] 

# move Postcode column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.describe()

(2037, 222)


Unnamed: 0,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,BBQ Joint,Bagel Shop,...,Train Station,Tram Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
count,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,...,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0,2037.0
mean,0.000491,0.017182,0.000982,0.006873,0.0054,0.006873,0.000491,0.000491,0.002946,0.001473,...,0.000982,0.000491,0.009818,0.001964,0.006873,0.0054,0.000491,0.000982,0.002455,0.001473
std,0.022157,0.129982,0.031327,0.082638,0.073305,0.082638,0.022157,0.022157,0.054206,0.038358,...,0.031327,0.022157,0.098624,0.044281,0.082638,0.073305,0.022157,0.031327,0.049495,0.038358
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Group rows by postcode, taking the mean of the frequency of occurrence of each category

In [166]:
toronto_grouped = toronto_onehot.groupby('Postcode').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postcode,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,BBQ Joint,...,Train Station,Tram Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M2J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0,0.058824,0.0
1,M2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M2N,0.0,0.034483,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M3N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0
4,M4C,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### function to sort the venues in descending order.

In [91]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest values in ``row``.

    The first entry of ``row`` is skipped, the remaining entries are sorted
    in descending order, and the labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    ranked_labels = ranked.index.values
    return ranked_labels[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [96]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postcode']
for ind in np.arange(num_top_venues):
    # The first three ranks take 'st'/'nd'/'rd', every later rank takes 'th'.
    # An explicit bounds check replaces the former bare `except:`, which
    # would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postcode'] = toronto_grouped['Postcode']

# fill each row with the category names ranked by frequency for that postcode
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2J,Clothing Store,Fast Food Restaurant,Coffee Shop,Women's Store,Japanese Restaurant,Juice Bar,Toy / Game Store,Restaurant,Electronics Store,Spa
1,M2M,Korean Restaurant,Middle Eastern Restaurant,Pizza Place,Café,Hookah Bar,Supermarket,Fried Chicken Joint,Japanese Restaurant,Dessert Shop,Park
2,M2N,Ramen Restaurant,Café,Coffee Shop,Shopping Mall,Fast Food Restaurant,Lounge,Steakhouse,Juice Bar,Middle Eastern Restaurant,Plaza
3,M3N,Fast Food Restaurant,Grocery Store,Pizza Place,Vietnamese Restaurant,Discount Store,Beer Store,Pharmacy,Coffee Shop,Caribbean Restaurant,Sandwich Place
4,M4C,Bus Line,Fast Food Restaurant,Gas Station,Pet Store,Pharmacy,Pizza Place,Doctor's Office,Pub,Coffee Shop,Middle Eastern Restaurant


## Cluster the Neighborhoods

In [99]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

# Only the category-frequency columns are clustered. The keyword form is used
# because pandas 2.0 no longer accepts the axis as a positional argument.
toronto_grouped_clustering = toronto_grouped.drop(columns='Postcode')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int32)

In [112]:
# add clustering labels
labels_df = pd.Series(kmeans.labels_, name='Cluster Labels')
# DataFrame.update only overwrites columns that already exist, so on a fresh
# kernel it would never create 'Cluster Labels'. Assign the column explicitly:
# overwrite it when present, otherwise insert it as the first column.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = labels_df.values
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', labels_df.values)

# merge toronto_clean with the labelled venue table on Postcode, which adds
# borough, neighbourhood and latitude/longitude to each clustered postcode
toronto_merged = toronto_clean.merge(neighborhoods_venues_sorted, on='Postcode')

toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2J,North York,"Fairview,Henry Farm,Oriole",43.78097,-79.347813,2,Clothing Store,Fast Food Restaurant,Coffee Shop,Women's Store,Japanese Restaurant,Juice Bar,Toy / Game Store,Restaurant,Electronics Store,Spa
1,M2M,North York,"Newtonbrook,Willowdale",43.791475,-79.413605,0,Korean Restaurant,Middle Eastern Restaurant,Pizza Place,Café,Hookah Bar,Supermarket,Fried Chicken Joint,Japanese Restaurant,Dessert Shop,Park
2,M2N,North York,Willowdale South,43.768165,-79.40742,0,Ramen Restaurant,Café,Coffee Shop,Shopping Mall,Fast Food Restaurant,Lounge,Steakhouse,Juice Bar,Middle Eastern Restaurant,Plaza
3,M3N,North York,Downsview Northwest,43.755371,-79.51959,0,Fast Food Restaurant,Grocery Store,Pizza Place,Vietnamese Restaurant,Discount Store,Beer Store,Pharmacy,Coffee Shop,Caribbean Restaurant,Sandwich Place
4,M4C,East York,Woodbine Heights,43.68964,-79.306874,1,Bus Line,Fast Food Restaurant,Gas Station,Pet Store,Pharmacy,Pizza Place,Doctor's Office,Pub,Coffee Shop,Middle Eastern Restaurant


#### Visualize the resulting clusters on the map:

In [115]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced colormap sample per
# cluster (the former x/ys arrays were only used for their length, kclusters)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so `cluster - 1` sends label 0 to the last color via
    # negative indexing; each cluster still gets its own color. int() guards
    # against labels that arrive as floats.
    marker_color = rainbow[int(cluster) - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Red seems to be the most prevalent. Let's check the counts:

In [117]:
toronto_merged.groupby('Cluster Labels').count()

Unnamed: 0_level_0,Postcode,Borough,Neighbourhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


Indeed, most neighbourhoods are labeled 0. Let's change the number of clusters: 

In [140]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 8

# Only the category-frequency columns are clustered. The keyword form is used
# because pandas 2.0 no longer accepts the axis as a positional argument.
toronto_grouped_clustering = toronto_grouped.drop(columns='Postcode')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels
labels_df = pd.Series(kmeans.labels_, name='Cluster Labels')
# DataFrame.update only overwrites columns that already exist, so the label
# column is assigned explicitly: overwrite it when present, otherwise insert
# it as the first column.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = labels_df.values
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', labels_df.values)

# merge toronto_clean with the labelled venue table on Postcode, which adds
# borough, neighbourhood and latitude/longitude to each clustered postcode
toronto_merged = toronto_clean.merge(neighborhoods_venues_sorted, on='Postcode')

toronto_merged.groupby('Cluster Labels').count()

Unnamed: 0_level_0,Postcode,Borough,Neighbourhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
5,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
7,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3


### Looks better...let's put it on the map (using different colors)

In [147]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters_1 = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters: one evenly spaced colormap sample per
# cluster (the former x/ys arrays were only used for their length, kclusters)
colors_array = cm.brg(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so `cluster - 1` sends label 0 to the last color via
    # negative indexing; each cluster still gets its own color. int() guards
    # against labels that arrive as floats.
    marker_color = rainbow[int(cluster) - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters_1)
       
map_clusters_1

### Examine clusters

In [155]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Restaurant,Sushi Restaurant,Gay Bar,Café,Men's Store,Dance Studio,Bubble Tea Shop,Gastropub
10,Downtown Toronto,0,Coffee Shop,Bakery,Boat or Ferry,Café,Theater,Park,Breakfast Spot,Gastropub,Distribution Center,Mexican Restaurant
12,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Seafood Restaurant,Beer Bar,Breakfast Spot,Clothing Store,Cosmetics Shop
13,Downtown Toronto,0,Coffee Shop,Restaurant,Cocktail Bar,Bakery,Seafood Restaurant,Hotel,Café,Farmers Market,Cheese Shop,Beer Bar
15,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Gym,Steakhouse,Breakfast Spot,Asian Restaurant,Gastropub
16,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,American Restaurant,Japanese Restaurant,Gastropub,Italian Restaurant,Seafood Restaurant
17,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Gym,American Restaurant,Seafood Restaurant,Beer Bar,Italian Restaurant
20,Downtown Toronto,0,Café,Coffee Shop,Bakery,Restaurant,Bookstore,Italian Restaurant,Bar,Japanese Restaurant,Gym,Cheese Shop
23,Downtown Toronto,0,Coffee Shop,Restaurant,Hotel,Bar,Seafood Restaurant,Pizza Place,Café,Steakhouse,Sushi Restaurant,Pub
24,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,American Restaurant,Gastropub,Gym,Seafood Restaurant,Deli / Bodega,Asian Restaurant


In [156]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 6, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,East Toronto,6,Diner,Brewery,Italian Restaurant,Sushi Restaurant,American Restaurant,Sandwich Place,Gastropub,Café,Arts & Crafts Store,Bar
7,Central Toronto,6,Dessert Shop,Italian Restaurant,Sandwich Place,Café,Coffee Shop,Pizza Place,Thai Restaurant,Salon / Barbershop,Farmers Market,Fast Food Restaurant
8,Downtown Toronto,6,Coffee Shop,Restaurant,Park,Pizza Place,Café,Italian Restaurant,Bakery,Pharmacy,Playground,Market
18,North York,6,Coffee Shop,Sandwich Place,Italian Restaurant,Comfort Food Restaurant,Thai Restaurant,Pharmacy,Pub,Restaurant,Café,Butcher
19,Central Toronto,6,Sandwich Place,Café,French Restaurant,Pub,Coffee Shop,Middle Eastern Restaurant,Modern European Restaurant,Burger Joint,Italian Restaurant,Pizza Place
21,Downtown Toronto,6,Bar,Vietnamese Restaurant,Café,Coffee Shop,Bakery,Dumpling Restaurant,Chinese Restaurant,Mexican Restaurant,Bubble Tea Shop,Burger Joint
22,Downtown Toronto,6,Coffee Shop,Italian Restaurant,Restaurant,Café,Bar,Electronics Store,Gym / Fitness Center,Sandwich Place,Pizza Place,Bakery
