# Capstone Toronto project

### Scrape postal code, borough and neighbourhood information from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M and then analyse it using the Foursquare API

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
response.raise_for_status()
htmltoronto = response.text
soup = BeautifulSoup(htmltoronto, "lxml")
# Preview only the beginning of the document rather than dumping the whole page
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":890001695,"wgRevisionId":890001695,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

### Get the table portion of the wiki page

In [3]:
# Locate the postal-code table. Selecting by the "wikitable" class instead of
# taking whatever <table> comes first keeps this working if another table is
# ever placed ahead of it on the page.
table = soup.find("table", class_="wikitable")
if table is None:
    # Fail here with a clear message rather than later with an AttributeError
    raise ValueError("Could not find the postal code table (class 'wikitable') on the page")
print(table)

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

### Iterate through the tr/td tags, creating lists of postal codes, boroughs and neighbourhoods

In [4]:
# Build three parallel lists (postal code, borough, neighbourhoods) from the table rows.
# Dictionaries keyed by postal code are used so that rows sharing a postal code are
# merged even when they are not adjacent; insertion order keeps first-seen order.
borough_by_pc = {}
hoods_by_pc = {}
for tr in table.find_all('tr'):
    # strip() removes the trailing newline (and any other surrounding whitespace)
    cells = [td.text.strip() for td in tr.find_all('td')]

    # The header row contains only <th> cells, so it yields no <td>; skipping every
    # row with fewer than three cells also avoids reusing values from a previous row.
    if len(cells) < 3:
        continue
    pc, b, n = cells[:3]

    # if borough is not assigned skip the row
    if b == 'Not assigned':
        continue
    # if neighbourhood is not assigned make neighbourhood = borough
    if n == 'Not assigned':
        n = b

    if pc not in borough_by_pc:
        borough_by_pc[pc] = b
        hoods_by_pc[pc] = []
    hoods_by_pc[pc].append(n)

# One entry per postal code; neighbourhoods of the same code are comma separated
list_pc = list(borough_by_pc)
list_b = [borough_by_pc[pc] for pc in list_pc]
list_n = [', '.join(hoods_by_pc[pc]) for pc in list_pc]


### create dataframe from above list

In [5]:
# Assemble the three parallel lists into a dataframe, one column per list
d = dict(PostalCode=list_pc, Borough=list_b, Neighbourhood=list_n)
df_toronto = pd.DataFrame(data=d)

# Plain-text preview of the first 11 rows
print(df_toronto.head(n=11))

   PostalCode           Borough                     Neighbourhood
0         M3A        North York                         Parkwoods
1         M4A        North York                  Victoria Village
2         M5A  Downtown Toronto         Harbourfront, Regent Park
3         M6A        North York  Lawrence Heights, Lawrence Manor
4         M7A      Queen's Park                      Queen's Park
5         M9A         Etobicoke                  Islington Avenue
6         M1B       Scarborough                    Rouge, Malvern
7         M3B        North York                   Don Mills North
8         M4B         East York   Woodbine Gardens, Parkview Hill
9         M5B  Downtown Toronto          Ryerson, Garden District
10        M6B        North York                         Glencairn


In [6]:
# (rows, columns) of the scraped postal-code dataframe
df_toronto.shape

(103, 3)

### Get the lat and longitude using geocoder

In [7]:
import geocoder # import geocoder

MAX_GEOCODE_TRIES = 10  # attempts per postal code before giving up

pclist = df_toronto['PostalCode']
# NOTE: list_pc is rebound here; earlier it held the scraped postal codes that
# df_toronto was built from.
list_pc = []
list_lat = []
list_long = []
unsuccess = False
for pc in pclist:
    # initialize your variable to None
    lat_lng_coords = None

    # Try until coordinates come back or the attempts are used up. Leaving the loop
    # as soon as a result arrives means a success on the last attempt is no longer
    # mistaken for a failure.
    for i in range(1, MAX_GEOCODE_TRIES + 1):
        print(f'Getting latitude longitude for {pc}')
        g = geocoder.google(f'{pc}, Toronto, Ontario')
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break

    if lat_lng_coords is None:
        # not getting return value: stop processing the remaining postal codes
        print(f"Not getting return value even after {MAX_GEOCODE_TRIES} tries...stopping")
        unsuccess = True
        break

    # update the result lists
    list_pc.append(pc)
    list_lat.append(lat_lng_coords[0])
    list_long.append(lat_lng_coords[1])

Getting latitude longitude for M3A
Getting latitude longitude for M3A
Getting latitude longitude for M3A
Getting latitude longitude for M3A
Getting latitude longitude for M3A
Getting latitude longitude for M3A
Getting latitude longitude for M3A
Getting latitude longitude for M3A
Getting latitude longitude for M3A
Getting latitude longitude for M3A
Not getting return value even after 10 tries...stopping


### Geocoder did not return latitude/longitude, so use the provided CSV instead

In [8]:
# Load the latitude/longitude table from the CSV file
geo_csv_path = "Geospatial_Coordinates.csv"
df_geo = pd.read_csv(geo_csv_path)
df_geo.head(5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the two dataframes on PostalCode to make the required dataframe

In [9]:
# Join the scraped postal-code table with the coordinates table.
# The key is named explicitly: without `on`, merge() joins on every column name the
# two frames happen to share, so an unrelated shared column would silently change
# (usually empty) the result.
df_toronto_latlong = pd.merge(df_toronto, df_geo, how='inner', on='PostalCode')
df_toronto_latlong.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


### Exploring and clustering the neighbourhoods where the borough name contains "York"

In [167]:
# Boolean mask that is True where the borough name contains 'York'
list_toronto = df_toronto_latlong['Borough'].str.contains(pat='York')
# Keep only the matching rows
df_subtoronto = df_toronto_latlong.loc[list_toronto]
df_subtoronto.shape

(34, 5)

In [168]:
# Borough of every selected row, to check which boroughs the filter kept
df_subtoronto['Borough']

0     North York
1     North York
3     North York
7     North York
8      East York
10    North York
13    North York
14     East York
16          York
21          York
23     East York
27    North York
28    North York
29     East York
33    North York
34    North York
35     East York
39    North York
40    North York
45    North York
46    North York
49    North York
50    North York
52    North York
53    North York
55    North York
56          York
57    North York
59    North York
60    North York
63          York
64          York
66    North York
72    North York
Name: Borough, dtype: object

### Preparing Foursquare required parameters

In [169]:
# Foursquare credentials are read from the environment so that secrets are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Keys that were previously committed in this cell
# should be regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.')
VERSION = '20180605' # Foursquare API version

LIMIT = 100  # value sent as the `limit` parameter of each venue request

In [170]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

address = 'Toronto, Canada'

# Look up the coordinates that are used to centre the maps
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() gives None when nothing is found; stop with a clear message instead
# of an AttributeError on the next line
if location is None:
    raise RuntimeError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create a map of toronto showing the locations in df_subtoronto

In [171]:
# Base map centred on the Toronto coordinates
map_subtoronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One blue circle marker per row of df_subtoronto, with the neighbourhood as popup
marker_data = zip(df_subtoronto['Latitude'], df_subtoronto['Longitude'], df_subtoronto['Neighbourhood'])
for lat, lng, hood in marker_data:
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=folium.Popup(hood, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_subtoronto)

map_subtoronto

#### Let's create a function to get the venues for all the neighbourhoods in the Toronto area

In [172]:
def getNearbyVenues(postalcode, names, latitudes, longitudes, radius=300, timeout=30):
    """Fetch Foursquare venues around each location and return them as one dataframe.

    Parameters
    ----------
    postalcode, names, latitudes, longitudes : iterables
        Parallel sequences describing the locations to query; they are consumed
        together with zip().
    radius : number, default 300
        Value forwarded to the API's `radius` parameter.
    timeout : number, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns PostalCode, Neighbourhood,
        Postal Latitude, Postal Longitude, Venue, Venue Latitude, Venue Longitude
        and Venue Category. The frame is empty (but keeps these columns) when no
        venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['PostalCode',
               'Neighbourhood',
               'Postal Latitude',
               'Postal Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for pc, name, lat, lng in zip(postalcode, names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; surface HTTP errors instead of failing on a missing key
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may come without any category; record None instead of crashing
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                pc,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names here keeps the schema even when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

### Getting the toronto venues

In [173]:
# Query the venues for every row of df_subtoronto (each neighbourhood name is printed)
toronto_venues = getNearbyVenues(
    df_subtoronto['PostalCode'],
    df_subtoronto['Neighbourhood'],
    df_subtoronto['Latitude'],
    df_subtoronto['Longitude'],
)

Parkwoods
Victoria Village
Lawrence Heights, Lawrence Manor
Don Mills North
Woodbine Gardens, Parkview Hill
Glencairn
Flemingdon Park, Don Mills South
Woodbine Heights
Humewood-Cedarvale
Caledonia-Fairbanks
Leaside
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights
Thorncliffe Park
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Bayview Village
CFB Toronto, Downsview East
Silver Hills, York Mills
Downsview West
Downsview, North Park, Upwood Park
Humber Summit
Newtonbrook, Willowdale
Downsview Central
Bedford Park, Lawrence Manor East
Del Ray, Keelesdale, Mount Dennis, Silverthorn
Emery, Humberlea
Willowdale South
Downsview Northwest
The Junction North, Runnymede
Weston
York Mills West
Willowdale West


In [174]:
# (rows, columns) of the venue table, followed by a preview of its first rows
print(toronto_venues.shape)
toronto_venues.head()

(200, 8)


Unnamed: 0,PostalCode,Neighbourhood,Postal Latitude,Postal Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,M4A,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,M4A,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,M4A,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


### check how many venues are there

In [175]:
# Non-null entries per column for each postal code, i.e. how many venue rows it has
toronto_venues.groupby('PostalCode').count()

Unnamed: 0_level_0,Neighbourhood,Postal Latitude,Postal Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M2H,6,6,6,6,6,6,6
M2J,58,58,58,58,58,58,58
M2N,1,1,1,1,1,1,1
M2P,2,2,2,2,2,2,2
M2R,1,1,1,1,1,1,1
M3A,2,2,2,2,2,2,2
M3B,2,2,2,2,2,2,2
M3C,15,15,15,15,15,15,15
M3H,15,15,15,15,15,15,15
M3L,2,2,2,2,2,2,2


### lets see how many unique categories

In [176]:
# Number of distinct values in the 'Venue Category' column
n_categories = len(toronto_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(n_categories))

There are 90 uniques categories.


## Lets analyze each neighbourhood

In [177]:
# One indicator column per venue category (column names are the bare category names)
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the identifying columns first, followed by the category indicators
toronto_onehot = pd.concat(
    [toronto_venues[['PostalCode', 'Neighbourhood']], category_dummies],
    axis=1,
)

toronto_onehot.head()

Unnamed: 0,PostalCode,Neighbourhood,Accessories Store,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Warehouse Store,Wings Joint,Women's Store,Yoga Studio
0,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4A,Victoria Village,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,Victoria Village,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,Victoria Village,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [178]:
# (venue rows, identifier columns + category columns)
toronto_onehot.shape

(200, 92)

In [179]:
# Mean of the indicator columns per postal code = share of each venue category.
# The text column 'Neighbourhood' is dropped first: it cannot be averaged, and
# recent pandas versions raise a TypeError instead of silently skipping it.
category_freq = (
    toronto_onehot
    .drop(columns='Neighbourhood')
    .groupby('PostalCode')
    .mean()
    .reset_index()
)

# Attach the neighbourhood name of each postal code; only the two identifying
# columns are taken from df_subtoronto, so nothing has to be dropped afterwards.
toronto_grouped = df_subtoronto[['PostalCode', 'Neighbourhood']].merge(category_freq, on='PostalCode')
toronto_grouped

Unnamed: 0,PostalCode,Neighbourhood,Accessories Store,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Warehouse Store,Wings Joint,Women's Store,Yoga Studio
0,M3A,Parkwoods,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4A,Victoria Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M6A,"Lawrence Heights, Lawrence Manor",0.090909,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0
3,M3B,Don Mills North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4B,"Woodbine Gardens, Parkview Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M6B,Glencairn,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M3C,"Flemingdon Park, Don Mills South",0.0,0.0,0.0,0.133333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M4C,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M6C,Humewood-Cedarvale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M6E,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0


### Lets check the new size

In [180]:
# (postal codes that have venue data, identifier columns + category columns)
toronto_grouped.shape

(28, 92)

#### Let's print each neighborhood along with the top 5 most common venues

In [196]:
num_top_venues = 5

# For every postal code, print its most frequent venue categories
for pc, hood in zip(toronto_grouped['PostalCode'], toronto_grouped['Neighbourhood']):
    print("----"+hood+"----")

    # Turn the single matching row into a two-column (venue, freq) table
    is_current = toronto_grouped['PostalCode'] == pc
    freq_table = toronto_grouped[is_current].T.reset_index()
    freq_table.columns = ['venue','freq']

    # The first two entries are PostalCode and Neighbourhood, not categories
    freq_table = (
        freq_table.iloc[2:]
        .assign(freq=lambda t: t['freq'].astype(float))
        .round({'freq': 2})
    )

    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Parkwoods----
                   venue  freq
0                   Park   0.5
1   Fast Food Restaurant   0.5
2      Accessories Store   0.0
3  Portuguese Restaurant   0.0
4             Playground   0.0


----Victoria Village----
                   venue  freq
0           Intersection   0.2
1           Hockey Arena   0.2
2            Coffee Shop   0.2
3            Pizza Place   0.2
4  Portuguese Restaurant   0.2


----Lawrence Heights, Lawrence Manor----
                    venue  freq
0  Furniture / Home Store  0.27
1          Clothing Store  0.18
2       Accessories Store  0.09
3     Arts & Crafts Store  0.09
4           Women's Store  0.09


----Don Mills North----
          venue  freq
0          Pool   0.5
1  Tennis Court   0.5
2   Men's Store   0.0
3    Playground   0.0
4   Pizza Place   0.0


----Woodbine Gardens, Parkview Hill----
            venue  freq
0  Breakfast Spot  0.12
1    Intersection  0.12
2     Pizza Place  0.12
3        Pharmacy  0.12
4            Bank  0.12


--

### Let's put that into a *pandas* dataframe
#### First, let's write a function to sort the venues in descending order.

In [182]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first two entries of `row` are skipped (callers pass rows whose first two
    fields are PostalCode and Neighbourhood); the rest are ranked in descending order.
    """
    ranked = row.iloc[2:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

#### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [197]:
num_top_venues = 10


def _ordinal(n):
    """Return n followed by its English ordinal suffix, e.g. 1 -> '1st', 4 -> '4th'."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# create columns according to number of top venues
# (the suffix is computed explicitly instead of relying on a bare `except`)
columns = ['PostalCode', 'Neighbourhood']
for rank in range(1, num_top_venues + 1):
    columns.append('{} Most Common Venue'.format(_ordinal(rank)))

# create a new dataframe holding the identifiers of every postal code
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill the ranked venue columns row by row
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 2:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,Parkwoods,Park,Fast Food Restaurant,Yoga Studio,Electronics Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
1,M4A,Victoria Village,Hockey Arena,Intersection,Portuguese Restaurant,Coffee Shop,Pizza Place,Gastropub,Dim Sum Restaurant,Golf Course,Gift Shop,Convenience Store
2,M6A,"Lawrence Heights, Lawrence Manor",Furniture / Home Store,Clothing Store,Accessories Store,Women's Store,Coffee Shop,Event Space,Boutique,Arts & Crafts Store,Gastropub,Gift Shop
3,M3B,Don Mills North,Pool,Tennis Court,Event Space,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
4,M4B,"Woodbine Gardens, Parkview Hill",Pharmacy,Pet Store,Gastropub,Intersection,Bank,Café,Pizza Place,Breakfast Spot,Dog Run,Cosmetics Shop


## Cluster neighbourhoods

### Run k-means to cluster the neighborhood into 5 clusters.

In [217]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop(['PostalCode', 'Neighbourhood'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 2, 2, 2, 2, 2, 4, 2, 2])

### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [218]:
# add clustering labels
neighborhoods_venues_sorted.insert(2, 'Cluster Labels', kmeans.labels_)

In [219]:
# Add the cluster label and ranked venue columns to the borough/latitude/longitude
# rows, matching on postal code and neighbourhood
toronto_merged = df_subtoronto.merge(
    right=neighborhoods_venues_sorted,
    on=['PostalCode', 'Neighbourhood'],
)

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Park,Fast Food Restaurant,Yoga Studio,Electronics Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,2,Hockey Arena,Intersection,Portuguese Restaurant,Coffee Shop,Pizza Place,Gastropub,Dim Sum Restaurant,Golf Course,Gift Shop,Convenience Store
2,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,2,Furniture / Home Store,Clothing Store,Accessories Store,Women's Store,Coffee Shop,Event Space,Boutique,Arts & Crafts Store,Gastropub,Gift Shop
3,M3B,North York,Don Mills North,43.745906,-79.352188,2,Pool,Tennis Court,Event Space,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
4,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,2,Pharmacy,Pet Store,Gastropub,Intersection,Bank,Café,Pizza Place,Breakfast Spot,Dog Run,Cosmetics Shop


### Lets visualize the results

In [220]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examining the Clusters and Name them

### Cluster 1 - Park

In [221]:
# Rows with cluster label 0; show columns 0 and 2 plus everything from column 6 on
in_cluster = toronto_merged['Cluster Labels'] == 0
shown_columns = [0, 2] + list(range(6, toronto_merged.shape[1]))
toronto_merged[in_cluster].iloc[:, shown_columns]

Unnamed: 0,PostalCode,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,Parkwoods,Park,Fast Food Restaurant,Yoga Studio,Electronics Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
15,M4J,East Toronto,Park,Bakery,Furniture / Home Store,Yoga Studio,Electronics Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
25,M9N,Weston,Park,Yoga Studio,Construction & Landscaping,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Dog Run
26,M2P,York Mills West,Park,Bank,Yoga Studio,Event Space,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store


### Cluster 2 - Landscaping Company

In [222]:
# Rows with cluster label 1; show columns 0 and 2 plus everything from column 6 on
in_cluster = toronto_merged['Cluster Labels'] == 1
shown_columns = [0, 2] + list(range(6, toronto_merged.shape[1]))
toronto_merged[in_cluster].iloc[:, shown_columns]

Unnamed: 0,PostalCode,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,M9M,"Emery, Humberlea",Construction & Landscaping,Yoga Studio,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Dog Run,Electronics Store


### Cluster 3 - Happening Places

In [223]:
# Rows with cluster label 2; show columns 0 and 2 plus everything from column 6 on
in_cluster = toronto_merged['Cluster Labels'] == 2
shown_columns = [0, 2] + list(range(6, toronto_merged.shape[1]))
toronto_merged[in_cluster].iloc[:, shown_columns]

Unnamed: 0,PostalCode,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4A,Victoria Village,Hockey Arena,Intersection,Portuguese Restaurant,Coffee Shop,Pizza Place,Gastropub,Dim Sum Restaurant,Golf Course,Gift Shop,Convenience Store
2,M6A,"Lawrence Heights, Lawrence Manor",Furniture / Home Store,Clothing Store,Accessories Store,Women's Store,Coffee Shop,Event Space,Boutique,Arts & Crafts Store,Gastropub,Gift Shop
3,M3B,Don Mills North,Pool,Tennis Court,Event Space,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
4,M4B,"Woodbine Gardens, Parkview Hill",Pharmacy,Pet Store,Gastropub,Intersection,Bank,Café,Pizza Place,Breakfast Spot,Dog Run,Cosmetics Shop
5,M6B,Glencairn,Pizza Place,Asian Restaurant,Japanese Restaurant,Sushi Restaurant,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop
6,M3C,"Flemingdon Park, Don Mills South",Asian Restaurant,Coffee Shop,Chinese Restaurant,Sporting Goods Shop,Fast Food Restaurant,Discount Store,Restaurant,Bike Shop,Beer Store,Japanese Restaurant
8,M6C,Humewood-Cedarvale,Playground,Field,Dog Run,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
9,M6E,Caledonia-Fairbanks,Women's Store,Pharmacy,Park,Fast Food Restaurant,Market,Yoga Studio,Electronics Store,Deli / Bodega,Department Store,Dessert Shop
10,M4G,Leaside,Sporting Goods Shop,Sports Bar,Burger Joint,Sandwich Place,Electronics Store,Sushi Restaurant,Restaurant,Breakfast Spot,Bank,Mexican Restaurant
11,M2H,Hillcrest Village,Pool,Athletics & Sports,Dog Run,Mediterranean Restaurant,Fast Food Restaurant,Golf Course,Gift Shop,Grocery Store,Cosmetics Shop,Deli / Bodega


### Cluster 4 - Residential

In [224]:
# Rows with cluster label 3; show columns 0 and 2 plus everything from column 6 on
in_cluster = toronto_merged['Cluster Labels'] == 3
shown_columns = [0, 2] + list(range(6, toronto_merged.shape[1]))
toronto_merged[in_cluster].iloc[:, shown_columns]

Unnamed: 0,PostalCode,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,M2N,Willowdale South,Coffee Shop,Yoga Studio,Event Space,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Dog Run
27,M2R,Willowdale West,Coffee Shop,Yoga Studio,Event Space,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Dog Run


### Cluster 5 - Eateries

In [225]:
# Rows with cluster label 4; show columns 0 and 2 plus everything from column 6 on
in_cluster = toronto_merged['Cluster Labels'] == 4
shown_columns = [0, 2] + list(range(6, toronto_merged.shape[1]))
toronto_merged[in_cluster].iloc[:, shown_columns]

Unnamed: 0,PostalCode,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,M4C,Woodbine Heights,Beer Store,Yoga Studio,Fast Food Restaurant,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Dog Run
