Import required libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as BS
import requests

Get the URL text and read it into a Soup Object (Beautiful Soup)

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

source = requests.get(url).text
soup = BS(source, 'lxml')    

Find the Table from the Wikipedia page which has the Postal Codes, Borough and Neighbourhood information; Append the rows into a list

In [3]:
table = soup.find('table', class_ = 'wikitable sortable')
row_data = []

for row in table.find_all("tr"):
    row_data.append(row.text)

#print(parsed_table_data) 

Split (delimiter = '\n') each row list value so that each cell value is an individual element of a list.
Since there is a new line character at the start and end of each row, clean up each list element by deleting the empty charaters

In [4]:
cell_data = []

for i in row_data:
    words = i.split('\n')
    del words[0]
    del words[3]
    cell_data.append(words)
    
del cell_data[0]

Import the data from the list into a DataFrame

In [5]:
from pandas import DataFrame

df = DataFrame.from_records(cell_data, columns=['Postcode', 'Borough', 'Neighborhood'])

Clean up the Borough and Neighbourhood values where they are 'Not assigned'

In [6]:
# Drop records where Borough is 'Not assigned'
df_clean = df.drop(df[(df.Borough == 'Not assigned')].index)

#Where Neighbourhood is 'Not assigned', set to the Borough value
df_clean.loc[df_clean['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df_clean.Borough

#Reset Index
df_clean = df_clean.reset_index(drop=True)

Group the Neighbourhoods with the same PostCode, separated by comma

In [7]:
df_final = df_clean.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(lambda x: "%s" % ', '.join(x)).reset_index()
#df_final

Get the shape of the dataframe

In [8]:
df_final.shape

(103, 3)

Install GeoCoder

In [9]:
!conda install -c conda-forge geocoder 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    orderedset-2.0             |           py35_0         685 KB  conda-forge
    geocoder-1.38.1            |             py_0          52 KB  conda-forge
    ratelim-0.1.6              |           py35_0           5 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         742 KB

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0-py35_0   conda-forge
    ratelim:    0.1.6-py35_0 conda-forge


Downloading and Extracting Packages
orderedset-2.0       | 685 KB    | ##################################### | 100% 
geocoder-1.38.1      | 52 KB     | #######################

Since Geocoder isnt giving results, import the data from theGeospatial dataset provided in the course

In [10]:
!wget -q -O 'Toronto_Geospatial_data.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


Load data into Dataframe

In [11]:
toronto_data = pd.read_csv("Toronto_Geospatial_data.csv")
toronto_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
toronto_data.shape

(103, 3)

Concatenate Latitude and Longitude with original Toronto DF

In [13]:
toronto_data.columns=['PostalCode', 'Latitude', 'Longitude']

toronto_lat_long = pd.concat([df_final, toronto_data], axis=1)
del toronto_lat_long['PostalCode']
toronto_lat_long.head()


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [14]:
toronto_lat_long.loc[toronto_lat_long['Neighborhood'] == 'Woodbine Heights',  'Latitude']

36    43.695344
Name: Latitude, dtype: float64

In [15]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Libraries imported.


Get the latitude and longitude of Toronto

In [16]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [17]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Create a Map of Toronto

In [18]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)
map_toronto

Superimpose the Toronto neighbourhoods onto the map

In [19]:
for lat, lng, borough, neighborhood in zip(toronto_lat_long['Latitude'], toronto_lat_long['Longitude'], toronto_lat_long['Borough'], toronto_lat_long['Neighborhood']):
    label = '{} - {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Define Foursquare Credentials

In [20]:
CLIENT_ID = 'NTM1GC2C0GRDRTHGP3ZFAGSHRKWMTAJNJXY0I4G4G2XJDMKM' # your Foursquare ID
CLIENT_SECRET = 'MVHFSY3A2OQHVLVUNQOCDGPYHRPL0Y3WQAZZG3CFIDFKUAEP' # your Foursquare Secret
VERSION = '20190523' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: NTM1GC2C0GRDRTHGP3ZFAGSHRKWMTAJNJXY0I4G4G2XJDMKM
CLIENT_SECRET:MVHFSY3A2OQHVLVUNQOCDGPYHRPL0Y3WQAZZG3CFIDFKUAEP


Get the lat and lng values of the Clairlea, Golden Mile, Oakridge neighbourhoods in the dataframe

In [21]:
neighborhood_latitude = toronto_lat_long.loc[7, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_lat_long.loc[7, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_lat_long.loc[7, 'Neighborhood'] # neighborhood name

print('Neighbourhood : {} ; Lat: {} ; Lng: {}'.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))
#print('Latitude and longitude values of {} neighbourhood(s) are {}, {}.'.format(neighborhood_name, 
#                                                               neighborhood_latitude, 
#                                                               neighborhood_longitude))

Neighbourhood : Clairlea, Golden Mile, Oakridge ; Lat: 43.711111700000004 ; Lng: -79.2845772


Get the top 10 venues in the Clairlea, Golden Mile, Oakridge neighbourhoods, within a radius of 500 meters

In [22]:
limit = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    limit)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=NTM1GC2C0GRDRTHGP3ZFAGSHRKWMTAJNJXY0I4G4G2XJDMKM&client_secret=MVHFSY3A2OQHVLVUNQOCDGPYHRPL0Y3WQAZZG3CFIDFKUAEP&v=20190523&ll=43.711111700000004,-79.2845772&radius=500&limit=100'

In [23]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d0165d238f2160025bab9d6'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4bc8e605b6c49c740e5b8d91-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/fastfood_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d16e941735',
         'name': 'Fast Food Restaurant',
         'pluralName': 'Fast Food Restaurants',
         'primary': True,
         'shortName': 'Fast Food'}],
       'id': '4bc8e605b6c49c740e5b8d91',
       'location': {'address': '625 Pharmacy Avenue',
        'cc': 'CA',
        'city': 'Scarborough',
        'country': 'Canada',
        'distance': 499,
        'formattedAddress': ['625 Pharmacy Avenue',
         'Scarborough ON M1L 3H3',
         'Canada'],
        'labeledLatLngs': [{'label': 'display',

Function to extract the category of the venue

In [24]:

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Structure the venue details into a dataframe

In [25]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [26]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Dairy Queen,Fast Food Restaurant,43.710378,-79.290701
1,Warden Ave & St. Clair Ave E,Intersection,43.712057,-79.281005
2,TTC Bus #68 Warden,Bus Line,43.711778,-79.279714
3,Warden Subway Station,Metro Station,43.711229,-79.279602
4,TTC Bus 102 Markham Road,Bus Line,43.711381,-79.279588
5,Warden Station Bus Loop,Bus Station,43.711241,-79.279576
6,Bakery On The Go,Bakery,43.711271,-79.279506
7,Cafe on the go,Bakery,43.711151,-79.279469
8,Warden Woods Park,Park,43.710527,-79.278966
9,Clairlea Futbol Centre,Soccer Field,43.715234,-79.286506


Explore all neighbourhoods

In [27]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [28]:
toronto_venues = getNearbyVenues(names=toronto_lat_long['Neighborhood'],
                                   latitudes=toronto_lat_long['Latitude'],
                                   longitudes=toronto_lat_long['Longitude']
                                  )

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

In [29]:
print(toronto_venues.shape)
toronto_venues.head()

(2254, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
3,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place


What are the types of Venues found in Toronto

In [30]:
toronto_venues['Venue Category'].unique()

array(['Fast Food Restaurant', 'Print Shop', 'Construction & Landscaping',
       'Bar', 'Pizza Place', 'Electronics Store', 'Spa',
       'Mexican Restaurant', 'Tech Startup', 'Rental Car Location',
       'Medical Center', 'Intersection', 'Breakfast Spot', 'Coffee Shop',
       'Korean Restaurant', 'Convenience Store', 'Hakka Restaurant',
       'Caribbean Restaurant', 'Thai Restaurant', 'Athletics & Sports',
       'Bank', 'Bakery', 'Lounge', 'Fried Chicken Joint', 'Playground',
       'Department Store', 'Discount Store', 'Bus Line', 'Metro Station',
       'Bus Station', 'Park', 'Soccer Field', 'Motel',
       'American Restaurant', 'Café', 'General Entertainment',
       'Skating Rink', 'College Stadium', 'Indian Restaurant',
       'Chinese Restaurant', 'Vietnamese Restaurant', 'Pet Store',
       'Latin American Restaurant', 'Sandwich Place',
       'Middle Eastern Restaurant', 'Shopping Mall', 'Auto Garage',
       'Italian Restaurant', 'Noodle House', 'Pharmacy',
       'Asia

Number of venues in each Neighbourhood

In [31]:
toronto_venues[['Neighborhood', 'Venue']].groupby('Neighborhood').count()

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
"Adelaide, King, Richmond",100
Agincourt,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",3
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",11
"Alderwood, Long Branch",10
"Bathurst Manor, Downsview North, Wilson Heights",18
Bayview Village,4
"Bedford Park, Lawrence Manor East",25
Berczy Park,55
"Birch Cliff, Cliffside West",4


In [32]:
toronto_venues.loc[toronto_venues['Neighborhood'] == 'Adelaide, King, Richmond',  'Venue Category'].unique()

array(['Concert Hall', 'Steakhouse', 'Plaza',
       'Vegetarian / Vegan Restaurant', 'Hotel', 'Speakeasy',
       'Asian Restaurant', 'Greek Restaurant', 'Seafood Restaurant', 'Bar',
       'Coffee Shop', 'Opera House', 'Pizza Place', 'Food Court',
       'Gym / Fitness Center', 'Café', 'Neighborhood',
       'American Restaurant', 'Lounge', 'Noodle House', 'Sushi Restaurant',
       'Smoke Shop', 'Monument / Landmark', 'Gastropub', 'Burger Joint',
       'Breakfast Spot', 'Deli / Bodega', 'Brazilian Restaurant',
       'Colombian Restaurant', 'Burrito Place', 'Record Shop', 'Jazz Club',
       'Thai Restaurant', 'Japanese Restaurant', 'General Travel',
       'Salon / Barbershop', 'Mediterranean Restaurant', 'Restaurant',
       'Salad Place', 'Bakery', 'Gym', 'New American Restaurant',
       'Theater', 'Department Store', 'Bookstore', 'Juice Bar',
       'Ice Cream Shop', 'Shopping Mall', 'Clothing Store', 'Art Museum',
       'Indian Restaurant', 'Gluten-free Restaurant', 'Electro

Analyze each neighbourhood

In [33]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot

Unnamed: 0,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
toronto_onehot.columns[:]


Index(['Accessories Store', 'Adult Boutique', 'Afghan Restaurant', 'Airport',
       'Airport Food Court', 'Airport Gate', 'Airport Lounge',
       'Airport Service', 'Airport Terminal', 'American Restaurant',
       ...
       'Train Station', 'Vegetarian / Vegan Restaurant', 'Video Game Store',
       'Video Store', 'Vietnamese Restaurant', 'Warehouse Store', 'Wine Bar',
       'Wings Joint', 'Women's Store', 'Yoga Studio'],
      dtype='object', length=279)

In [35]:
# Move neighbors column to the first

neigh = toronto_onehot['Neighborhood']
toronto_onehot.drop(labels=['Neighborhood'], axis=1,inplace = True)
toronto_onehot.insert(0, 'Neighborhood', neigh)
toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Print each neighborhood along with the top 5 most common venues

In [36]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0                 Café  0.05
1          Coffee Shop  0.05
2                  Bar  0.04
3  American Restaurant  0.04
4           Steakhouse  0.04


----Agincourt----
                venue  freq
0      Sandwich Place  0.25
1      Breakfast Spot  0.25
2              Lounge  0.25
3  Chinese Restaurant  0.25
4   Mobile Phone Shop  0.00


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
               venue  freq
0         Playground  0.33
1   Asian Restaurant  0.33
2               Park  0.33
3  Mobile Phone Shop  0.00
4      Movie Theater  0.00


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
           venue  freq
0  Grocery Store  0.18
1    Pizza Place  0.09
2       Pharmacy  0.09
3     Beer Store  0.09
4   Liquor Store  0.09


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place   0.2
1        Pharmacy   0.1
2            

Sort venues and display top 10 venues as columns

In [42]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,American Restaurant,Bar,Steakhouse,Cosmetics Shop,Hotel,Burger Joint,Bakery,Thai Restaurant
1,Agincourt,Lounge,Sandwich Place,Breakfast Spot,Chinese Restaurant,Yoga Studio,Dumpling Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Asian Restaurant,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pharmacy,Fried Chicken Joint,Pizza Place,Coffee Shop,Sandwich Place,Liquor Store,Beer Store,Fast Food Restaurant,Video Store
4,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Skating Rink,Dance Studio,Pharmacy,Pool,Pub,Sandwich Place,Gym,Airport Terminal


Cluster Neighbourhoods

In [43]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 2, 4, 4, 4, 4, 4, 4, 4], dtype=int32)

Combine cluster and the top 10 venues for each neighbourhood

In [48]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ValueError: cannot insert Cluster Labels, already exists

In [49]:
toronto_merged = toronto_lat_long

# merge toronto_grouped with toronto_data to add latitude/longitude for each Neighbourhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,4.0,Fast Food Restaurant,Print Shop,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,4.0,Construction & Landscaping,Bar,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,4.0,Rental Car Location,Mexican Restaurant,Intersection,Spa,Pizza Place,Electronics Store,Tech Startup,Breakfast Spot,Medical Center,Diner
3,M1G,Scarborough,Woburn,43.770992,-79.216917,4.0,Coffee Shop,Korean Restaurant,Convenience Store,Yoga Studio,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,4.0,Hakka Restaurant,Lounge,Fried Chicken Joint,Athletics & Sports,Bakery,Caribbean Restaurant,Thai Restaurant,Bank,Drugstore,Doner Restaurant


Drop rows where Cluster Labels is NaN and convert the cluster label to int

In [87]:
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].fillna(-1)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(str)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].replace('-1', 'Invalid')
#toronto_merged[pd.isnull(toronto_merged['Cluster Labels'])]

ValueError: invalid literal for int() with base 10: 'Invalid'

In [90]:
toronto_cluster_clean = toronto_merged.drop(toronto_merged[(toronto_merged['Cluster Labels'] == 'Invalid')].index)

array(['4', '2', '1', '0', '3'], dtype=object)

In [97]:
#toronto_cluster_clean['Cluster Labels'].astype(int)
toronto_cluster_clean = toronto_cluster_clean.astype({'Cluster Labels': int})
toronto_cluster_clean['Cluster Labels'].unique()
type(toronto_cluster_clean['Cluster Labels'][0])

numpy.int64

Visualize the clusters

In [65]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [98]:
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_cluster_clean['Latitude'], toronto_cluster_clean['Longitude'], toronto_cluster_clean['Neighborhood'], toronto_cluster_clean['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters