First we scrape the Wikipedia page and create a dataframe with the table.

In [63]:
import requests
import pandas as pd
from pandas.io.json import json_normalize

#!conda install -c conda-forge folium=0.5.0 --yes
!pip install folium
import folium

#!conda install -c conda-forge geopy --yes
!pip install geopy
from geopy.geocoders import Nominatim

r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
dataframes=pd.read_html(r.text, header=0)
df = dataframes[0]



After that we remove all the rows with unassigned boroughs and reset the indexes because of the removed data.

In [64]:
df.drop(df[df['Borough'] == 'Not assigned'].index, inplace=True)
df = df.reset_index().drop('index', axis=1)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Now we assign the boroughs to the unassigned neighborhoods (at the time I scraped the data there were no rows where this happened but anyways I'm adding it in case the data changes in the future).

In [65]:
df[(df['Borough'] != 'Not assigned') & (df['Neighborhood'] == 'Not assigned')]['Neighborhood'] = df[(df['Borough'] != 'Not assigned') & (df['Neighborhood'] == 'Not assigned')]['Borough']

We check that there's no duplicated postal codes left

In [66]:
len(df['Postal Code'].unique()) == len(df)

True

We print the dataframe shape

In [30]:
df.shape

(103, 3)

Now we download the csv file with latitudes and longitudes

In [67]:
df_lt_lg = pd.read_csv('http://cocl.us/Geospatial_data')
df_lt_lg.sort_values(by=['Postal Code'])
df_lt_lg.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


We merge both dataframes by the 'Postal Code' column

In [68]:
df.sort_values(by=['Postal Code'])
df = pd.merge(df, df_lt_lg, on='Postal Code')

And we have our merged dataframe

In [33]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Now we select the boroughs that have "Toronto" in their name

In [69]:
df_toronto = df[df['Borough'].str.contains('Toronto', na=False)]
df_toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


Now let's create a map with the data points from the last dataframe.

In [132]:
address = 'Toronto, CA'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Canada are {}, {}.'.format(latitude, longitude))

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinates of Canada are 43.6534817, -79.3839347.


Now let's explore the neighborhoods. First let's setup Foursquare credentials

In [71]:
CLIENT_ID = 'HNUUTNPHSEDUFHAXHQUO5JFYGABTZ4CLH1XOCJO14UHLJIBW'
CLIENT_SECRET = 'WOHLD0PHKILZN1MCWVJTXL3TWBJH0AENXMSOKPXPRDK2ABKL'
VERSION = '20180605'

In [92]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            'HNUUTNPHSEDUFHAXHQUO5JFYGABTZ4CLH1XOCJO14UHLJIBW', 
            'WOHLD0PHKILZN1MCWVJTXL3TWBJH0AENXMSOKPXPRDK2ABKL', 
            VERSION, 
            neighborhood_latitude, 
            neighborhood_longitude, 
            radius, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        print()
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Here we will get the nearby venues for the city of Toronto

In [118]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'], latitudes=df_toronto['Latitude'], longitudes=df_toronto['Longitude'])
toronto_venues.head()

Regent Park, Harbourfront

Queen's Park, Ontario Provincial Government

Garden District, Ryerson

St. James Town

The Beaches

Berczy Park

Central Bay Street

Christie

Richmond, Adelaide, King

Dufferin, Dovercourt Village

Harbourfront East, Union Station, Toronto Islands

Little Portugal, Trinity

The Danforth West, Riverdale

Toronto Dominion Centre, Design Exchange

Brockton, Parkdale Village, Exhibition Place

India Bazaar, The Beaches West

Commerce Court, Victoria Hotel

Studio District

Lawrence Park

Roselawn

Davisville North

Forest Hill North & West

High Park, The Junction South

North Toronto West

The Annex, North Midtown, Yorkville

Parkdale, Roncesvalles

Davisville

University of Toronto, Harbord

Runnymede, Swansea

Moore Park, Summerhill East

Kensington Market, Chinatown, Grange Park

Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park

CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport

Roseda

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Brookbanks Park,43.751976,-79.33214,Park
1,"Regent Park, Harbourfront",43.65426,-79.360636,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Brookbanks Park,43.751976,-79.33214,Park
3,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Variety Store,43.751974,-79.333114,Food & Drink Shop
4,"Garden District, Ryerson",43.657162,-79.378937,Brookbanks Park,43.751976,-79.33214,Park


Let's perform one-hot encoding to transform the values of "Venue Category" into integers

In [119]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head(10)

Unnamed: 0,Neighborhood,Food & Drink Shop,Park
0,"Regent Park, Harbourfront",0,1
1,"Regent Park, Harbourfront",1,0
2,"Queen's Park, Ontario Provincial Government",0,1
3,"Queen's Park, Ontario Provincial Government",1,0
4,"Garden District, Ryerson",0,1
5,"Garden District, Ryerson",1,0
6,St. James Town,0,1
7,St. James Town,1,0
8,The Beaches,0,1
9,The Beaches,1,0


In [120]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Food & Drink Shop,Park
0,Berczy Park,0.5,0.5
1,"Brockton, Parkdale Village, Exhibition Place",0.5,0.5
2,Business reply mail Processing Centre,0.5,0.5
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.5,0.5
4,Central Bay Street,0.5,0.5
5,Christie,0.5,0.5
6,Church and Wellesley,0.5,0.5
7,"Commerce Court, Victoria Hotel",0.5,0.5
8,Davisville,0.5,0.5
9,Davisville North,0.5,0.5


Now let's begin with the clustering. First let's declare the model.

In [121]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 2

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Now let's merge it with the data of the neighborhoods.

In [123]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 2

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

neighborhoods_venues_sorted.head() # check the last columns!

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue
0,0,Berczy Park,Park,Food & Drink Shop
1,0,"Brockton, Parkdale Village, Exhibition Place",Park,Food & Drink Shop
2,0,Business reply mail Processing Centre,Park,Food & Drink Shop
3,0,"CN Tower, King and Spadina, Railway Lands, Har...",Park,Food & Drink Shop
4,0,Central Bay Street,Park,Food & Drink Shop


Finally let's plot it.

In [125]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

It seems the API only retrieved two venues for the whole Toronto area and the first one was always the most common one, that's why we get this not so useful map.