# Part 1: Getting data from Wikipedia, cleaning and formatting

First, we import Pandas and load the tables from the Wikipedia page. The table we want is the first one on the page.

In [108]:
import pandas as pd
# Parse every HTML table found on the Wikipedia page into a list of DataFrames.
tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
# The postal code table is the first table on the page.
df = pd.DataFrame(tables[0])

Next we 1) rename the columns, 2) drop the rows where Borough is "Not assigned", and 3) on rows where Neighborhood is "Not assigned", change it to the same value as Borough.

In [109]:
# Give the three columns the names used throughout the rest of the notebook.
df.columns = ["Postal code", "Borough", "Neighborhood"]
# Remove the postal codes that have no borough assigned.
df.drop(df[df["Borough"] == "Not assigned"].index, inplace = True)
# Where only the neighborhood is unassigned, fall back to the borough name.
# One .loc call selects rows and column together; the chained form
# (df[col].loc[mask] = ...) may write to a temporary copy and leave df untouched.
# The right-hand Series is aligned on the index, so each row gets its own borough.
df.loc[df["Neighborhood"] == "Not assigned", "Neighborhood"] = df["Borough"]

Now we group the dataframe so that we have only one row per postal code and neighborhoods under that postal code are listed as a comma-separated list in the Neighborhood column.

In [110]:
# Collapse to one row per (postal code, borough) pair; the neighborhood names
# of each group are merged into a single comma-separated string.
nbrs = (
    df.groupby(["Postal code", "Borough"])["Neighborhood"]
    .apply(", ".join)
    .reset_index()
)

Let's see what it looks like.

In [111]:
# Preview the first rows of the grouped dataframe.
nbrs.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Finally, we check the shape of the resulting dataframe.

In [112]:
# (number of rows, number of columns) of the grouped dataframe.
nbrs.shape

(103, 3)

# Part 2: Adding location data

First we install and import the Geocoder library and define a function to fetch coordinates for a given postal code.

In [113]:
!conda install -c conda-forge geocoder --yes
import geocoder

def lat_long(postal_code, max_attempts=10, retry_delay=1.0):
    """Return the (latitude, longitude) of a Toronto postal code.

    The ArcGIS geocoder is queried with "<postal_code>, Toronto, Ontario".
    A lookup whose ``latlng`` is ``None`` is retried, but only a bounded
    number of times and with a pause in between, so a postal code that can
    never be resolved (or a network outage) no longer spins forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up.
    max_attempts : int, optional
        Maximum number of geocoder requests before giving up.
    retry_delay : float, optional
        Seconds to wait between two consecutive attempts.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder's ``latlng``.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` requests.
    """
    import time  # local import so the function does not rely on another cell

    query = '{}, Toronto, Ontario'.format(postal_code)
    for attempt in range(max_attempts):
        # (Using arcgis instead of Google, because Google doesn't seem to work.)
        coords = geocoder.arcgis(query).latlng
        if coords is not None:
            return coords[0], coords[1]
        # Pause before retrying, except after the final attempt.
        if attempt + 1 < max_attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(postal_code, max_attempts)
    )

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



Next, we go through the rows of the dataframe, get the latitude and longitude for each postal code, and add them to two separate lists. Finally we insert the lists into the dataframe as new columns Latitude and Longitude.

In [115]:
# Geocode every postal code, collecting latitudes and longitudes in two
# parallel lists that are attached to the dataframe once the loop is done.
lat_list, long_list = [], []
for counter, pc in enumerate(nbrs["Postal code"], start=1):
    # '\r' rewinds the cursor so the progress counter overwrites itself.
    print("{} postal codes".format(counter), end = '\r')
    lat, long = lat_long(pc)
    lat_list.append(lat)
    long_list.append(long)
print("\nDone.")
nbrs["Latitude"] = lat_list
nbrs["Longitude"] = long_list

103 postal codes
Done.


Let's check that it looks right:

In [116]:
# Preview the first rows, now including the Latitude and Longitude columns.
nbrs.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1752
3,M1G,Scarborough,Woburn,43.7682,-79.21761
4,M1H,Scarborough,Cedarbrae,43.76969,-79.23944


# Part 3: Exploring & clustering

First we install Folium, then we import it plus a bunch of other stuff we'll need.

In [117]:
!conda install -c conda-forge folium=0.5.0
import folium
import requests
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



Next we get the coordinates of the city from arcgis and initialize a Folium map with said coordinates. Then we go through our dataframe and add the postal code areas as markers on the map. Finally, we draw the map.

In [118]:
# Geocode the city itself to get the coordinates the map is centred on.
g = geocoder.arcgis('Toronto, Ontario')
lat, long = g.latlng

map_toronto = folium.Map(location=[lat, long], zoom_start = 11)

# One green circle marker per postal code; the popup shows "<code> (<borough>)".
for lat, long, postal_code, borough in zip(nbrs['Latitude'], nbrs['Longitude'], nbrs['Postal code'], nbrs['Borough']):
    label = folium.Popup('{} ({})'.format(postal_code, borough), parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=4,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        # Was misspelled "fill_opacit", so the intended 0.5 opacity was not applied.
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)

# Last expression of the cell: renders the map.
map_toronto

Credentials for using the Foursquare API:

In [119]:
import os

# Secrets must not be stored in the notebook: the Foursquare credentials are
# read from the environment instead. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn right away rather than failing later inside the venue requests.
    print("Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set.")

We'll borrow the getNearbyVenues function from the Manhattan example and use it to get nearby venues for all Toronto neighborhoods.

In [120]:
def getNearbyVenues(codes, latitudes, longitudes, radius=500, limit=1000):
    """Collect the venues Foursquare reports around each postal code.

    One "explore" request is sent per (code, latitude, longitude) triple and
    a dot is printed per request as a progress indicator.

    Parameters
    ----------
    codes, latitudes, longitudes : iterables of equal length
        Postal codes and the coordinates to search around.
    radius : int, optional
        Value of the ``radius`` query parameter of the request.
    limit : int, optional
        Value of the ``limit`` query parameter of the request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Postal code', 'Latitude',
        'Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame has these columns even when no venue
        was found. 'Venue Category' is None for a venue whose category
        list is empty.
    """
    column_names = ['Postal code',
                    'Latitude',
                    'Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for code, lat, lng in zip(codes, latitudes, longitudes):
        print(".", end="") # Just to follow progress...

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                code,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # An empty category list would make categories[0] raise IndexError.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact for
    # an empty result; assigning seven names to a column-less frame raises.
    return pd.DataFrame(venue_rows, columns=column_names)

# Fetch the venues around every postal code; getNearbyVenues prints one dot
# per request, which extends the "Getting venues" line.
print("Getting venues", end="")
toronto_venues = getNearbyVenues(
    codes=nbrs['Postal code'],
    latitudes=nbrs['Latitude'],
    longitudes=nbrs['Longitude'],
)
print(" Done.\n")
# One row per venue, so the row count is the number of venues found.
print("Found {} venues.".format(len(toronto_venues)))

Getting venues....................................................................................................... Done.

Found 2417 venues.


Next we process the data to get a new dataframe in which each row contains, for a given postal code, the mean frequency of occurrence of each venue category. This is straight from the New York example.

In [127]:
# One-hot encode the venue categories: one indicator column per category.
onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the postal code each venue belongs to.
onehot['Postal code'] = toronto_venues['Postal code']

# Rotate the column order so that the last column comes first.
fixed_columns = onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
onehot = onehot[fixed_columns]

# The mean of the indicator columns per postal code is the relative frequency
# of each category among that postal code's venues.
toronto_grouped = onehot.groupby('Postal code').mean().reset_index()
toronto_grouped

Unnamed: 0,Postal code,Accessories Store,Adult Boutique,Airport,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,...,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,M1B,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,1.0
1,M1C,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
2,M1E,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
3,M1G,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
4,M1H,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
5,M1J,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
6,M1K,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
7,M1L,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
8,M1M,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
9,M1N,0.000000,0.0,0.00,0.000000,0.0,0.0,0.000000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0


Let's list the top 10 venues for each postal code.

In [128]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    by value in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

# Column headers: the postal code followed by one "Most common N" column per rank.
columns = ['Postal code'] + [
    'Most common {}'.format(rank) for rank in range(1, num_top_venues + 1)
]

# Start from an empty frame with those columns and copy the postal codes in.
postalcode_venues_sorted = pd.DataFrame(columns=columns)
postalcode_venues_sorted['Postal code'] = toronto_grouped['Postal code']

# Fill the rank columns of each row with the top categories of the
# corresponding row of toronto_grouped.
for ind in range(len(toronto_grouped)):
    top_categories = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    postalcode_venues_sorted.iloc[ind, 1:] = top_categories

postalcode_venues_sorted.head()

Unnamed: 0,Postal code,Most common 1,Most common 2,Most common 3,Most common 4,Most common 5,Most common 6,Most common 7,Most common 8,Most common 9,Most common 10
0,M1B,Zoo Exhibit,Donut Shop,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market
1,M1C,Bar,Zoo Exhibit,Farmers Market,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Fast Food Restaurant,Food Court
2,M1E,Construction & Landscaping,Bus Stop,Gym / Fitness Center,Park,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Farmers Market
3,M1G,Korean Restaurant,Park,Business Service,Coffee Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Fish Market,Dog Run
4,M1H,Playground,Trail,Zoo Exhibit,Falafel Restaurant,Donut Shop,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Farm


And now, clustering! Like we did in Manhattan.

In [129]:
kclusters = 5

# Feature matrix: the category frequencies only. The columns to drop are named
# by keyword because the positional axis argument (drop('Postal code', 1)) is
# rejected by recent pandas versions. 'Cluster labels' is dropped as well, if
# present, so that re-running this cell does not feed the previous labels back
# into the clustering.
clustering = toronto_grouped.drop(columns=['Postal code', 'Cluster labels'], errors='ignore')

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(clustering)

# Put the labels in the first column of both frames. On a re-run the column
# already exists and insert() would raise, so it is overwritten instead.
for frame in (postalcode_venues_sorted, toronto_grouped):
    if 'Cluster labels' in frame.columns:
        frame['Cluster labels'] = kmeans.labels_
    else:
        frame.insert(0, 'Cluster labels', kmeans.labels_)

final_data = nbrs[['Postal code', 'Borough', 'Latitude', 'Longitude']]

# Attach the cluster label and the top venues to every postal code. Rows with
# missing values after the join are removed before the label is cast to int32.
final_data = final_data.join(postalcode_venues_sorted.set_index('Postal code'), on='Postal code')
final_data = final_data.dropna()
final_data = final_data.astype({'Cluster labels' : 'int32'})
final_data

Unnamed: 0,Postal code,Borough,Latitude,Longitude,Cluster labels,Most common 1,Most common 2,Most common 3,Most common 4,Most common 5,Most common 6,Most common 7,Most common 8,Most common 9,Most common 10
0,M1B,Scarborough,43.81153,-79.19552,0,Zoo Exhibit,Donut Shop,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market
1,M1C,Scarborough,43.78564,-79.15871,0,Bar,Zoo Exhibit,Farmers Market,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Fast Food Restaurant,Food Court
2,M1E,Scarborough,43.76575,-79.17520,0,Construction & Landscaping,Bus Stop,Gym / Fitness Center,Park,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Farmers Market
3,M1G,Scarborough,43.76820,-79.21761,0,Korean Restaurant,Park,Business Service,Coffee Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Fish Market,Dog Run
4,M1H,Scarborough,43.76969,-79.23944,0,Playground,Trail,Zoo Exhibit,Falafel Restaurant,Donut Shop,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Farm
5,M1J,Scarborough,43.74309,-79.23526,0,Ice Cream Shop,Restaurant,Train Station,Zoo Exhibit,Farm,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
6,M1K,Scarborough,43.72861,-79.26367,0,Coffee Shop,Convenience Store,Hockey Arena,Hobby Shop,Chinese Restaurant,Bus Line,Bus Station,Light Rail Station,Department Store,Discount Store
7,M1L,Scarborough,43.71406,-79.28412,0,Bakery,Bus Line,Intersection,Bus Station,Soccer Field,Gym,Coffee Shop,Metro Station,Deli / Bodega,Dance Studio
8,M1M,Scarborough,43.72360,-79.23496,0,Ice Cream Shop,Pizza Place,Sandwich Place,Coffee Shop,Hardware Store,Discount Store,Pharmacy,Fast Food Restaurant,Field,Farmers Market
9,M1N,Scarborough,43.69539,-79.26194,0,General Entertainment,College Stadium,Café,Skating Rink,Gym,Farm,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant


Now we can draw a clustered map:

In [130]:
# Centre the map on the city coordinates held by the geocoder result `g`.
lat, long = g.latlng

# create map
map_clusters = folium.Map(location=[lat, long], zoom_start=10)

# set color scheme for the clusters: one hex colour per cluster, evenly
# spaced along matplotlib's rainbow colormap
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# add markers to the map
markers_colors = []
marker_rows = zip(
    final_data['Latitude'],
    final_data['Longitude'],
    final_data['Postal code'],
    final_data['Borough'],
    final_data['Cluster labels'],
)
for lat, lon, code, borough, cluster in marker_rows:
    # cluster - 1 is used as the index, so label 0 picks the last colour (index -1).
    marker_color = rainbow[cluster-1]
    label_text = '{} ({}) cluster {}'.format(code, borough, cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(label_text, parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examining the clusters
Ok, now we have 5 clusters, shown below. Three of them contain only one postal code; one of them contains five; and one cluster contains all the rest (93 postal codes). These numbers make the results of the clustering seem questionable. To be honest, I find it difficult to understand the results or label the clusters. Well, zoo exhibits seem common in all clusters. Further work would be needed to make more sense of the data.

### Cluster 1

In [131]:
# Postal codes labelled 0: show Borough (column 1) and every column from position 5 onward.
final_data[final_data['Cluster labels'] == 0].iloc[:, [1, *range(5, final_data.shape[1])]]

Unnamed: 0,Borough,Most common 1,Most common 2,Most common 3,Most common 4,Most common 5,Most common 6,Most common 7,Most common 8,Most common 9,Most common 10
0,Scarborough,Zoo Exhibit,Donut Shop,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market
1,Scarborough,Bar,Zoo Exhibit,Farmers Market,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Fast Food Restaurant,Food Court
2,Scarborough,Construction & Landscaping,Bus Stop,Gym / Fitness Center,Park,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Farmers Market
3,Scarborough,Korean Restaurant,Park,Business Service,Coffee Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Fish Market,Dog Run
4,Scarborough,Playground,Trail,Zoo Exhibit,Falafel Restaurant,Donut Shop,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Farm
5,Scarborough,Ice Cream Shop,Restaurant,Train Station,Zoo Exhibit,Farm,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
6,Scarborough,Coffee Shop,Convenience Store,Hockey Arena,Hobby Shop,Chinese Restaurant,Bus Line,Bus Station,Light Rail Station,Department Store,Discount Store
7,Scarborough,Bakery,Bus Line,Intersection,Bus Station,Soccer Field,Gym,Coffee Shop,Metro Station,Deli / Bodega,Dance Studio
8,Scarborough,Ice Cream Shop,Pizza Place,Sandwich Place,Coffee Shop,Hardware Store,Discount Store,Pharmacy,Fast Food Restaurant,Field,Farmers Market
9,Scarborough,General Entertainment,College Stadium,Café,Skating Rink,Gym,Farm,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant


### Cluster 2

In [132]:
# Postal codes labelled 1: show Borough (column 1) and every column from position 5 onward.
final_data[final_data['Cluster labels'] == 1].iloc[:, [1, *range(5, final_data.shape[1])]]

Unnamed: 0,Borough,Most common 1,Most common 2,Most common 3,Most common 4,Most common 5,Most common 6,Most common 7,Most common 8,Most common 9,Most common 10
40,East York,Park,Music Venue,Bus Stop,Farm,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Zoo Exhibit,Donut Shop
44,Central Toronto,Home Service,Park,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Farmers Market,Dog Run
63,Central Toronto,Home Service,Donut Shop,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market
64,Central Toronto,Park,Zoo Exhibit,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Farmers Market,Dog Run
82,West Toronto,Park,Convenience Store,Zoo Exhibit,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant


### Cluster 3

In [133]:
# Postal codes labelled 2: show Borough (column 1) and every column from position 5 onward.
final_data[final_data['Cluster labels'] == 2].iloc[:, [1, *range(5, final_data.shape[1])]]

Unnamed: 0,Borough,Most common 1,Most common 2,Most common 3,Most common 4,Most common 5,Most common 6,Most common 7,Most common 8,Most common 9,Most common 10
32,North York,Business Service,Zoo Exhibit,Donut Shop,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market


### Cluster 4

In [134]:
# Postal codes labelled 3: show Borough (column 1) and every column from position 5 onward.
final_data[final_data['Cluster labels'] == 3].iloc[:, [1, *range(5, final_data.shape[1])]]

Unnamed: 0,Borough,Most common 1,Most common 2,Most common 3,Most common 4,Most common 5,Most common 6,Most common 7,Most common 8,Most common 9,Most common 10
11,Scarborough,Auto Garage,Zoo Exhibit,Farmers Market,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Fast Food Restaurant,Donut Shop


### Cluster 5

In [135]:
# Postal codes labelled 4: show Borough (column 1) and every column from position 5 onward.
final_data[final_data['Cluster labels'] == 4].iloc[:, [1, *range(5, final_data.shape[1])]]

Unnamed: 0,Borough,Most common 1,Most common 2,Most common 3,Most common 4,Most common 5,Most common 6,Most common 7,Most common 8,Most common 9,Most common 10
20,North York,Music Venue,Zoo Exhibit,Donut Shop,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market
