In [1]:
import pandas as pd
import numpy as np
!conda install -c conda-forge lxml --yes 
import lxml 


Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
# Scrape every HTML table found on the Wikipedia page listing Canadian postal
# codes that start with "M"; header=0 uses the first row of each table as the
# column names.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data = pd.read_html(wiki_url, header=0)


In [3]:
# Wrap the first scraped table in a DataFrame holding the necessary information
# (postal code, borough, neighborhood) and display it.
first_table = data[0]
areas = pd.DataFrame(first_table)
areas

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [4]:
# Keep only the rows whose Borough text does not contain "Not assigned".
is_unassigned = areas['Borough'].str.contains("Not assigned")
areas = areas.loc[~is_unassigned]


In [5]:
# Sanity check: (rows, columns) of `areas` at this point in the notebook.
areas.shape

(103, 3)

In [5]:
# Download the CSV that is later joined to `areas` on 'Postal Code' to supply
# the Latitude / Longitude columns.
geo_csv_url = "http://cocl.us/Geospatial_data"
londata = pd.read_csv(geo_csv_url)

In [6]:
# Attach the coordinates to each area by joining on the 'Postal Code' column
# present in both frames (pandas' default join type, inner, is used).
merged = areas.merge(londata, on='Postal Code')

In [8]:
# Display the joined frame to confirm Latitude / Longitude were attached.
merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [7]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


In [8]:
# Average the coordinates of all postal codes that share a neighborhood name.
# The numeric columns are selected explicitly: since pandas 2.0, mean() no
# longer silently drops text columns ('Postal Code', 'Borough') and raises
# TypeError instead. The result keeps the same three columns as before:
# Neighborhood, Latitude, Longitude.
toronto_grouped = (
    merged.groupby('Neighborhood')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)

In [9]:
####### CLUSTERING NEIGHBORHOODS BY THEIR COORDINATES
# set number of clusters
kclusters = 5

# Feature matrix = toronto_grouped without its non-feature columns.
# - drop(columns=...) replaces the positional axis form drop('Neighborhood', 1),
#   which pandas 2.0 no longer accepts.
# - 'Cluster Labels' is dropped too (ignored when absent) so that re-running
#   this cell after labels were written back does not feed the old labels
#   into the clustering.
toronto_grouped_clustering = toronto_grouped.drop(
    columns=['Neighborhood', 'Cluster Labels'], errors='ignore'
)

# run k-means clustering
# n_init is pinned to 10 (the historical default) so the result does not shift
# on scikit-learn versions where the default becomes 'auto'.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([2, 1, 3, 3, 3, 4, 0, 4, 0, 4], dtype=int32)

In [10]:
#FINDING GEOLOCATION OF TORONTO IN PREPARATION FOR A MAP
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop here with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [11]:
# Put the k-means label of each row in the first column of the feature frame.
# Any existing 'Cluster Labels' column is dropped first so that re-running this
# cell replaces the labels instead of raising because the column already exists.
toronto_grouped_clustering = toronto_grouped_clustering.drop(columns='Cluster Labels', errors='ignore')
toronto_grouped_clustering.insert(0, 'Cluster Labels', kmeans.labels_)

In [20]:
## COPYING THE CLUSTER LABELS ONTO THE NEIGHBORHOOD DATAFRAME
# Series assignment aligns on the row index; toronto_grouped_clustering was
# derived from toronto_grouped, so the two frames share it.
cluster_labels = toronto_grouped_clustering['Cluster Labels']
toronto_grouped['Cluster Labels'] = cluster_labels


In [21]:
# Display the per-neighborhood frame, now including the 'Cluster Labels' column.
toronto_grouped

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels
0,Agincourt,43.794200,-79.262029,2
1,"Alderwood, Long Branch",43.602414,-79.543484,1
2,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,3
3,Bayview Village,43.786947,-79.385975,3
4,"Bedford Park, Lawrence Manor East",43.733283,-79.419750,3
...,...,...,...,...
94,"Willowdale, Willowdale West",43.782736,-79.442259,3
95,Woburn,43.770992,-79.216917,2
96,Woodbine Heights,43.695344,-79.318389,0
97,York Mills West,43.752758,-79.400049,3


In [22]:
# Create a map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_grouped['Latitude'], toronto_grouped['Longitude'], toronto_grouped['Neighborhood'], toronto_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The
    # previous rainbow[cluster-1] only worked for cluster 0 because index -1
    # wraps around to the last colour.
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters