Project: Segmenting and Clustering Neighborhoods in Toronto - Part 1

In [38]:
#import the necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
%matplotlib inline 


In [39]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
html = r.text

# Parse the page and take the first <table> element.
soup = BeautifulSoup(html, "lxml")
table = soup.find_all('table')[0]
# The first two <tr> rows are skipped, exactly as before.
# NOTE(review): presumably the header row plus one leading row - verify against the live page.
rows = table.find_all('tr')[2:]

# Column-wise accumulator for the scraped cells.
data = {
    'Postcode': [],
    'Borough': [],
    'Neighbourhood': []
}

# Collect the three cells of every data row.
for row in rows:
    cols = row.find_all('td')
    if len(cols) < 3:
        # Rows without three <td> cells (e.g. header-only rows) carry no record.
        continue
    data['Postcode'].append(cols[0].get_text().strip())
    data['Borough'].append(cols[1].get_text().strip())
    data['Neighbourhood'].append(cols[2].get_text().strip())

# Name the column order explicitly. The previous reordering expression assumed
# pandas had sorted the dict keys alphabetically; with insertion-ordered dicts it
# produced a duplicated 'Postcode' column and lost 'Neighbourhood'.
postalCodes = pd.DataFrame(data, columns=['Postcode', 'Borough', 'Neighbourhood'])

postalCodes = (
    postalCodes
    # Treat the "Not assigned" placeholder as missing data ...
    .replace("Not assigned", np.nan)
    # ... drop every row whose Borough is missing ...
    .dropna(subset=["Borough"])
    # ... and renumber the remaining rows.
    .reset_index(drop=True)
)

# Collapse rows sharing the same Postcode/Borough into one row whose Neighbourhood
# is a comma-separated list. A missing neighbourhood becomes the string "nan" here
# because of astype(str).
postalCodes = (
    postalCodes
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(lambda x: ", ".join(x.astype(str)))
    .reset_index()
)
postalCodes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [40]:
# Report how many distinct boroughs and how many rows the table holds.
# NOTE: the second figure is the row count of postalCodes (shape[0]), not a count
# of individual neighbourhood names inside the comma-separated cells.
distinct_boroughs = postalCodes['Borough'].unique()
row_total = postalCodes.shape[0]
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(len(distinct_boroughs), row_total))

The dataframe has 11 boroughs and 103 neighborhoods.


In [41]:
# Where a row has a borough but its neighbourhood is the string "nan", use the
# row's own borough as the neighbourhood.
# The previous loop called Series.replace on the whole column, so the first
# matching borough overwrote every "nan" at once and later rows received the
# wrong borough. The boolean mask below assigns row by row, and re-running the
# cell is a no-op.
missing_neighbourhood = postalCodes['Neighbourhood'] == "nan"
postalCodes.loc[missing_neighbourhood, 'Neighbourhood'] = postalCodes.loc[missing_neighbourhood, 'Borough']

postalCodes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [42]:
# Display the dataframe's dimensions as a (rows, columns) tuple
postalCodes.shape

(103, 3)

Project: Segmenting and Clustering Neighborhoods in Toronto - Part 2 - Getting the latitude and longitude for the Postal Code

In [43]:
# Read the postal-code coordinate table from the CSV at `path`.
path = "http://cocl.us/Geospatial_data"
# Rename "Postal Code" to "Postcode" so the key column has the same name as in postalCodes.
lat_long = pd.read_csv(path).rename(columns={"Postal Code": "Postcode"})
lat_long

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [56]:
# Join the neighbourhood table with the coordinates, using 'Postcode' as the key.
# merge() defaults to an inner join, so only postcodes present in both frames are kept.
postalCodes_geo = postalCodes.merge(lat_long, on='Postcode')
postalCodes_geo

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


Project: Segmenting and Clustering Neighborhoods in Toronto - Part 3 - Generate maps and how they cluster together

In [45]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.18.1                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge


In [58]:
# Look up the coordinates of Toronto; they are used to centre the maps below.
address = 'Toronto, ON'

# Pass an explicit user agent: calling Nominatim() without one is deprecated in
# geopy 1.x and rejected by geopy 2.x.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
postalCodes_geo

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.653963, -79.387207.


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [59]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One (lat, lng, borough, neighbourhood) tuple per row of postalCodes_geo.
marker_rows = zip(
    postalCodes_geo['Latitude'],
    postalCodes_geo['Longitude'],
    postalCodes_geo['Borough'],
    postalCodes_geo['Neighbourhood']
)

# Draw a blue circle per row; its popup reads "<neighbourhood>, <borough>".
for point_lat, point_lng, borough_name, neighbourhood_name in marker_rows:
    popup_text = ', '.join((str(neighbourhood_name), str(borough_name)))
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

In [71]:
# Number of clusters for k-means.
# NOTE(review): presumably chosen to match the 11 boroughs reported earlier - confirm.
kclusters = 11

# Cluster on the coordinates only. Selecting the two feature columns by name
# (instead of dropping the others with a positional axis argument, which pandas 2
# no longer accepts) also keeps this cell re-runnable: a 'Cluster Labels' column
# added to postalCodes_geo later can no longer leak into the features.
toronto_grouped_clustering = postalCodes_geo[['Latitude', 'Longitude']]

# Fit k-means; random_state is fixed so repeated runs give the same labels.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# Peek at the labels assigned to the first 11 rows.
kmeans.labels_[0:11]

array([ 1,  1,  1,  1,  1, 10, 10, 10, 10, 10, 10], dtype=int32)

In [72]:
# Store each row's k-means cluster id in a 'Cluster Labels' column.
# kmeans.labels_ is assigned positionally, so postalCodes_geo must still have the
# same rows, in the same order, as the frame the model was fitted on.
# NOTE(review): this changes postalCodes_geo in place, so a cell that builds its
# features by dropping columns from postalCodes_geo will pick up 'Cluster Labels'
# if it is re-run after this one.
postalCodes_geo['Cluster Labels'] = kmeans.labels_
postalCodes_geo

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,1
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,10
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,10
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,10
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,10
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,10


In [73]:
# Fresh map for the clustered view, centred on Toronto.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Build one hex colour per cluster from matplotlib's rainbow colormap.
# Only len(ys) is used below, and ys has exactly kclusters entries.
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
palette_positions = np.linspace(0, 1, len(ys))
colors_array = cm.rainbow(palette_positions)
rainbow = list(map(colors.rgb2hex, colors_array))


In [74]:
# Draw one circle per row on map_clusters, coloured by its cluster label.
markers_colors = []
cluster_rows = zip(
    postalCodes_geo['Latitude'],
    postalCodes_geo['Longitude'],
    postalCodes_geo['Neighbourhood'],
    postalCodes_geo['Cluster Labels']
)
for point_lat, point_lon, place_name, cluster_id in cluster_rows:
    # Colour index is the label minus one, so label 0 wraps to the last palette entry.
    cluster_colour = rainbow[cluster_id - 1]
    popup_text = '{} Cluster {}'.format(place_name, cluster_id)
    marker = folium.CircleMarker(
        [point_lat, point_lon],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7)
    marker.add_to(map_clusters)

map_clusters