# Segmenting and Clustering Neighborhoods in Toronto

## Create Toronto Neighborhood dataframe

In [11]:
import pandas as pd
import numpy as np
import re

In [2]:
#!python3 -m pip install lxml  ## Uncomment if missing library



In [3]:
# Wikipedia page listing the Canadian postal codes that begin with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [42]:
# Using pandas read_html to extract tables from url.
# NOTE: this reads the live page over the network, so the tables (and their
# order) can change whenever the article is edited.
url_tables = pd.read_html(url)

In [44]:
# Getting our required table and storing it in dataframe.
# The first extracted table is taken to be the postal-code grid; this depends
# on the page layout, so re-check the index if the output below looks wrong.
codes_df = url_tables[0]
codes_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7ADowntown Toronto(Queen's Park / Ontario Pro...,M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned
5,M1HScarborough(Cedarbrae),M2HNorth York(Hillcrest Village),M3HNorth York(Bathurst Manor / Wilson Heights ...,M4HEast York(Thorncliffe Park),M5HDowntown Toronto(Richmond / Adelaide / King),M6HWest Toronto(Dufferin / Dovercourt Village),M7HNot assigned,M8HNot assigned,M9HNot assigned
6,M1JScarborough(Scarborough Village),M2JNorth York(Fairview / Henry Farm / Oriole),M3JNorth York(Northwood Park / York University),M4JEast YorkEast Toronto(The Danforth East),M5JDowntown Toronto(Harbourfront East / Union ...,M6JWest Toronto(Little Portugal / Trinity),M7JNot assigned,M8JNot assigned,M9JNot assigned
7,M1KScarborough(Kennedy Park / Ionview / East B...,M2KNorth York(Bayview Village),M3KNorth York(Downsview)East (CFB Toronto),M4KEast Toronto(The Danforth West / Riverdale),M5KDowntown Toronto(Toronto Dominion Centre / ...,M6KWest Toronto(Brockton / Parkdale Village / ...,M7KNot assigned,M8KNot assigned,M9KNot assigned
8,M1LScarborough(Golden Mile / Clairlea / Oakridge),M2LNorth York(York Mills / Silver Hills),M3LNorth York(Downsview)West,M4LEast Toronto(India Bazaar / The Beaches West),M5LDowntown Toronto(Commerce Court / Victoria ...,M6LNorth York(North Park / Maple Leaf Park / U...,M7LNot assigned,M8LNot assigned,M9LNorth York(Humber Summit)
9,M1MScarborough(Cliffside / Cliffcrest / Scarbo...,M2MNorth York(Willowdale / Newtonbrook),M3MNorth York(Downsview)Central,M4MEast Toronto(Studio District),M5MNorth York(Bedford Park / Lawrence Manor East),M6MYork(Del Ray / Mount Dennis / Keelsdale and...,M7MNot assigned,M8MNot assigned,M9MNorth York(Humberlea / Emery)


#### The next section goes over each cell of the table and extracts the postal code, borough and neighborhood using regex.
The extracted values are stored in list objects to later be converted into a dataframe.
Cells whose postal code is not assigned are skipped.
Each remaining cell is parsed with a regular expression to extract the postal code, borough and neighborhood.
Four cells had irregular formatting (apparently an artifact of `pd.read_html`), so those were added manually.
The extracted postal codes, boroughs and neighborhoods are appended to lists, which are used later to build the dataframe.

In [95]:
## This section of the code goes over each cell of the table and extracts
## Postal Code, Borough and Neighborhood using regex.
## The extracted values are stored in list objects to later be converted
## into a dataframe.

## Patterns are compiled once here instead of once per cell inside the loops.
empty_regex = re.compile("Not assigned")
## Cells with irregular formatting; these are only reported, for manual entry
manual_regex = re.compile("Enclave|YorkEast")
## Leading three-character postal code
code_regex = re.compile(r"^(M\d\S)")
## <postal code><borough>(<neighborhood ...>
extract_regex = re.compile(r"^(M\d\S)([a-zA-Z\s]+)\((.+$)")

## 'Empty'  -> number of cells without an assigned postal code
## 'Manual' -> postal codes that must be entered by hand
count = {}
postal_codes = []
borough_list = []
hood_list = []

for label, content in codes_df.items():
    for cell in content:
        ## A missing (non-string) cell carries no postal code, so it is
        ## counted together with the "Not assigned" cells
        if not isinstance(cell, str) or empty_regex.search(cell):
            count['Empty'] = count.get('Empty', 0) + 1
            continue

        ## Found 4 cells with weird formatting so decided to address those manually
        if manual_regex.search(cell):
            code_match = code_regex.search(cell)
            ## Fall back to the raw cell text if even the code cannot be read
            postal_code = code_match.group(0) if code_match else cell
            print(f"No match found for {postal_code}.. Please manually assign")
            count['Manual'] = count.get('Manual', []) + [postal_code]
            continue

        match = extract_regex.search(cell)
        if match is None:
            ## Name the offending cell so it can actually be fixed by hand
            print("*******************")
            print(f"Error parsing {cell!r}.. Please review and assign value to this code manually")
            print("*******************")
            continue

        postal_code, borough, neighborhood = match.groups()
        ## The neighborhood can contain an inner ")" followed by more text,
        ## e.g. "Don Mills)North"; replace that ")" with a space
        neighborhood = re.sub(r"(\S)\)(\S)", r"\1 \2", neighborhood)
        ## Also remove ")" from the ends of the string
        neighborhood = neighborhood.strip(")")
        ## Convert any '/' to ',' in the neighborhood as per requirements (multiple neighborhood)
        neighborhood = neighborhood.replace("/", ",")

        postal_codes.append(postal_code)
        borough_list.append(borough)
        hood_list.append(neighborhood)

print(count)

No match found for M4J.. Please manually assign
No match found for M5W.. Please manually assign
No match found for M7R.. Please manually assign
No match found for M7Y.. Please manually assign
{'Empty': 77, 'Manual': ['M4J', 'M5W', 'M7R', 'M7Y']}


Now we use the lists created in the previous section to build a dataframe, and add manual values for the four postal codes identified previously.

In [101]:
## Rows parsed automatically from the table
parsed_rows = pd.DataFrame({
    'PostalCode': postal_codes,
    'Borough': borough_list,
    'Neighborhood': hood_list,
})

## Manual entries for the weird formatting identified in the previous section
manual_rows = pd.DataFrame(
    [
        ['M4J', 'East York', 'East Toronto (The Danforth East)'],
        ['M5W', 'Downtown Toronto', 'Stn A PO Boxes, 25 The Esplanade (Enclave of M5E)'],
        ['M7R', 'Mississauga', 'Canada Post Gateway Processing Centre (Enclave of L4W)'],
        ['M7Y', 'East Toronto', 'Business reply mail Processing Centre, 969 Eastern (Enclave of M4L)'],
    ],
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

## Manual rows go after the parsed ones, with a continuous 0..n-1 index
toronto_df = pd.concat([parsed_rows, manual_rows], ignore_index=True)
toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [102]:
# Sanity check: (number of rows, number of columns) of the assembled frame
toronto_df.shape

(103, 3)

### Getting latitude and longitude using geocoder/ given CSV file

This section covers getting the longitude and latitude and adding them to the dataframe. I first tried using geocoder with the while-loop method described in the instructions, but it didn't work. I also tried using the Nominatim library, but that didn't work either. Therefore I resorted to using the CSV file.

Importing CSV file

In [120]:
import requests

# Download the geospatial CSV and save it locally as geo.csv.
# NOTE: this rebinds `url`, which earlier held the Wikipedia page address.
url = 'http://cocl.us/Geospatial_data'
# A timeout keeps the cell from hanging forever if the server never answers
r = requests.get(url, allow_redirects=True, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as geo.csv
r.raise_for_status()
# The context manager guarantees the file is flushed and closed before the
# next cell reads it back
with open('geo.csv', 'wb') as geo_file:
    geo_file.write(r.content)

2891

In [125]:
## Load the downloaded file; 'Postal Code' is renamed to 'PostalCode' to prep
## this df for merging with the toronto dataframe
geo_df = pd.read_csv('geo.csv').rename(columns={'Postal Code': 'PostalCode'})
geo_df

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Using pd.merge (join) to left join the toronto dataframe with the geo dataframe on Postal Code field

In [127]:
# Left join: every toronto_df row is kept, and Latitude/Longitude are looked
# up in geo_df through the shared PostalCode column
final_df = toronto_df.merge(geo_df, on='PostalCode', how='left')
final_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


### Now we will cluster the neighborhoods in Toronto

For clustering, I have decided to cluster the neighborhoods based on the Borough name.
This will group the neighborhoods by their geographical location, and we will be able to see distinct clusters when we create the Folium map.

In [144]:
# Number of postal codes per borough, to see which borough names can be consolidated
final_df['Borough'].value_counts()

North York            24
Downtown Toronto      19
Scarborough           17
Etobicoke             11
Central Toronto        9
West Toronto           6
East York              5
East Toronto           5
York                   5
Mississauga            1
EtobicokeNorthwest     1
Name: Borough, dtype: int64

For the purposes of clustering, we will reduce the number of unique Borough names.
East York will be merged into East Toronto.
York will be merged into North York.
Mississauga and EtobicokeNorthwest will be merged into Etobicoke.

In [150]:
# Borough names to fold into a neighbouring borough (old name -> new name).
# Matching is on the exact value, so 'York' does not touch 'North York' or 'East York'.
borough_merges = {
    'East York': 'East Toronto',
    'York': 'North York',
    'Mississauga': 'Etobicoke',
    'EtobicokeNorthwest': 'Etobicoke',
}

# Work on a copy so final_df keeps the original borough names
new_df = final_df.copy()
new_df['Borough'] = new_df['Borough'].replace(borough_merges)
new_df['Borough'].value_counts()

North York          29
Downtown Toronto    19
Scarborough         17
Etobicoke           13
East Toronto        10
Central Toronto      9
West Toronto         6
Name: Borough, dtype: int64

In [146]:
 # one hot encoding
# One indicator column per borough, named by the bare borough value (no prefix)
toronto_onehot = pd.get_dummies(new_df['Borough'])
toronto_onehot

Unnamed: 0,Central Toronto,Downtown Toronto,East Toronto,Etobicoke,North York,Scarborough,West Toronto
0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0
5,0,0,0,0,0,1,0
6,0,0,0,0,0,1,0
7,0,0,0,0,0,1,0
8,0,0,0,0,0,1,0
9,0,0,0,0,0,1,0


Now applying KMeans cluster to divide this data into 7 clusters.

In [154]:
# import k-means
from sklearn.cluster import KMeans

# One cluster per borough remaining after the consolidation above
k_clusters = 7

# run k-means clustering
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it implicit makes the labels depend on the installed
# version (and triggers a FutureWarning on some releases).
kmeans = KMeans(n_clusters=k_clusters, random_state=0, n_init=10).fit(toronto_onehot)

# check cluster labels generated for each row in the dataframe
labels = kmeans.labels_
labels.shape

(103,)

Now that we have used KMeans to generate cluster labels, we will add these labels back into the new_df along side Longitude and Latitude information

In [155]:
# Attach each row's k-means cluster label as a new 'Labels' column
new_df['Labels'] = labels
new_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Labels
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353,3
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497,3
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711,3
3,M1G,Scarborough,Woburn,43.770992,-79.216917,3
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,3
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029,3
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577,3
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476,3
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848,3


Now, let's visualize the resulting clusters on the map of Toronto

First create a map of Toronto

In [162]:
# create map
import folium
# Hard-coded map centre, given as (latitude, longitude)
latitude, longitude = 43.65,-79.38
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# Last expression of the cell, so the (still marker-free) map is rendered
map_clusters

Setting up the color scheme for the clusters

In [165]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# One evenly spaced colour from the rainbow colormap per cluster.
# (The previous x / ys arrays were only ever used for their length, which
# always equals k_clusters, so the count is used directly.)
colors_array = cm.rainbow(np.linspace(0, 1, k_clusters))
# Hex strings, the form the map markers take as a colour
rainbow = [colors.rgb2hex(i) for i in colors_array]
print(rainbow)


['#8000ff', '#2c7ef7', '#2adddd', '#80ffb4', '#d4dd80', '#ff7e41', '#ff0000']


Now adding markers to the map of Toronto showing each cluster

In [166]:
# add markers to the map
# NOTE: markers are added to the existing map object, so re-running this cell
# without re-creating map_clusters draws every marker a second time.
for lat, lon, poi, cluster in zip(new_df['Latitude'], new_df['Longitude'], new_df['Neighborhood'], new_df['Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the colour list directly.
    # (cluster-1 only worked because -1 wraps around to the last colour.)
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters