# Segmenting and clustering neighbourhood in Toronto #

## 1 - Import data from Wikipedia ##

In this section we build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [1]:
#!pip install bs4
import requests
from bs4 import BeautifulSoup

website_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(website_url,"lxml")
#print(soup.prettify())

In [2]:
# Wikipedia page title
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [3]:
# table extracted from Wikipedia url
my_table = soup.find("table",{"class": "wikitable sortable"})
#my_table

In [4]:
#columns' names
header = [th.text.rstrip() for th in my_table.find_all("th")]
header

['Postal Code', 'Borough', 'Neighbourhood']

In [5]:
#extract values of each table's cell
c1=[]
c2=[]
c3=[]

for row in my_table.find_all("tr")[1::1]:
    cells = row.find_all("td")
    c1.append(cells[0].text.replace("\n", ""))
    c2.append(cells[1].text.replace("\n", ""))
    c3.append(cells[2].text.replace("\n", ""))

In [6]:
import pandas as pd

#create the DataFrame by the collected data from the table
df = pd.DataFrame({header[0].replace(" ", ""): c1, header[1]: c2, header[2]: c3})
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## 2 - Clean and structure the dataframe ##

In [7]:
#delete rows that don't have the borough
df = df[df.Borough!="Not assigned"]

In [8]:
#group neghbourhoods by postal code
df.groupby(["PostalCode"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
#if a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough
not_assign_neigh=df.loc[df["Neighbourhood"]=="Not assigned"].index.tolist()
if not_assign_neigh!=[]:
    for i in not_assign_neigh:
        df.loc[i,"Neighbourhood"] = df.loc[i,"Borough"]

In [10]:
#print the number of rows of the dataframe
df.shape

(103, 3)

## 3 - Get geographical coordinates of the neighbourhood ##

Now we get the geographical coordinates of the neighborhoods using the Geocoder package, here is a link to a csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [11]:
#read the csv file in a dataframe with the latitudes and longitudes of postal codes
lat_lon_df = pd.read_csv("https://cocl.us/Geospatial_data")
lat_lon_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
#merge the two dataframe in a new one based on postal codes
lat_lon_df.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
df_merged = pd.merge(df, lat_lon_df, how="outer", on="PostalCode")
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 4 - Clustering ##
We explore and cluster the neighborhoods in Toronto using KMeans, working with only boroughs that contain the word Toronto.

In [13]:
#create a dataframe with only the Toronto's boroughs
toronto_df = df_merged[df_merged["Borough"].str.contains("Toronto", regex=False)]
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [14]:
from sklearn.cluster import KMeans

#set number of clusters
kclusters = 5

toronto_clustering = toronto_df.drop(['PostalCode', "Borough", "Neighbourhood"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

# check cluster labels generated for each row in the dataframe
labels = kmeans.labels_

#insert the cluster labels in the dataframe
toronto_df.insert(0, "Cluster Labels", labels)
toronto_df

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,1,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,2,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [16]:
#!pip install folium
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
toronto_map = folium.Map(location=[43.651070,-79.347015], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat,lon,neighbourhood,cluster in zip(toronto_df["Latitude"],toronto_df["Longitude"],toronto_df["Neighbourhood"],toronto_df["Cluster Labels"]):
    label = folium.Popup(str(neighbourhood) + " Cluster "+str(cluster),parse_html=True)
    folium.CircleMarker(
        [lat,lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=True,
        fill_opacity=0.7
    ).add_to(toronto_map)
    
toronto_map


This map could not be visible on Github, so you can go at the end of this page to see it.
https://dataplatform.cloud.ibm.com/analytics/notebooks/v2/4d02f311-b015-46ea-a79c-7184372f6dc8/view?access_token=24ed47cc3a64640e07e2e85c53e337ccb0d3f9a48d9298478fd98f8df12be284