## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Import Libraries
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Without a timeout, requests waits forever on a stalled connection and the
# kernel appears hung; 30 seconds is generous for a single page.
toronto = requests.get(URL, timeout=30)
# Fail fast on a 4xx/5xx response instead of handing an error page to the
# parsing cell below.
toronto.raise_for_status()
print(toronto.headers)

{'Date': 'Tue, 17 Dec 2019 01:46:03 GMT', 'Vary': 'Accept-Encoding,Cookie,Authorization', 'Server': 'ATS/8.0.5', 'X-ATS-Timestamp': '1576547163', 'Content-Type': 'text/html; charset=UTF-8', 'X-Powered-By': 'PHP/7.2.24-1+0~20191026.31+debian9~1.gbpbbacde+wmf1', 'X-Content-Type-Options': 'nosniff', 'P3P': 'CP="See https://en.wikipedia.org/wiki/Special:CentralAutoLogin/P3P for more info."', 'Content-language': 'en', 'Last-Modified': 'Fri, 13 Dec 2019 03:18:54 GMT', 'Backend-Timing': 'D=102301 t=1576207541686528', 'Content-Encoding': 'gzip', 'Content-Length': '15091', 'X-Varnish': '285601697 283110963', 'Age': '1513', 'X-Cache': 'cp2012 hit, cp2004 hit/7', 'X-Cache-Status': 'hit-front', 'Server-Timing': 'cache;desc="hit-front"', 'Strict-Transport-Security': 'max-age=106384710; includeSubDomains; preload', 'Set-Cookie': 'WMF-Last-Access=17-Dec-2019;Path=/;HttpOnly;secure;Expires=Sat, 18 Jan 2020 00:00:00 GMT, WMF-Last-Access-Global=17-Dec-2019;Path=/;Domain=.wikipedia.org;HttpOnly;secure;Ex

In [3]:
# Clean the data: parse the downloaded HTML, take the first <table> on the
# page, and turn its rows into [postalcode, borough, neighborhood] records.
soup = BeautifulSoup(toronto.content, 'html.parser')
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; raise a clear error here
    # rather than an AttributeError on the next line.
    raise ValueError("No <table> element found in the downloaded page")

trs = table.find_all('tr')

# Keep only the rows that carry the three expected <td> cells. Rows with no
# <td> at all (such as a <th>-only header row) and malformed short rows are
# skipped so the indexing below cannot raise IndexError.
rows = []
for tr in trs:
    cells = tr.find_all('td')
    if len(cells) >= 3:
        rows.append(cells)

lst = []
for row in rows:
    # strip() removes both leading and trailing whitespace/newlines so the
    # 'Not assigned' comparisons are not defeated by stray padding.
    postalcode = row[0].text.strip()
    borough = row[1].text.strip()
    neighborhood = row[2].text.strip()
    # Drop rows whose borough is unassigned.
    if borough == 'Not assigned':
        continue
    # A row with a borough but no neighborhood reuses the borough name.
    if neighborhood == 'Not assigned':
        neighborhood = borough
    lst.append([postalcode, borough, neighborhood])

In [4]:
# Assemble the cleaned records into a DataFrame, report its dimensions,
# and preview the first rows.
cols = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data=lst, columns=cols)
print(df.shape)
df.head(n=5)

(210, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [5]:
# Collapse to one row per postal code: keep the first borough seen for the
# code and join all of its neighborhoods into one comma-separated string.
# as_index=False keeps PostalCode as an ordinary column.
df = df.groupby('PostalCode', as_index=False).agg(
    {'Borough': 'first', 'Neighborhood': ', '.join}
)

In [6]:
# Preview the first rows of the grouped frame (one row per postal code).
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# Spot-check M5A: the Wikipedia table lists only one neighborhood for this
# postal code, so its Neighborhood value should be a single name.
df[df['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,Harbourfront


In [8]:
# Spot-check M1C, a postal code with several neighborhoods: they should
# appear joined into one comma-separated string.
df.loc[df['PostalCode'].eq('M1C')]

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"


In [9]:
# Spot-check M7A, which has no neighborhood information in the source table.
# Per the cleaning rule, its neighborhood should equal its borough,
# "Queen's Park".
df[df['PostalCode'].eq('M7A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [10]:
# Confirm the final dimensions: after grouping there is one row per postal
# code and three columns (PostalCode, Borough, Neighborhood).
df.shape

(103, 3)