# Segmenting and Clustering Neighborhoods in Toronto

### Import Libraries

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scrape Wikipedia page of Toronto postal codes

In [2]:
# Fetch the Wikipedia page that lists the "M" postal codes.
# The timeout (in seconds) keeps the notebook from hanging on a stalled
# connection, and raise_for_status() stops the cell on an HTTP error instead
# of silently parsing an error page.
wiki_page = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_page.raise_for_status()

soup = BeautifulSoup(wiki_page.content, 'lxml')
tables = soup.find_all('table')   # retrieve all tables from the Wikipedia page

# Convert the 1st table to a dataframe. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in recent
# pandas releases; a file-like object is accepted by old and new versions.
df_toronto_raw = pd.read_html(StringIO(str(tables)), header=0)[0]
print('Postal Code table shape:', df_toronto_raw.shape)
df_toronto_raw.head(10)


Postal Code table shape: (289, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Clean it up a little

In [3]:
# Keep only the rows whose borough is assigned. The result is a new frame, so
# df_toronto_raw is left untouched and this cell gives the same result no
# matter how many times it is re-run.
df_toronto_assigned = df_toronto_raw[df_toronto_raw.Borough != 'Not assigned']

# Group by postal code & borough, joining all neighbourhoods of a group into
# one comma-separated string. sort=False keeps the groups in order of first
# appearance rather than sorting them by key.
df_Toronto = (
    df_toronto_assigned
    .groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

# Make sure all neighbourhoods have a name: where the neighbourhood is still
# 'Not assigned', fall back to the borough name. A single .loc assignment is
# used instead of chained indexing, which pandas does not guarantee will write
# through to the frame.
unnamed = df_Toronto.Neighbourhood == 'Not assigned'
df_Toronto.loc[unnamed, 'Neighbourhood'] = df_Toronto.loc[unnamed, 'Borough']

print('Original Wikipedia table shape:', df_toronto_raw.shape)
print('Dataframe shape w/o unassigned boroughs:', df_toronto_assigned.shape)
print('Dataframe shape after cleanup:', df_Toronto.shape)
df_Toronto.head(10)

Original Wikipedia table shape: (289, 3)
Dataframe shape w/o unassigned boroughs: (212, 3)
Dataframe shape after cleanup: (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"
