# Segmenting and Clustering Neighborhoods in Toronto

In [142]:
# Import libraries
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup

In [143]:
# Define url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [144]:
page = urllib.request.urlopen(url)

### Read and parse the wikipage using beautifulsoup

In [145]:
soup = BeautifulSoup(page, "lxml")

In [146]:
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')


In [147]:
# Parse the data and write ti to list and then into a dataframe
toranto_table = []
for tr in table_rows :
    td = tr.find_all('td')
    row = [tr.text.strip('\n') for tr in td]
    toranto_table.append(row)
df = pd.DataFrame(toranto_table, columns=["PostalCode", "Borough", "Neighbourhood"])
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
...,...,...,...
176,M5Z,Not assigned,Not assigned
177,M6Z,Not assigned,Not assigned
178,M7Z,Not assigned,Not assigned
179,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Data cleaning

In [148]:
# Omit the rows which habe Borough not assigned
df = df[df.Borough != "Not assigned"]

In [149]:
#Drop the first row which has null values
df= df.drop(0)

In [150]:
df.groupby('PostalCode', as_index=False).agg("Neighbourhood")
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
161,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
166,M4Y,Downtown Toronto,Church and Wellesley
169,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
170,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [151]:
# Reset the index 
df = df.reset_index()
del df['index']
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [152]:
#Size of the dataframe
df.shape

(103, 3)