<h1 align=center><font size = 5>Toronto Neighborhood Clustering</font></h1>

<h3>Import BeautifulSoup and requests to scrape the Wikipedia page</h3>

In [18]:
import requests
from bs4 import BeautifulSoup

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
REQUEST_TIMEOUT_SECONDS = 30

# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error response instead of silently parsing an error page.
req = requests.get(WIKI_URL, timeout=REQUEST_TIMEOUT_SECONDS)
req.raise_for_status()

# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(req.content, 'lxml')

# Every table carrying the "wikitable" CSS class (find_all is the current
# spelling of the legacy findAll alias).
wikitables = soup.find_all("table", class_='wikitable')

In [19]:
# Sanity check: show the type of the parsed document object.
type(soup)

bs4.BeautifulSoup

In [20]:
# Retrieve Canada's postal code table: the first table on the page that carries
# the "wikitable" CSS class.
table = soup.find('table', class_='wikitable')
if table is None:
    raise ValueError("No table with class 'wikitable' was found in the scraped page")

# An explicit <tbody> is not guaranteed to be present in the parsed tree; fall
# back to the table element itself so the row lookup below still works.
table_body = table.find('tbody')
if table_body is None:
    table_body = table

# Build one list of stripped, non-empty <td> texts per <tr>.
# A row that contains no <td> cells contributes an empty list.
data = []
rows = table_body.find_all('tr')
for row in rows:
    cols = [ele.text.strip() for ele in row.find_all('td')]
    data.append([ele for ele in cols if ele])  # Get rid of empty values

In [21]:
# Sanity check: show the container type holding the scraped rows.
type(data)

list

<h3>Load the data into a dataframe</h3>

In [22]:
import pandas as pd

# Build the frame from the scraped rows. Rows with no cell values (an empty
# list in `data`) are filtered out by content rather than by dropping
# position 0, so a real data row is never discarded when the scraped list does
# not start with an empty entry. Column names are assigned at construction and
# the resulting index is a fresh 0..n-1 range.
df = pd.DataFrame(
    [row for row in data if row],
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [23]:
# (rows, columns) of the frame before any filtering.
df.shape

(289, 3)

<h3>Drop rows whose Borough is "Not assigned"</h3>

In [24]:
# Collect the index labels of every row whose Borough is 'Not assigned'.
is_unassigned = df['Borough'] == 'Not assigned'
indexRow = df.loc[is_unassigned].index
print(indexRow)

Int64Index([  0,   1,   9,  13,  20,  21,  30,  36,  37,  45,  46,  50,  51,
             52,  54,  55,  59,  60,  61,  73,  74,  75,  88,  89,  90, 104,
            105, 106, 120, 121, 136, 137, 148, 149, 155, 161, 162, 167, 175,
            181, 182, 188, 189, 190, 194, 195, 201, 202, 203, 204, 209, 210,
            223, 224, 238, 239, 242, 243, 248, 249, 254, 255, 259, 260, 261,
            262, 264, 265, 275, 276, 277, 278, 279, 280, 281, 282, 288],
           dtype='int64')


In [25]:
# Number of rows matched by the 'Not assigned' Borough condition.
len(indexRow)

77

In [26]:
# Drop the rows with Borough = "Not assigned".
# Filtering on the condition itself (rather than dropping previously computed
# index labels in place) keeps this cell idempotent: once the index has been
# reset, re-running a label-based drop would remove unrelated rows.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

In [27]:
# (rows, columns) after removing rows with an unassigned Borough.
df.shape

(212, 3)

<h3>Merge Neighborhoods that share the same PostalCode</h3>

In [28]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row whose
# Neighborhood is the comma-joined list of names; sort=False keeps the groups
# in first-seen order.
grouped = df.groupby(['PostalCode', 'Borough'], sort=False)
df_new = grouped['Neighborhood'].agg(','.join).reset_index()

In [29]:
# Preview the first five merged rows.
df_new.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


<h3>Find rows that have a Borough but whose Neighborhood is "Not assigned", and set the Neighborhood to the Borough name</h3>

In [30]:
# Rows that have a real Borough but whose Neighborhood is still "Not assigned".
has_borough = df_new['Borough'] != "Not assigned"
lacks_neighborhood = df_new['Neighborhood'] == "Not assigned"
df_new.loc[has_borough & lacks_neighborhood]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M7A,Queen's Park,Not assigned


In [31]:
# For every row whose Neighborhood is "Not assigned", use the Borough name as
# the Neighborhood. Selecting by condition (instead of a hard-coded row label
# and literal value) keeps the fix correct if the scraped table changes.
needs_name = df_new['Neighborhood'] == 'Not assigned'
df_new.loc[needs_name, 'Neighborhood'] = df_new.loc[needs_name, 'Borough']

In [32]:
# Display the row at index label 4 to check its Neighborhood value.
df_new.loc[4]

PostalCode               M7A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 4, dtype: object

In [33]:
# Preview the first rows of the cleaned frame.
df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [34]:
# (rows, columns) of the final merged frame.
df_new.shape

(103, 3)

### End of Notebook