# Segmenting & Clustering Neighborhoods in Toronto

## Import Modules

In [2]:
from bs4 import BeautifulSoup
import requests
from lxml import html
import urllib3
import pandas as pd
print("All Imported")

All Imported


## Define URL/Web Link of Wikipedia page and read in data

In [3]:
# Wikipedia page listing Canadian postal codes that start with "M" (Toronto).
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# NOTE: this assignment rebinds the name `html`, shadowing the `lxml.html` module
# imported in the imports cell. The name is kept because the parsing cell reads `html.text`.
# The timeout prevents the cell from hanging indefinitely on a stalled connection.
html = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page further down.
html.raise_for_status()
print(html)

<Response [200]>


## Parse the HTML response with Beautiful Soup and convert the postal-code table into a list of rows.

In [5]:
# Parse the downloaded page and locate the table by its CSS class.
soup = BeautifulSoup(html.text, "html.parser")
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # Without this guard the failure would surface as an opaque AttributeError on None.
    raise ValueError("No 'wikitable sortable' table found in the page; the page layout may have changed.")
table_rows = table.find_all('tr')

# Build one list of cell texts per table row.
res = []
for table_row in table_rows:
    # Strip each <td> once; rows without <td> cells produce nothing and are skipped below.
    cell_texts = (cell.text.strip() for cell in table_row.find_all('td'))
    # NOTE(review): empty cells are dropped (as before), which would shift later values left
    # if a row ever contained a blank cell -- confirm this is acceptable for the source table.
    row = [text for text in cell_texts if text]
    if row:
        res.append(row)

print("All Done")

All Done


## Convert into a DataFrame with headers and drop rows whose borough is 'Not assigned'

In [6]:
# Column names for the three values scraped from each table row.
labels = ['postalcode', 'borough', 'neighborhood']
scraped = pd.DataFrame(data=res, columns=labels)
# Keep only the rows that have a real borough; the original row index is preserved.
has_borough = scraped['borough'] != 'Not assigned'
df = scraped[has_borough]

### Check result after above

In [8]:
# Preview the first rows of the filtered frame to confirm the 'Not assigned' boroughs are gone.
df.head()

Unnamed: 0,postalcode,borough,neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## Aggregate the neighborhoods that share a postal code into a single comma-separated row.

In [10]:
# Collapse rows that share a postal code and borough into one row, joining their
# neighborhood names with commas. sort=False keeps the groups in order of first appearance.
grouped = df.groupby(by=['postalcode', 'borough'], sort=False, as_index=False)
df_merge = grouped.agg({'neighborhood': ','.join})


### Check result of above

In [11]:
# Preview the aggregated frame; postal codes with several neighborhoods now show a comma-joined list.
df_merge.head()

Unnamed: 0,postalcode,borough,neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


## Replace any 'Not assigned' neighborhood with the name of its borough.

In [34]:
# Rows whose neighborhood is still the placeholder 'Not assigned'.
unassigned = df_merge['neighborhood'] == 'Not assigned'

# Assign to the 'neighborhood' column only. Indexing with the row mask alone would target
# every column of the matching rows, and a bare `except: pass` would hide any resulting error.
df_merge.loc[unassigned, 'neighborhood'] = df_merge.loc[unassigned, 'borough']

# Sanity check: no placeholder neighborhoods should remain after the replacement.
assert not (df_merge['neighborhood'] == 'Not assigned').any()




### Check result of above

In [32]:
# Spot-check postal code M7A, whose neighborhood was 'Not assigned' in the aggregated preview.
df_merge.loc[df_merge['postalcode'] == 'M7A']

Unnamed: 0,postalcode,borough,neighborhood
4,M7A,Queen's Park,Queen's Park


## Check number of rows in dataset

In [36]:
# (rows, columns) of the aggregated frame.
df_merge.shape

(103, 3)