# Segmenting and Clustering Neighborhoods in Toronto
## IBM Data Science Capstone
#### Sam Stump, December 23, 2019
***

### Gather up postal codes for Toronto

- import packages

In [171]:
import requests
from lxml import html
import pandas as pd

- request the data from the URL into an HTML tree

In [172]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# requests has no default timeout; without one a stalled connection would
# block this cell (and "Run All") indefinitely.
page = requests.get(url, timeout=30)
# Raise on 4xx/5xx so an error page is never parsed as if it were the table.
page.raise_for_status()
# Parse the raw response bytes into an lxml HTML element tree.
tree = html.fromstring(page.content)

- parse the HTML tree and create the raw dataframe

In [173]:
# Build the raw dataframe from the postal-code table, one record per <tr>.
#
# Cells are read row by row (rather than chunking a flat stream of text
# nodes into groups of three) so that a cell holding no text, or several
# text nodes such as a link followed by plain text, cannot shift the
# remaining values into the wrong columns.
columns = ['PostalCode', 'Borough', 'Neighborhood']
data = []
for table_row in tree.xpath('//table[@class="wikitable sortable"]//tr'):
    # text_content() joins all text nested inside the cell (links included).
    cells = [cell.text_content().strip() for cell in table_row.xpath('./td')]
    # Rows without <td> children (e.g. a header made of <th>) give an empty
    # list; rows with the wrong number of cells or a blank cell are
    # skipped instead of being allowed to misalign the columns.
    if len(cells) == len(columns) and all(cells):
        data.append(cells)
df = pd.DataFrame(data, columns=columns)


- remove rows with 'Not assigned' boroughs
- replace 'Not assigned' neighborhoods with the borough name

In [174]:
# Keep only the rows whose borough is assigned. The explicit .copy() makes
# the result an independent frame, so the assignment below modifies it
# directly instead of writing to a slice (SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()
# Where the neighborhood is 'Not assigned', fall back to the borough name;
# all other neighborhoods are left as they are.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

- group by (postal code, borough) and accumulate a list of neighborhoods

In [175]:
# Collapse the frame to one row per (postal code, borough) pair, gathering
# that pair's neighborhoods into a single Python list.
group_keys = ['PostalCode', 'Borough']
neighborhood_lists = df.groupby(group_keys)['Neighborhood'].apply(list)
# Turn the grouped Series back into a flat dataframe with the keys as columns.
df = neighborhood_lists.reset_index(name='Neighborhood')
# Preview the first ten grouped rows.
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]
5,M1J,Scarborough,[Scarborough Village]
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]"
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]"
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]"
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]"


- check the shape (rows, columns) of the resulting dataframe

In [176]:
# Display the (number of rows, number of columns) tuple of df.
df.shape


(103, 3)