# Segmenting and Clustering Neighborhoods in Toronto
## IBM Data Science Capstone
#### Sam Stump, December 23, 2019
***

### Gather up postal codes for Toronto

- import packages

In [18]:
import requests
from lxml import html
import pandas as pd

- request the data from the URL into an HTML tree

In [19]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
page = requests.get(url, timeout=30)
# Fail here on an HTTP error rather than parsing an error page as if it held the table.
page.raise_for_status()
tree = html.fromstring(page.content)

- parse the HTML tree and create the raw dataframe

In [20]:
# Build one record per table row, taking exactly one value per <td>.
# Counting individual text nodes in groups of three misaligns every later
# record as soon as a cell holds more than one text node (e.g. a link followed
# by plain text) or no text at all; reading each cell's full text avoids that.
columns = ['PostalCode', 'Borough', 'Neighborhood']
data = []
for tr in tree.xpath('//table[@class="wikitable sortable"]//tr'):
    cells = [td.text_content().strip() for td in tr.xpath('./td')]
    # Rows without exactly one <td> per column (e.g. a header row made of <th>
    # cells) are not data rows and are skipped.
    if len(cells) == len(columns):
        data.append(cells)
df = pd.DataFrame(data, columns=columns)


- remove rows whose borough is 'Not assigned'
- where the neighborhood is 'Not assigned', replace it with the borough name

In [21]:
# Keep only rows with a real borough. The explicit copy makes the filtered
# frame independent of the original, so the column assignment below does not
# write to a slice (which is what triggers pandas' SettingWithCopyWarning).
df = df[df.Borough != 'Not assigned'].copy()
# Where the neighborhood is 'Not assigned', use the borough name; otherwise keep it.
df['Neighborhood'] = df['Borough'].where(df['Neighborhood'] == 'Not assigned', df['Neighborhood'])

- group by (postal code, borough) and accumulate a list of neighborhoods

In [22]:
# Collapse to one row per (PostalCode, Borough) pair, gathering that pair's
# neighborhoods into a list.
grouped = df.groupby(['PostalCode', 'Borough'])
neighborhood_lists = grouped['Neighborhood'].apply(list)
df = neighborhood_lists.reset_index(name='Neighborhood')
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]
5,M1J,Scarborough,[Scarborough Village]
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]"
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]"
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]"
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]"


- dataframe shape

In [23]:
# Display the (rows, columns) shape of df.
df.shape


(103, 3)

### Find lat/lng for postal codes
- import geocoder package

In [24]:
import geocoder

- create two empty lists, `lat` and `lng`
- for each PostalCode in the data frame
    - call the ArcGIS geocoder
    - retrieve the lat, lng from the response and append them to the lists

*Note: this cell takes a few minutes to run; please be patient.*

In [25]:
# Geocode every postal code, collecting coordinates in lists that stay aligned
# with the rows of df (exactly one entry per postal code, in order).
MAX_ATTEMPTS = 3  # lookups tried per postal code before giving up on it
lat = []
lng = []
failed = []  # postal codes for which no result was obtained
print("Geocoding...please wait")
for e in df['PostalCode']:
    place = '{}, Toronto, Canada'.format(e)
    z = None
    for _ in range(MAX_ATTEMPTS):
        z = geocoder.arcgis(place).json
        # NOTE(review): an empty/None result is presumably what geocoder
        # returns when a lookup fails -- confirm against the geocoder docs.
        if z:
            break
    if z:
        lat.append(z['lat'])
        lng.append(z['lng'])
    else:
        # Indexing an empty result would abort the whole multi-minute run;
        # record a placeholder so both lists keep one entry per row of df.
        lat.append(None)
        lng.append(None)
        failed.append(e)
print("Done.")
if failed:
    print("No geocoding result for: {}".format(', '.join(failed)))

Geocoding...please wait
Done.


- append the new columns to the data frame

In [26]:
# Attach each coordinate list to df as a new column, then preview the result.
for column, values in (('Latitude', lat), ('Longitude', lng)):
    df[column] = values
df.head(12)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.811525,-79.195517
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.785665,-79.158725
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.765815,-79.175193
3,M1G,Scarborough,[Woburn],43.768369,-79.21759
4,M1H,Scarborough,[Cedarbrae],43.769688,-79.23944
5,M1J,Scarborough,[Scarborough Village],43.743125,-79.23175
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]",43.726276,-79.263625
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]",43.713054,-79.285055
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]",43.724235,-79.227925
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]",43.69677,-79.259967
