# Segmenting and Clustering Neighborhoods in Toronto

### Import Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Create Toronto postal code dataframe

In [2]:
# Fetch the Wikipedia list of "M" postal codes and parse its first HTML table.
# A timeout and an explicit status check make a failed download surface here
# instead of as a confusing parsing error further down.
wiki_page = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_page.raise_for_status()
soup = BeautifulSoup(wiki_page.content, 'lxml')
tables = soup.find_all('table')
df_toronto_raw = pd.read_html(str(tables), header=0)[0]

# Keep only rows whose borough has been assigned (rebinding rather than
# dropping in place, so re-reading the cell shows exactly what the frame holds).
df_toronto_raw = df_toronto_raw[df_toronto_raw.Borough != 'Not assigned']

# One row per (Postcode, Borough); neighbourhoods sharing a postcode are
# joined into a single comma-separated string, in first-appearance order.
df_Toronto = (
    df_toronto_raw
    .groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

# A postcode whose only neighbourhood is 'Not assigned' takes its borough name.
# Boolean-mask .loc assignment replaces the chained-indexing loop, which
# pandas cannot guarantee writes back to the frame.
unnamed = df_Toronto.Neighbourhood == 'Not assigned'
df_Toronto.loc[unnamed, 'Neighbourhood'] = df_Toronto.loc[unnamed, 'Borough']

df_Toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Get latitude / longitude data

In [3]:
# Shell escape: download the file behind the cocl.us short link and save it in
# the working directory as Geospatial_Coordinates.csv (-O names the output file).
!wget -O Geospatial_Coordinates.csv https://cocl.us/Geospatial_data

--2018-12-20 17:23:56--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 159.8.72.228
Connecting to cocl.us (cocl.us)|159.8.72.228|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2018-12-20 17:23:58--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.27.197
Connecting to ibm.box.com (ibm.box.com)|107.152.27.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2018-12-20 17:23:58--  https://ibm.ent.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.ent.box.com (ibm.ent.box.com)... 107.152.26.211
Connecting to ibm.ent.box.com (ibm.ent.box.com)|107.152.26.211|:443... connected.
HTTP request sent, awaiting response... 302 Found
Loc

In [4]:
# Load the downloaded coordinates file into a dataframe and preview the
# first five rows.
df_geo = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
df_geo.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [5]:
# Report (rows, columns) for both frames so their sizes can be compared
# before the coordinates are attached.
for label, frame in (('Toronto df shape:', df_Toronto), ('Geo data df:', df_geo)):
    print(label, frame.shape)

Toronto df shape: (103, 3)
Geo data df: (103, 3)


### Add Latitude / Longitude to Toronto dataframe

In [6]:
# Look coordinates up by postal code instead of scanning df_geo once per
# Toronto row. If df_geo repeats a code, the last occurrence wins, matching
# the behaviour of the nested loop this replaces.
geo_by_code = (
    df_geo
    .drop_duplicates(subset='Postal Code', keep='last')
    .set_index('Postal Code')
)

# Series.map aligns on df_Toronto's own index. A postcode that is missing from
# df_geo becomes NaN rather than the misleading coordinate (0.0, 0.0).
lat = df_Toronto.Postcode.map(geo_by_code.Latitude)
lon = df_Toronto.Postcode.map(geo_by_code.Longitude)

# Plain column assignment keeps the cell idempotent: re-running it overwrites
# the two columns instead of adding duplicates.
df_Toronto['Latitude'] = lat
df_Toronto['Longitude'] = lon
df_Toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
