# Segmenting and Clustering Neighborhoods in Toronto

In [23]:
import urllib.request # standard-library HTTP client used to download the source data

import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes
import matplotlib.pyplot as plt # plotting library

#### Getting data from Wikipedia

In [108]:
# Wikipedia list of Canadian postal codes beginning with "M".
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Read the raw HTML bytes of the page. The context manager closes the HTTP
# response as soon as the body has been read instead of leaving it open.
with urllib.request.urlopen(url) as response:
    page = response.read()

#### Parsing

In [170]:
from bs4 import BeautifulSoup

# Build a parse tree from the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(page, features='html.parser')

# Take the first <table> element in the document and collect every <tr>
# row inside it (header row included).
table = soup.find('table')
table_rows = table.find_all('tr')

#### Converting to DF

In [298]:
# One list of cell texts per table row. A row without any <td> cells
# (e.g. a header row built from <th>) contributes an empty list, which pandas
# pads with missing values; the cleaning step below drops such rows.
cell_texts = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows
]
# NOTE: the label "Brought" holds the borough name; it is kept as-is because
# the following cells refer to the column by this exact label.
df = pd.DataFrame(cell_texts, columns=["PostalCode", "Brought", "Neighbourhood"])
df.head()

Unnamed: 0,PostalCode,Brought,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


#### Cleaning the DF into the required format

In [299]:
# Clean the scraped table and collapse it to one row per postal code.

# Drop rows containing missing values (the padded, cell-less row).
# Only defaults are used: passing `how` together with `thresh` is rejected by
# recent pandas. `.copy()` makes the frame independent so the column
# assignments below never write into a view.
df = df.dropna().copy()

# Remove the trailing newline that the HTML cell text carries.
df['Neighbourhood'] = df['Neighbourhood'].str.rstrip('\n')

# Where the neighbourhood is unassigned, fall back to the borough name.
# A boolean mask with `.loc` updates `df` itself; calling an in-place method
# on `df.Neighbourhood` is not guaranteed to write back to the frame.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Brought']

# Discard postal codes that have no borough at all.
df = df.loc[df['Brought'] != 'Not assigned']

# Join all neighbourhoods sharing a postal code/borough into one
# comma-separated string. Aggregating the single selected column keeps the
# header flat (PostalCode, Brought, Neighbourhood) rather than two-level.
df = (
    df.groupby(["PostalCode", "Brought"])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df.head()

Unnamed: 0,PostalCode,Brought,Neighbourhood
,,,
0.0,M1B,Scarborough,"Rouge, Malvern"
1.0,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2.0,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3.0,M1G,Scarborough,Woburn
4.0,M1H,Scarborough,Cedarbrae


In [300]:
# Show the (rows, columns) shape of `df` as a quick size check.
df.shape

(103, 3)

#### Getting geo data

In [301]:
# Download the coordinates CSV and save it as 'geodata.csv' in the working
# directory. The standard library is used instead of shelling out to `wget`,
# so the cell does not depend on that binary being installed.
with urllib.request.urlopen('http://cocl.us/Geospatial_data') as geo_response, \
        open('geodata.csv', 'wb') as geo_file:
    geo_file.write(geo_response.read())

In [303]:
# Load the downloaded coordinates file into a dataframe and preview it.
geo_csv = 'geodata.csv'
df_geo = pd.read_csv(geo_csv)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Joining with previous DF

In [305]:
# Attach latitude/longitude to every postal code.

# Work on a copy of `df` with a flat, single-level header. `df` may carry a
# two-level header from the aggregation step, and pandas warns about or
# refuses joins between frames whose column headers have different depths.
df_flat = df.copy()
df_flat.columns = ['PostalCode', 'Brought', 'Neighbourhood']

# Index-based left join: every row of `df` is kept and matched to the
# coordinates with the same postal code. Latitude and Longitude keep the
# names they have in `df_geo`, so no positional renaming is needed afterwards.
df_res = (
    df_flat.set_index('PostalCode')
    .join(df_geo.set_index('Postal Code'))
    .reset_index()
)
df_res.head()

Unnamed: 0,PostalCode,Brought,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
