# Segmenting and Clustering Neighbourhoods in Toronto

In [90]:
import pandas as pd

In [91]:
d=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [92]:
type(d)

list

In [93]:
len(d)

3

### creating Dataframe containing the first table in the wikipedia page

In [136]:
df=d[0]
df.columns=['PostalCode','Borough','Neighbourhood']
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 3 columns):
PostalCode       288 non-null object
Borough          288 non-null object
Neighbourhood    288 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


### Ignoring cells with a borough that is Not assigned

In [141]:
df = df[df.Borough != 'Not assigned']

### finding Neighbourhoods with value 'Not assigned' and replacing them with corresponding value 'Borough'

In [139]:
df=df.reset_index(drop=True)
idx = df[df['Neighbourhood']== "Not assigned"].index.values.astype(int)
idx

array([6])

In [142]:
df.replace(to_replace =["Not assigned"], value = df.iloc[6,1]) 

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211 entries, 0 to 210
Data columns (total 3 columns):
PostalCode       211 non-null object
Borough          211 non-null object
Neighbourhood    211 non-null object
dtypes: object(3)
memory usage: 16.6+ KB


### Segmenting and merging neighbourhoods with same postalcode

In [150]:
df1=df[['PostalCode', 'Borough']]
df2=df[['PostalCode', 'Neighbourhood']]

In [151]:
df1=df1.drop_duplicates()
df1=df1.reset_index(drop=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 2 columns):
PostalCode    103 non-null object
Borough       103 non-null object
dtypes: object(2)
memory usage: 1.7+ KB


In [152]:
df2=df2.groupby(['PostalCode'])['Neighbourhood'].apply(','.join)
df2.to_frame()
df2=df2.reset_index()
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 2 columns):
PostalCode       103 non-null object
Neighbourhood    103 non-null object
dtypes: object(2)
memory usage: 1.7+ KB


In [155]:
merged_inner = pd.merge(left=df1,right=df2, left_on='PostalCode', right_on='PostalCode')
merged_inner

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [157]:
merged_inner.shape

(103, 3)

In [158]:
data=pd.read_csv('http://cocl.us/Geospatial_data')

In [159]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    103 non-null object
Latitude       103 non-null float64
Longitude      103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [160]:
data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [162]:
data.columns=['PostalCode','Latitude','Longitude']

In [163]:
new_merged_inner = pd.merge(left=merged_inner,right=data, left_on='PostalCode', right_on='PostalCode')
new_merged_inner

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Not assigned,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [164]:
new_merged_inner.shape

(103, 5)