# Segmenting and Clustering Neighborhoods in Toronto

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import requests
from bs4 import BeautifulSoup

## Download and Explore dataset


In [5]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup_html = BeautifulSoup(source, 'lxml')

In [6]:
# Take the first <table> on the page that carries the "wikitable" CSS class.
table_string = soup_html.find_all('table', class_='wikitable')[0]

In [8]:
# Parse the HTML table into a DataFrame.
pd_table = pd.read_html(str(table_string))[0]
# Whether read_html promotes the heading row to column labels depends on the
# pandas version and on how the page marks up that row. Promote the first row
# by hand only when that has not already happened; doing it unconditionally
# would silently discard the first real postal code when the header was parsed.
if 'Borough' not in pd_table.columns:
    pd_table.columns = pd_table.iloc[0]
    pd_table = pd_table[1:]

In [9]:
# Preview the first rows instead of rendering the whole frame.
pd_table.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"


In [11]:
# Keep only the postal codes that have a borough; original row labels are kept.
# .copy() gives an independent frame, so the assignment below is not a write
# into a slice of pd_table.
borough_assigned = pd_table[pd_table['Borough'] != 'Not assigned'].copy()

# A neighbourhood marked "Not assigned" takes the name of its borough.
# Series.replace(..., inplace=True) on an attribute-accessed column, with a whole
# Series as the replacement value, is not a dependable row-wise substitution,
# so the affected rows are selected with an explicit mask instead.
unnamed = borough_assigned['Neighborhood'] == 'Not assigned'
borough_assigned.loc[unnamed, 'Neighborhood'] = borough_assigned.loc[unnamed, 'Borough']

In [12]:
# Preview the first rows instead of rendering the whole frame.
borough_assigned.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


## Assign the borough name to neighbourhoods marked "Not assigned"

In [14]:
# A neighbourhood marked "Not assigned" takes the name of its borough.
# The rows are selected with an explicit mask and written through .loc, because
# Series.replace(..., inplace=True) on an attribute-accessed column with a Series
# as the replacement value is not a dependable row-wise substitution.
unnamed = borough_assigned['Neighborhood'] == 'Not assigned'
borough_assigned.loc[unnamed, 'Neighborhood'] = borough_assigned.loc[unnamed, 'Borough']

In [15]:
# Preview the first rows instead of rendering the whole frame.
borough_assigned.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [16]:
# Index.str.strip() returns a new Index rather than modifying the labels in
# place, so the result has to be assigned back for the cell to have any effect.
borough_assigned.columns = borough_assigned.columns.str.strip()
borough_assigned.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


## Combine neighbourhoods that share the same postal code area

In [17]:
# Combine neighbourhoods that share the same postal code area
def join_array(arr):
    """Concatenate the strings in ``arr`` into one comma-separated string.

    No space is inserted after the comma; an empty iterable yields ``""``.
    """
    separator = ","
    return separator.join(arr)

# One row per postal code: neighbourhood names are concatenated with join_array
# and the borough is the first non-null value in the group. 'first' is
# deterministic, whereas list(set(x))[0] depends on set iteration order and
# could pick a different borough between runs if a postal code ever listed
# more than one.
groupby_Postcode = borough_assigned.groupby('Postal Code').agg(
    {'Neighborhood': join_array, 'Borough': 'first'}
)

# Turn the 'Postal Code' group key back into an ordinary column.
Neighborhoods = groupby_Postcode.reset_index()
print(Neighborhoods.shape)
Neighborhoods.head()

(103, 3)


Unnamed: 0,Postal Code,Neighborhood,Borough
0,M1B,"Malvern, Rouge",Scarborough
1,M1C,"Rouge Hill, Port Union, Highland Creek",Scarborough
2,M1E,"Guildwood, Morningside, West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough


In [18]:
# Preview the first rows instead of rendering the whole frame.
Neighborhoods.head(10)

Unnamed: 0,Postal Code,Neighborhood,Borough
0,M1B,"Malvern, Rouge",Scarborough
1,M1C,"Rouge Hill, Port Union, Highland Creek",Scarborough
2,M1E,"Guildwood, Morningside, West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough
5,M1J,Scarborough Village,Scarborough
6,M1K,"Kennedy Park, Ionview, East Birchmount Park",Scarborough
7,M1L,"Golden Mile, Clairlea, Oakridge",Scarborough
8,M1M,"Cliffside, Cliffcrest, Scarborough Village West",Scarborough
9,M1N,"Birch Cliff, Cliffside West",Scarborough


## Adding geospatial data to the Neighborhoods DataFrame

In [19]:

# Add geospatial data: load the coordinates table and attach its columns to
# each postal code row.
df = pd.read_csv("http://cocl.us/Geospatial_data")
# Name the join key explicitly instead of relying on pd.merge's implicit
# "all shared column names" default. The former set_index() calls are gone:
# their return values were never assigned, so they changed nothing.
toronto_df = pd.merge(Neighborhoods, df, on="Postal Code")
toronto_df.head()

Unnamed: 0,Postal Code,Neighborhood,Borough,Latitude,Longitude
0,M1B,"Malvern, Rouge",Scarborough,43.806686,-79.194353
1,M1C,"Rouge Hill, Port Union, Highland Creek",Scarborough,43.784535,-79.160497
2,M1E,"Guildwood, Morningside, West Hill",Scarborough,43.763573,-79.188711
3,M1G,Woburn,Scarborough,43.770992,-79.216917
4,M1H,Cedarbrae,Scarborough,43.773136,-79.239476


In [20]:
# Report the number of distinct boroughs and the number of rows in toronto_df.
print(
    'The dataframe has {} boroughs and {} neighborhoods.'.format(
        toronto_df['Borough'].unique().size, len(toronto_df)
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


Use the geopy library to get the latitude and longitude values of Toronto.

In [21]:
# Preview the first rows instead of rendering the whole frame.
toronto_df.head(10)

Unnamed: 0,Postal Code,Neighborhood,Borough,Latitude,Longitude
0,M1B,"Malvern, Rouge",Scarborough,43.806686,-79.194353
1,M1C,"Rouge Hill, Port Union, Highland Creek",Scarborough,43.784535,-79.160497
2,M1E,"Guildwood, Morningside, West Hill",Scarborough,43.763573,-79.188711
3,M1G,Woburn,Scarborough,43.770992,-79.216917
4,M1H,Cedarbrae,Scarborough,43.773136,-79.239476
5,M1J,Scarborough Village,Scarborough,43.744734,-79.239476
6,M1K,"Kennedy Park, Ionview, East Birchmount Park",Scarborough,43.727929,-79.262029
7,M1L,"Golden Mile, Clairlea, Oakridge",Scarborough,43.711112,-79.284577
8,M1M,"Cliffside, Cliffcrest, Scarborough Village West",Scarborough,43.716316,-79.239476
9,M1N,"Birch Cliff, Cliffside West",Scarborough,43.692657,-79.264848


In [1]:
import geopy
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
address = 'Toronto, ON, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(latitude, longitude))

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/win-64::anaconda==5.3.0=py37_0
  - defaults/win-64::astropy==3.0.4=py37hfa6e2cd_0
  - defaults/win-64::bkcharts==0.2=py37_0
  - defaults/win-64::blaze==0.11.3=py37_0
  - defaults/win-64::bokeh==0.13.0=py37_0
  - defaults/win-64::bottleneck==1.2.1=py37h452e1ab_1
  - defaults/win-64::dask==0.19.1=py37_0
  - defaults/win-64::datashape==0.5.4=py37_1
  - defaults/win-64::h5py==2.8.0=py37h3bdd7fb_2
  - defaults/win-64::imageio==2.4.1=py37_0
  - defaults/win-64::matplotlib==2.2.3=py37hd159220_0
  - defaults/win-64::mkl-service==1.1.2=py37hb217b18_5
  - defaults/win-64::mkl_fft==1.0.4=py37h1e22a9b_1
  - defaults/win-64::mkl_random==1.0.1=py37h77b88f5_1
  - defaults/win-64::numba==0.39.0=py37h830ac7b_0
  - defaults/win-64::numexpr==2.6.8=py37h9ef55f4_0
  - d


[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found
[WinError 127] The specified procedure could not be found



The geograpical coordinate of Toronto, ON, Canada are 43.6534817, -79.3839347.
