# Segmenting and Clustering Neighborhoods in Toronto

### Load Libraries

In [16]:
conda install -c anaconda beautifulsoup4 

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [17]:
import numpy as np # library for handling data in  vectorized manner
import requests
from bs4 import BeautifulSoup

import pandas as pd # library for data analsysis
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library
print('Libraries imported.')

Libraries imported.


### Scrap data from wikipedia page & Create dataframe

In [18]:
wiki_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

soup = BeautifulSoup(wiki_url.content, 'html.parser')

### Read table

In [19]:
table = soup.find('tbody')
rows = table.select('tr')
row = [r.get_text() for r in rows]

###  Clean & create the dataframe

In [20]:
df = pd.DataFrame(row)
df1 = df[0].str.split('\n', expand=True)
df2 = df1.rename(columns=df1.iloc[0])
df3 = df2.drop(df2.index[0])
df3.head()

Unnamed: 0,Unnamed: 1,Postal code,Unnamed: 3,Borough,Unnamed: 5,Neighborhood,Unnamed: 7
1,,M1A,,Not assigned,,,
2,,M2A,,Not assigned,,,
3,,M3A,,North York,,Parkwoods,
4,,M4A,,North York,,Victoria Village,
5,,M5A,,Downtown Toronto,,Regent Park / Harbourfront,


### Remove "Not assigned" and then Aggregate

In [21]:
df4 = df3[df3.Borough != 'Not assigned']

df5 = df4.groupby(['Postal code', 'Borough'], sort = False).agg(','.join)
df5.reset_index(inplace = True)
df6 = df5.replace("Not assigned", "Queen's Park")
df6.head()

Unnamed: 0,Postal code,Borough,Unnamed: 3,Unnamed: 4,Unnamed: 5,Neighborhood,Unnamed: 7
0,M3A,North York,,,,Parkwoods,
1,M4A,North York,,,,Victoria Village,
2,M5A,Downtown Toronto,,,,Regent Park / Harbourfront,
3,M6A,North York,,,,Lawrence Manor / Lawrence Heights,
4,M7A,Downtown Toronto,,,,Queen's Park / Ontario Provincial Government,


In [22]:
df6.shape

(103, 7)

## 2-Use the Geocoder package or the csv file to create dataframe with longitude and latitude values

### Read csv file of Geospatial_data

In [23]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.columns = ['Postal code', 'Latitude', 'Longitude']
df_geo.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Create a new dataframe by merge first & second dataframe

In [24]:
df_pc = pd.merge(df6, df_geo,on=['Postal code'], how='inner')
df_pc.head()

Unnamed: 0,Postal code,Borough,Unnamed: 3,Unnamed: 4,Unnamed: 5,Neighborhood,Unnamed: 7,Latitude,Longitude
0,M3A,North York,,,,Parkwoods,,43.753259,-79.329656
1,M4A,North York,,,,Victoria Village,,43.725882,-79.315572
2,M5A,Downtown Toronto,,,,Regent Park / Harbourfront,,43.65426,-79.360636
3,M6A,North York,,,,Lawrence Manor / Lawrence Heights,,43.718518,-79.464763
4,M7A,Downtown Toronto,,,,Queen's Park / Ontario Provincial Government,,43.662301,-79.389494


## 3- Explore and cluster the neighborhoods in Toronto

### Show how many Borough & Neighbourhood in the dataframe

In [25]:
print('The dataframe has {} Borough and {} Neighbourhood.'
      .format(len(df_pc['Borough'].unique()),df_pc.shape[0]))

The dataframe has 10 Borough and 103 Neighbourhood.


In [26]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of the City of Toronto are 43.6534817, -79.3839347.


In [35]:
# create map of TORONTO using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_pc['Latitude'], df_pc['Longitude'], df_pc['Borough'], df_pc['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto