# Part 1 - Scraping Toronto Neighborhoods and Transforming Them into a Dataframe

### Install packages

In [1]:
!pip install bs4
!pip install requests



### Import Necessary Libraries

In [2]:
from bs4 import BeautifulSoup
import requests  
import pandas as pd

### Scrape Wikipedia Site

In [3]:
# The URL carries an `oldid` parameter, i.e. it points at one specific revision of the page.
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969"

# Fail fast on a hung connection or an HTTP error instead of silently parsing
# whatever body an error response happens to contain.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# Parse the downloaded HTML with the html5lib parser.
soup = BeautifulSoup(data, "html5lib")

### Input Data into Dataframe

In [4]:
# Work on the first <table> element found in the parsed page.
table = soup.find('table')

# Collect one dict per data row and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop copies
# the whole frame on every iteration.
records = []
for row in table.tbody.find_all('tr'):
    col = row.find_all('td')
    if col:  # rows that contain no <td> cells are skipped
        records.append({
            "Postal Code": col[0].text.strip(),
            "Borough": col[1].text.strip(),
            "Neighbourhood": col[2].text.strip(),
        })

# Passing `columns` keeps the three headers (and their order) even if no rows were found.
toronto_data = pd.DataFrame(records, columns=["Postal Code", "Borough", "Neighbourhood"])

### Remove Rows with Boroughs listed as Not assigned, and reset index

In [5]:
# Keep only the rows whose Borough text does not contain "Not assigned",
# then renumber the index from 0 without keeping the old labels.
borough_unassigned = toronto_data.Borough.str.contains("Not assigned")
toronto_data = toronto_data[~borough_unassigned].reset_index(drop=True)

In [6]:
# Display the postal code / borough / neighbourhood table.
toronto_data

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
# (rows, columns) of the table.
toronto_data.shape

(103, 3)

# Part 2 - Using Geocoder to get Latitude and Longitude of Neighborhoods

### Import Geocoder

In [8]:
!pip install geocoder
import geocoder



### Retrieve Lat and Long for Neighborhoods

In [9]:
# Read the coordinate lookup table (a CSV file served from this URL).
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')

# Attach the coordinates to the neighbourhood table. No key is given, so pandas
# joins on whichever column names the two frames have in common.
toronto_data = pd.merge(toronto_data, geo_df)
toronto_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Part 3 - Clustering Neighborhoods

### Install and import Folium and Geopy

In [10]:
# Install folium (pinned to 0.5.0) and geopy from the conda-forge channel;
# --yes answers the confirmation prompt automatically.
!conda install -c conda-forge folium=0.5.0 --yes
!conda install -c conda-forge geopy --yes
# folium builds the maps below; Nominatim is the geopy geocoder used to look up
# the coordinates of Toronto.
import folium
from geopy.geocoders import Nominatim

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



### Sort Toronto Data by Boroughs

In [11]:
# Order the rows by borough name; each row keeps its original index label.
toronto_borough = toronto_data.sort_values('Borough', ascending=True)
toronto_borough

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
86,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
79,M4S,Central Toronto,Davisville,43.704324,-79.388790
83,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.383160
68,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
74,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.672710,-79.405678
...,...,...,...,...,...
69,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
56,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",43.691116,-79.476013
64,M9N,York,Weston,43.706876,-79.518188
21,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512


In [12]:
# Report the number of distinct Borough values and the number of rows.
borough_count = len(toronto_borough['Borough'].unique())
row_count = len(toronto_borough.index)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.


### Build Folium Map for Neighbourhoods in Toronto

In [13]:
address = 'Toronto, Ontario'

# Look the address up through the Nominatim geocoding service.
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# These two values are the centre point for the maps built later.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [14]:
# Base map centred on the geocoded coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per row; the popup text is "<neighbourhood>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in zip(*(toronto_borough[name] for name in marker_columns)):
    popup_text = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a marker
        # option - confirm whether it has any effect here.
        parse_html=False)
    marker.add_to(map_toronto)


In [15]:
# Render the neighbourhood map as the cell output.
map_toronto

### Import libraries for clustering

In [16]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [17]:
# Display the borough-sorted table again before aggregating it.
toronto_borough

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
86,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
79,M4S,Central Toronto,Davisville,43.704324,-79.388790
83,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.383160
68,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
74,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.672710,-79.405678
...,...,...,...,...,...
69,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
56,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",43.691116,-79.476013
64,M9N,York,Weston,43.706876,-79.518188
21,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512


### Group by borough and take the mean latitude and longitude

In [18]:
# Average the coordinates of all rows in each borough, giving one point per borough.
# The numeric columns are selected explicitly: from pandas 2.0 on, calling .mean()
# on a group that still contains the text columns ('Postal Code', 'Neighbourhood')
# raises a TypeError instead of silently dropping them.
toronto_grouped = (
    toronto_borough
    .groupby('Borough')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654597,-79.383972
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.660043,-79.542074
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Scarborough,43.766229,-79.249085
8,Toronto/York,43.673185,-79.487262
9,West Toronto,43.652653,-79.44929


In [19]:
# (rows, columns) of the per-borough mean table.
toronto_grouped.shape

(11, 3)

In [20]:
# Number of clusters to fit.
kclusters = 5

# Cluster on the two coordinate columns only. They are selected by name rather than
# by dropping 'Borough' with a positional axis argument (which pandas 2.0 no longer
# accepts); this also keeps any column added to toronto_grouped afterwards, such as
# cluster labels, out of the feature matrix if this cell is re-run.
toronto_clustering = toronto_grouped[['Latitude', 'Longitude']]

# random_state fixes the initialisation; n_init is spelled out so the number of
# restarts does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_clustering)

# Show the first ten cluster labels, one per row of toronto_grouped.
kmeans.labels_[0:10]

array([4, 1, 1, 1, 0, 0, 4, 3, 2, 2])

In [21]:
# Display the feature table that was passed to KMeans.
toronto_clustering

Unnamed: 0,Latitude,Longitude
0,43.70198,-79.398954
1,43.654597,-79.383972
2,43.669436,-79.324654
3,43.700303,-79.335851
4,43.660043,-79.542074
5,43.636966,-79.615819
6,43.750727,-79.429338
7,43.766229,-79.249085
8,43.673185,-79.487262
9,43.652653,-79.44929


### Make copy of original dataframe without latitude and longitude

In [22]:
# Independent copy of the descriptive columns only (coordinates left out).
descriptive_columns = ['Postal Code', 'Borough', 'Neighbourhood']
toronto_copy = toronto_borough.loc[:, descriptive_columns].copy()
toronto_copy

Unnamed: 0,Postal Code,Borough,Neighbourhood
86,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest..."
79,M4S,Central Toronto,Davisville
83,M4T,Central Toronto,"Moore Park, Summerhill East"
68,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park"
74,M5R,Central Toronto,"The Annex, North Midtown, Yorkville"
...,...,...,...
69,M6P,West Toronto,"High Park, The Junction South"
56,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
64,M9N,York,Weston
21,M6E,York,Caledonia-Fairbanks


### Merge cluster dataframe to copied dataframe

In [23]:
# Add the cluster labels as the first column of the per-borough table.
# Any 'Cluster Labels' column left over from an earlier run is dropped first, so
# re-running this cell does not fail with "cannot insert ..., already exists".
toronto_grouped = toronto_grouped.drop(columns='Cluster Labels', errors='ignore')
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

# Join toronto_copy with toronto_grouped on 'Borough', so every neighbourhood row
# receives its borough's cluster label and the borough's mean latitude/longitude.
toronto_merged = toronto_copy.join(toronto_grouped.set_index('Borough'), on='Borough')

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Cluster Labels,Latitude,Longitude
86,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",4,43.70198,-79.398954
79,M4S,Central Toronto,Davisville,4,43.70198,-79.398954
83,M4T,Central Toronto,"Moore Park, Summerhill East",4,43.70198,-79.398954
68,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",4,43.70198,-79.398954
74,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",4,43.70198,-79.398954


### Create map for clustered boroughs

In [24]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced colour from the rainbow
# colormap per cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
# The coordinates in toronto_merged come from the per-borough table, so all rows of
# one borough are drawn at the same point.
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index the palette with the label itself: rainbow[cluster - 1] only worked for
    # label 0 by wrapping around to the last colour via a negative index.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters
