# Problem 3
#### In part 2 of the *Clustering Neighbourhoods in Toronto* problem, a dataframe was created to display the latitude and longitude data for all the postal codes and neighbourhoods in Toronto. In this part, the neighbourhoods in Toronto will be explored and clustered.

In [1]:
#Import necessary libraries and modules
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
!pip install pgeocode
import pgeocode
print('Libraries are imported.')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Libraries are imported.


### 1.a. Scraping location data for Toronto from Wikipedia

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes beginning with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error page
# be parsed as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

In [3]:
# Parse the downloaded HTML using the html5lib parser.
soup = BeautifulSoup(markup=source, features='html5lib')

### 1.b. Creating and populating a dataframe using the scraped data

In [4]:
# Locate the table element within the scraped page.
table = soup.find(name='table')

In [5]:
#Create and populate the dataframe
# Each <td> cell is read as follows: the postal code is the first three characters
# of its <p> text, and its <span> text is split at '(' into the borough (before)
# and the neighbourhood list (after).
table_contents = []
for row in table.find_all('td'):  # find_all is the supported name; findAll is a deprecated alias
    span_text = row.span.text
    if span_text == 'Not assigned':
        continue  # postal codes without an assigned borough are left out
    span_parts = span_text.split('(')
    cell = {
        'PostalCode': row.p.text[:3],
        'Borough': span_parts[0],
        # strip the closing parenthesis and turn " /" separators into commas
        'Neighborhood': span_parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' '),
    }
    table_contents.append(cell)

location_df = pd.DataFrame(table_contents)
# A few borough values come out with extra text fused onto the borough name;
# map those exact strings to clean labels.
location_df['Borough'] = location_df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

In [6]:
# Preview the first rows of the cleaned location dataframe.
location_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [7]:
# Dimensions of the final processed dataframe, as (number of rows, number of columns)
location_df.shape

(103, 3)

### 2. Extracting geographical coordinates (latitude/longitude) data for all postal codes in all neighbourhoods of Toronto

In [8]:
# fetch geographical coordinates for all postal codes
geolocator = pgeocode.Nominatim('ca')
postal_codes = location_df['PostalCode'].tolist()
latitudes = []
longitudes = []
for postal_code in postal_codes:
    geo_record = geolocator.query_postal_code(postal_code)
    # Record exactly one latitude and one longitude per postal code (NaN when the
    # field is missing). Skipping a code would leave these lists shorter than
    # postal_codes and shift every later coordinate onto the wrong postal code.
    latitudes.append(geo_record.get('latitude', np.nan))
    longitudes.append(geo_record.get('longitude', np.nan))

In [9]:
# Assemble the postal codes and their coordinates into a single dataframe,
# one column per list.
coordinate_columns = {
    'PostalCode': postal_codes,
    'Latitudes': latitudes,
    'Longitudes': longitudes,
}
geocode_df = pd.DataFrame(data=coordinate_columns)
geocode_df

Unnamed: 0,PostalCode,Latitudes,Longitudes
0,M3A,43.7545,-79.3300
1,M4A,43.7276,-79.3148
2,M5A,43.6555,-79.3626
3,M6A,43.7223,-79.4504
4,M7A,43.6641,-79.3889
...,...,...,...
98,M8X,43.6518,-79.5076
99,M4Y,43.6656,-79.3830
100,M7Y,43.7804,-79.2505
101,M8Y,43.6325,-79.4939


In [10]:
# Join the coordinates onto the location data by postal code, then keep the
# columns in a reader-friendly order.
column_order = ['PostalCode', 'Borough', 'Neighborhood', 'Latitudes', 'Longitudes']
merged_df = geocode_df.merge(location_df, on='PostalCode')
geospatial_data = merged_df[column_order]
geospatial_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitudes,Longitudes
0,M3A,North York,Parkwoods,43.7545,-79.3300
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6518,-79.5076
99,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830
100,M7Y,East Toronto Business,Enclave of M4L,43.7804,-79.2505
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.6325,-79.4939


### 3. Exploring, clustering, and visualizing the Canadian neighbourhoods whose Borough name contains *Toronto*

In [11]:
# import necessary libraries and modules for exploring, clustering, and visualizing neighbourhoods
from sklearn.cluster import KMeans
!pip install folium
import folium 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
print('Libraries are imported.')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Libraries are imported.


In [12]:
# Keep only the rows whose Borough contains the literal text "Toronto".
is_toronto_borough = geospatial_data['Borough'].str.contains('Toronto', regex=False)
toronto_data = geospatial_data[is_toronto_borough]
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitudes,Longitudes
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
15,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
19,M4E,East Toronto,The Beaches,43.6784,-79.2941
20,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754
24,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386
25,M6G,Downtown Toronto,Christie,43.6683,-79.4205
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6496,-79.3833
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.6655,-79.4378
35,M4J,East York/East Toronto,The Danforth East,43.6872,-79.3368


In [13]:
# fetch geographical coordinates for the city of Toronto
address = 'Toronto, CA'

# Note: this rebinds `geolocator`, which previously held the pgeocode lookup object.
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
# geocode() gives back None when no match is found; stop with a clear message
# rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [14]:
# plot all the neighbourhoods in the toronto_data dataframe on a map, using Folium
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row, with a "<neighbourhood>, <borough>" popup.
for place in toronto_data.itertuples(index=False):
    popup_text = f'{place.Neighborhood}, {place.Borough}'
    marker = folium.CircleMarker(
        [place.Latitudes, place.Longitudes],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a marker
        # option; kept as-is to preserve behaviour - confirm whether it is needed.
        parse_html=False,
    )
    marker.add_to(map_toronto)
map_toronto

In [15]:
# cluster neighbourhoods in Toronto using K-Means
k = 5
# Select the coordinate columns explicitly. The positional-axis form
# drop(labels, 1) is rejected by pandas 2.x, and an explicit selection keeps an
# already-added 'Cluster Labels' column out of the features when this cell is re-run.
toronto_clusters = toronto_data[['Latitudes', 'Longitudes']]
# n_init is pinned because scikit-learn's default for it changed between releases,
# which would otherwise make the labels depend on the installed version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clusters)
# assign() appends the column on the first run and overwrites it on later runs;
# insert() would raise ValueError once the column already exists.
toronto_data = toronto_data.assign(**{'Cluster Labels': kmeans.labels_})
kmeans.labels_

array([0, 0, 0, 1, 0, 0, 2, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 3, 3, 3, 3,
       2, 3, 0, 2, 3, 0, 2, 3, 0, 3, 0, 0, 0, 0, 0, 0, 4], dtype=int32)

In [16]:
# Display the toronto_data dataframe, which now carries the 'Cluster Labels' column
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitudes,Longitudes,Cluster Labels
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,0
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783,0
15,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756,0
19,M4E,East Toronto,The Beaches,43.6784,-79.2941,1
20,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754,0
24,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386,0
25,M6G,Downtown Toronto,Christie,43.6683,-79.4205,2
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6496,-79.3833,0
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.6655,-79.4378,2
35,M4J,East York/East Toronto,The Danforth East,43.6872,-79.3368,1


In [17]:
# plot the clustered neighbourhoods in the toronto_data dataframe on a map, using Folium
clusters_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: k evenly spaced colours from the rainbow
# colormap, converted to hex strings for Folium
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(toronto_data['Latitudes'], toronto_data['Longitudes'], toronto_data['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # K-Means labels run from 0 to k-1, so they index the palette directly.
    # The previous cluster-1 offset sent cluster 0 to the last colour through
    # negative indexing, shifting every cluster's colour by one.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(clusters_map)

clusters_map