In [2]:
#get required libraries

from io import StringIO

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from IPython.display import display_html

#!conda install -c conda-forge folium=0.5.0 --yes
!pip install folium
import folium

from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import os
from geopy.geocoders import Nominatim 

print('Libraries import done')

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 2.3 MB/s  eta 0:00:01
[?25hCollecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Libraries import done


In [6]:
# Scrape the Wikipedia page that lists Canadian postal codes starting with "M".
page_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error
# page be parsed as if it were the article.
response = requests.get(page_url, timeout=30)
response.raise_for_status()
source = response.text

# NOTE(review): 'xml' selects the XML parser although the page is HTML. It is
# kept unchanged because the following cells rely on its output — confirm
# whether 'html.parser' or 'lxml' would be the better choice.
soup = BeautifulSoup(source, 'xml')

# soup.table is the first <table> element found, or None when there is none.
# Without this check str(None) would hand the text 'None' to the next cell.
if soup.table is None:
    raise ValueError('No <table> element found at {}'.format(page_url))

# Serialise that table back to markup text for pandas to parse.
tab = str(soup.table)
# To preview the raw table while debugging: display_html(tab, raw=True)

In [7]:
# Convert the scraped HTML table to a pandas DataFrame so the data can be
# cleaned and processed.
# pd.read_html returns a list of DataFrames (one per table it finds); the
# first one is used. The markup is wrapped in StringIO because passing a
# literal HTML string to read_html is deprecated in recent pandas releases,
# while file-like input is accepted by old and new versions alike.
dfs = pd.read_html(StringIO(tab))
df = dfs[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
# Keep only the rows whose borough has actually been assigned.
df1 = df.loc[df['Borough'] != 'Not assigned']

# Collapse rows that share a postal code and borough into a single row,
# joining the remaining text column(s) with ', '. sort=False keeps the groups
# in first-seen order, and reset_index() turns the group keys back into
# ordinary columns.
df2 = (
    df1.groupby(['Postal Code', 'Borough'], sort=False)
       .agg(', '.join)
       .reset_index()
)

# Where the neighbourhood is still 'Not assigned', fall back to the borough name.
df2['Neighbourhood'] = df2['Neighbourhood'].mask(
    df2['Neighbourhood'] == 'Not assigned',
    df2['Borough'],
)

df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# Display the (rows, columns) shape of the cleaned dataframe df2
df2.shape

(103, 3)

In [22]:
# Load the latitude/longitude lookup table from the provided CSV link.
la_lo = pd.read_csv('https://cocl.us/Geospatial_data')

# Attach the coordinates to each postal-code row by joining the two frames on
# 'Postal Code'. how='inner' is the pandas default, spelled out here so it is
# visible that rows without a match in both frames are dropped.
df_canada_geo = pd.merge(df2, la_lo, how='inner', on='Postal Code')
df_canada_geo.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [24]:
# Count the neighbourhood rows recorded for each borough.
df_canada_geo.groupby('Borough')['Neighbourhood'].count()

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Scarborough         17
West Toronto         6
York                 5
Name: Neighbourhood, dtype: int64

In [35]:
# Keep only the rows whose borough name contains the literal text 'Toronto'
# (regex=False makes this a plain substring match).
df_toronto_geo = df_canada_geo.loc[
    lambda frame: frame['Borough'].str.contains('Toronto', regex=False)
]

# Count the neighbourhood rows for each remaining borough.
df_toronto_geo.groupby('Borough')['Neighbourhood'].count()

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
West Toronto         6
Name: Neighbourhood, dtype: int64

In [42]:
# Look up the latitude and longitude of Toronto with the Nominatim geocoder.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode('Toronto')

# geocode() returns None when nothing matches; fail with a clear message here
# rather than with an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Nominatim returned no result for 'Toronto'")

lat_toronto = location.latitude
lon_toronto = location.longitude

# Print the variables assigned above. The names `lat` and `lon` are not
# defined by any cell shown, so printing them fails on a fresh kernel.
print(lat_toronto)
print(lon_toronto)

43.6534817
-79.3839347


In [43]:
# Draw an interactive map centred on Toronto with one circle marker per row of
# df_toronto_geo; each marker's popup reads "<neighbourhood>, <borough>".
map_toronto = folium.Map(location=[lat_toronto, lon_toronto], zoom_start=10)

marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for la, ln, borough, neighbourhood in df_toronto_geo[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [la, ln],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a
        # CircleMarker one and is presumably ignored here — confirm, then drop.
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell, so the notebook renders the map.
map_toronto