In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_html = requests.get(url).text
soup = BeautifulSoup(wiki_html, 'html.parser')

data = []
for tr in soup.tbody.find_all('tr'):
    data.append([ td.get_text().strip() for td in tr.find_all('td')])

In [4]:
df = pd.DataFrame(data,columns=['PostalCode','Borough','Neighborhood2'])

In [5]:
# "Not assigned" in Borough column
indexNames = df[(df['Borough'] == "Not assigned")].index

# Drop rows that have "Not assigned" Borough
df.drop(indexNames,inplace=True)

# Drop the first row
df.dropna(inplace=True)

In [6]:
# Combine multiple rows using PostalCode and Borough
df=df.groupby(['PostalCode','Borough'])['Neighborhood2'].apply(', '.join).reset_index()

In [7]:
# Replace "Not assigned" Neighbourhoods with Borough
def custom_fx(data):
    if data['Neighborhood2']=='Not assigned':
        var=data['Borough']
    else:
        var=data['Neighborhood2']
    return var

df['Neighborhood']=df.apply(custom_fx,axis='columns')

print("There are {} rows that have 'Not assigned' in Neighborhood column in the dataframe".format(
    len(df[df['Neighborhood']=='Not assigned'])
)
     )

# Delete Neighborhood2 column
df.drop(columns='Neighborhood2')

There are 0 rows that have 'Not assigned' in Neighborhood column in the dataframe


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


In [10]:
lonlat=pd.read_csv('http://cocl.us/Geospatial_data')
todf=df

In [9]:
print("Column names of lonlat dataframe are: {}, {}, and {}.".format(lonlat.columns[0],lonlat.columns[1],lonlat.columns[2]))

Column names of lonlat dataframe are: Postal Code, Latitude, and Longitude.


In [11]:
print("Column names of todf dataframe are: {}, {}, and {}.".format(todf.columns[0],todf.columns[1],todf.columns[2]))

Column names of todf dataframe are: PostalCode, Borough, and Neighborhood2.


In [12]:
lonlat.rename(columns={'Postal Code':'PostalCode'},inplace=True)

In [13]:
todf_geo=pd.merge(todf,lonlat,how='left',on='PostalCode')

In [17]:
todf_geo

Unnamed: 0,PostalCode,Borough,Neighborhood2,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,Birch Cliff / Cliffside West,43.692657,-79.264848


In [22]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import geopy # install it in Anaconda Prompt
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium

from IPython.display import Image # will upload screenshots later

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 15.4MB/s eta 0:00:01
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [23]:
import folium

In [25]:
tonbhd = todf_geo
to_data = tonbhd[tonbhd.Borough.str.contains('Toronto')]
to_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood2,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,The Danforth West / Riverdale,The Danforth West / Riverdale,43.679557,-79.352188
42,M4L,East Toronto,India Bazaar / The Beaches West,India Bazaar / The Beaches West,43.668999,-79.315572
43,M4M,East Toronto,Studio District,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,Lawrence Park,43.72802,-79.38879


In [27]:
to_data.shape

(39, 6)

In [29]:
address = 'Toronto'
geolocator = Nominatim(user_agent="trt_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Toronto coordinates are {}, {}.'.format(latitude, longitude))

Toronto coordinates are 43.6534817, -79.3839347.


In [30]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
for lat, lng, label in zip(to_data['Latitude'], to_data['Longitude'], to_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto