In [2]:
from bs4 import BeautifulSoup
import numpy as np # library to handle data in a vectorized manner
import pandas as pd
import requests

<h1>Part 1</h1>

In [47]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
rs = requests.get(url, timeout=30)
print(rs.status_code)
# Stop here on a 4xx/5xx answer instead of parsing an error page further down.
rs.raise_for_status()

200


In [48]:
# Parse the raw response body with the lxml parser.
soup = BeautifulSoup(rs.content, features="lxml")

In [49]:
table = soup.find('table', {"class": "wikitable sortable"})
trs = table.find_all('tr')

# Collect the <td> cells of every table row; rows without any <td> are skipped.
cell_rows = [cells for cells in (tr.find_all('td') for tr in trs) if cells]

# Three parallel lists: first, second and third cell of each kept row, whitespace-trimmed.
postcode = [cells[0].text.strip() for cells in cell_rows]
borough = [cells[1].text.strip() for cells in cell_rows]
neighbourhood = [cells[2].text.strip() for cells in cell_rows]


<b>The dataframe will consist of three columns: PostalCode, Borough, and Neighbourhood</b>

In [50]:
# Build the frame directly from the three parallel lists, one list per named column.
dataFrame = pd.DataFrame({
    'PostalCode': postcode,
    'Borough': borough,
    'Neighbourhood': neighbourhood,
})
dataFrame.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


<b>Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.</b>

In [51]:
# Keep only rows whose borough is assigned, then renumber the index from zero.
has_borough = dataFrame["Borough"].ne("Not assigned")
dataFrame2 = dataFrame.loc[has_borough].reset_index(drop=True)
dataFrame2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


<b>Group rows that share the same Postal Code, joining their neighbourhoods with commas</b>

In [52]:
# One row per (PostalCode, Borough) pair; the neighbourhood names belonging to a
# pair are concatenated into a single comma-separated string.
dataFrame3 = (
    dataFrame2
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg({"Neighbourhood": lambda names: ", ".join(names)})
)
dataFrame3.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<b>If a row's Neighbourhood is "Not assigned", set it to the same value as its Borough</b>

In [53]:
# Work on a copy so dataFrame3 keeps the values its own cell displayed.
dataFrame4 = dataFrame3.copy()

# Rows from iterrows() may be copies, so assigning to them is not guaranteed to
# reach the frame. A boolean mask with .loc writes to the frame itself.
unnamed = dataFrame4["Neighbourhood"] == "Not assigned"
dataFrame4.loc[unnamed, "Neighbourhood"] = dataFrame4.loc[unnamed, "Borough"]

dataFrame4.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<b>Number of rows</b>

In [54]:
# (number of rows, number of columns) of dataFrame4
dataFrame4.shape

(103, 3)

<h1>Part 2</h1>

<b>Load Geospatial coordinates data</b>

In [55]:
# Download the CSV of coordinates; its columns are previewed by head() below.
# NOTE: this rebinds `url`, which earlier held the Wikipedia page address.
url = "https://cocl.us/Geospatial_data"
coordinates = pd.read_csv(url)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [65]:
# Give the key column the same name as in the scraped frame so both can be joined on it.
# rename() returns a new frame; rebinding the name avoids mutating in place the
# object that the previous cell displayed.
coordinates = coordinates.rename(columns={"Postal Code": "PostalCode"})
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<b>Merge the two tables</b>

In [67]:
# Left join on PostalCode: every row of dataFrame4 is kept and gains the
# coordinate columns of the matching row in `coordinates`.
dataFrame5 = pd.merge(dataFrame4, coordinates, how="left", on="PostalCode")
dataFrame5.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


<b> Show the rows for a list of selected postal codes to check the result </b>

In [77]:
column_names = ["PostalCode", "Borough", "Neighbourhood", "Latitude", "Longitude"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Look up all spot-check codes with a single inner merge rather than growing a
# frame row by row. An inner merge preserves the order of the left keys, so the
# rows come out in test_list order; codes that are absent from dataFrame5 simply
# produce no row.
test_codes = pd.DataFrame({"PostalCode": test_list})
test_df = test_codes.merge(dataFrame5, on="PostalCode", how="inner")[column_names]

test_df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


<h1> Part 3 </h1>

In [85]:
pip install folium

Collecting folium
  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.1
Note: you may need to restart the kernel to use updated packages.


In [86]:
from geopy.geocoders import Nominatim
import folium

In [82]:
address = 'Toronto'

geolocator = Nominatim(user_agent="diennh")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message rather
# than failing with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('Toronto location: {}, {}.'.format(latitude, longitude))

Toronto location: 43.653963, -79.387207.


In [90]:
torontoFrame = dataFrame5
# create the base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of torontoFrame.
# The loop names are deliberately distinct from the notebook-level `borough`
# list so that running this cell does not overwrite it.
for lat, lng, borough_name, neighbourhood_name in zip(torontoFrame['Latitude'],
                                                      torontoFrame['Longitude'],
                                                      torontoFrame['Borough'],
                                                      torontoFrame['Neighbourhood']):
    # Popup text shown when the marker is clicked: "<neighbourhood(s)>, <borough>".
    label = '{}, {}'.format(neighbourhood_name, borough_name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

<b>Only boroughs that contain the word Toronto</b>

In [91]:
# Distinct borough names present in the frame.
borough_names = torontoFrame["Borough"].unique().tolist()

# Case-insensitive match on the word "toronto".
borough_with_toronto = [name for name in borough_names if "toronto" in name.lower()]

borough_with_toronto

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

In [92]:
# Keep only the rows whose borough is in borough_with_toronto and renumber them.
in_toronto = torontoFrame['Borough'].isin(borough_with_toronto)
newTorontoFrame = torontoFrame.loc[in_toronto].reset_index(drop=True)
print(newTorontoFrame.shape)
newTorontoFrame.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [94]:
# create the base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of newTorontoFrame.
# The loop names are deliberately distinct from the notebook-level `borough`
# list so that running this cell does not overwrite it.
for lat, lng, borough_name, neighbourhood_name in zip(newTorontoFrame['Latitude'],
                                                      newTorontoFrame['Longitude'],
                                                      newTorontoFrame['Borough'],
                                                      newTorontoFrame['Neighbourhood']):
    # Popup text shown when the marker is clicked: "<neighbourhood(s)>, <borough>".
    label = '{}, {}'.format(neighbourhood_name, borough_name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto