In [1]:
import pandas as pd
import numpy as np
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents 

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#mapping tools
#!pip install geopy 
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!pip install folium
import folium # map rendering library
#from shapely.geometry import Point
#import geopandas as gpd
import matplotlib.pyplot as plt

import warnings


def warn(*args, **kwargs):
    """Accept any warning arguments and do nothing with them."""
    return None


# Replace the module-level warnings.warn with the no-op above so that code
# calling warnings.warn(...) after this point emits nothing.
warnings.warn = warn

## Step 1: Use BeautifulSoup to scrape the postal code data from the Wikipedia page

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page flow into the parser below.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
data = response.text


In [3]:
# Parse the downloaded HTML text in `data` into a BeautifulSoup object,
# using the 'html.parser' backend.
soup = BeautifulSoup(data, 'html.parser')

## Step 2: Find the postal code table in the parsed Wikipedia page (its rows are then extracted into a list and loaded into a pandas DataFrame)

In [4]:
# Locate the first <table> whose class attribute is 'wikitable sortable'.
table = soup.find('table', attrs={'class': 'wikitable sortable'})
table; #remove ';' to view output

## Step 3: Extract the table data and create a pandas DataFrame


In [5]:
# Build a list of rows: one inner list per <tr> in the table, holding the
# whitespace-stripped text of each <td> cell in that row. A <tr> without any
# <td> cells contributes an empty list.
rows = [
    [cell.text.strip() for cell in tr.find_all("td")]
    for tr in table.find_all("tr")
]

rows; #remove ';' to view output

## Create the initial DataFrame with columns 'Postcode', 'Borough', 'Neighborhood' (it is cleaned down to 103 postcodes in the steps below)

In [6]:
# Load the scraped rows into a DataFrame and give the three positional
# columns (0, 1, 2) descriptive names.
column_names = {0: "Postcode", 1: "Borough", 2: "Neighborhood"}
df = pd.DataFrame(rows).rename(columns=column_names)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [7]:
# Row and column counts of the freshly scraped frame.
df.shape

(289, 3)

In [8]:
# Keep a copy of the scraped frame as it is before the cleaning cells below change df.
df_bkp = df.copy()

# Drop rows where Borough is 'Not assigned' or missing (None)

In [9]:
# Remove rows in which every field is missing. In the scraped frame this is
# row 0, which came from a <tr> with no <td> cells (presumably the table's
# header row). Unlike dropping the hard-coded label 0, this can be re-run
# safely: a second run finds nothing to drop instead of raising KeyError.
df = df.dropna(how='all')

In [10]:
# Preview the first rows after the empty row has been removed.
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [11]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
df = df[df['Borough'] != 'Not assigned'].copy()

In [12]:
# Display the frame with a fresh 0-based index. The result is not assigned
# back, so df itself keeps its existing row labels after this cell.
df.reset_index(drop=True)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## Step 4: Data transform - if 'Neighborhood' is 'Not assigned', use the 'Borough' value instead

In [13]:
# Show the rows whose Neighborhood is still the placeholder 'Not assigned'.
df[df.Neighborhood=='Not assigned']

Unnamed: 0,Postcode,Borough,Neighborhood
9,M7A,Queen's Park,Not assigned


In [14]:
# Where no neighborhood is assigned, fall back to the borough name.
# A single .loc call with a boolean mask writes into df itself. The chained
# form df.Neighborhood[mask] = ... assigns through an intermediate object,
# which pandas warns about and which is not guaranteed to update df.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

#### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated by a comma, as in the grouped result below (e.g. M1B becomes "Rouge, Malvern").

In [15]:
# Collapse rows that share a Postcode and Borough into a single row whose
# Neighborhood is the ', '-joined list of that group's neighborhoods.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighborhood']
      .agg(', '.join)
      .reset_index()
)

In [16]:
# Preview the grouped frame (one row per Postcode/Borough pair).
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
# Row and column counts after grouping.
df.shape

(103, 3)

###  Load the coordinates file 

In [24]:
# Read the coordinates CSV from the current working directory and preview it.
coordinates = pd.read_csv('Geospatial_Coordinates.csv')
coordinates.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# Rename the postal code column so both frames share a 'Postcode' key for the
# merge. Renaming by name (rather than overwriting every label by position)
# leaves the other columns untouched and cannot mislabel them if the CSV's
# column order or count ever differs.
coordinates = coordinates.rename(columns={'Postal Code': 'Postcode'})

In [23]:
# Join the coordinates with the borough/neighborhood frame on the shared
# 'Postcode' column (pandas' default inner join).
data = coordinates.merge(df, on='Postcode')

In [24]:
# Reorder the columns: descriptive fields first, coordinates last.
data = data.loc[:, ['Postcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']]
data

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [25]:
# Report the number of distinct Borough values and the number of rows in data.
borough_count = len(data['Borough'].unique())
row_count = len(data)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.


In [27]:
address = 'Toronto, Ontario'

# Nominatim must be given an identifying user agent; recent geopy releases
# refuse to construct the geocoder without one.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Canada are 43.653963, -79.387207.


In [28]:
# Keep only the rows whose Borough contains 'Toronto' and renumber them from 0.
df_toronto = data.loc[data['Borough'].str.contains('Toronto')].reset_index(drop=True)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Coordinates
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,"(-79.2930312, 43.67635739999999)"
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,"(-79.352188, 43.6795571)"
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,"(-79.31557159999998, 43.6689985)"
3,M4M,East Toronto,Studio District,43.659526,-79.340923,"(-79.340923, 43.6595255)"
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,"(-79.3887901, 43.7280205)"


In [29]:

# Build a folium map centred on the geocoded latitude/longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# Add one circle marker per row of df_toronto; each popup shows
# "neighborhood,borough" for that row.
marker_fields = df_toronto[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup_text = '{},{}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto