In [15]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
print("Imported!")


Imported!


In [29]:
# url to the wikipedia website
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()

# create a BeautifulSoup object from the downloaded HTML
data = BeautifulSoup(response.text, "html.parser")


In [30]:
# collect every <table> element of the parsed page
table_list = data.find_all("table")

# the first table is the one containing the postal codes; keep all of its <td> cells
postal_code_table = table_list[0]
table_cells = postal_code_table.find_all('td')


In [31]:
# Turn every table cell into one record: postal code, borough, neighborhoods.
# The text inside a cell's <span> is parsed as "Borough(Name / Name)(Name)":
# everything before the first "(" is the borough, every parenthesised group
# holds one or more neighborhood names separated by "/".
records = []
for cell in table_cells:
    parts = cell.find('span').get_text().split('(')
    borough = parts[0]
    if borough == 'Not assigned': # ignore the cells without borough
        continue

    neigh_list = []
    for group in parts[1:]: # some cells contain multiple parentheses with neighborhood names
        # partition() keeps the whole text when the closing ")" is missing
        # (slicing with str.find() would silently drop the last character).
        inside = group.partition(')')[0]
        neigh_list += [name.strip() for name in inside.split('/')]

    # if no Neighborhood then Borough=Neighborhood; it must stay a *list*,
    # otherwise joining it would split the borough name into single characters
    if not neigh_list:
        neigh_list = [borough]

    records.append({
        'PostalCode': cell.b.get_text(), # plain str, not tied to the parse tree
        'Borough': borough,
        'Neighborhood': ', '.join(neigh_list),
    })

# Build the frame once from all records (DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the frame on every iteration).
df = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])

In [32]:
# Some borough strings come out of the scrape with extra text glued to the
# borough name; map each of those exact values to a readable name.
borough_fixes = {
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'East YorkEast Toronto': 'East York/East Toronto',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
}

# Series.replace with a dict swaps whole-value matches only. Assigning the
# result back to the column avoids chained indexing (df[col][mask] = ...),
# which may write to a temporary copy and leave df unchanged.
df['Borough'] = df['Borough'].replace(borough_fixes)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [33]:
# report the number of rows and columns of the dataframe
print('The size of the dataframe is: {} by {}'.format(*df.shape))


The size of the dataframe is: 103 by 3


In [34]:
# reading the input csv with geospatial data
url_long_lat = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
df_long_lat = pd.read_csv(url_long_lat)

# Keep only the columns needed for the join and align the key name with df.
coords = df_long_lat[['Postal Code', 'Latitude', 'Longitude']].rename(columns={'Postal Code': 'PostalCode'})

# Attach the Latitude and Longitude values with a single left join:
# - rows without a match get NaN (a real missing value, not the string 'None'),
# - validate='many_to_one' raises if the csv lists a postal code more than once,
# - dropping existing coordinate columns first keeps the cell safe to re-run.
df = (
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      .merge(coords, on='PostalCode', how='left', validate='many_to_one')
)

In [35]:
# preview the first 15 rows of the dataframe
df.head(15)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
4,M7A,Queen's Park,Ontario Provincial Government,43.6623,-79.3895
5,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
6,M1B,Scarborough,"Malvern, Rouge",43.8067,-79.1944
7,M3B,North York,Don Mills,43.7459,-79.3522
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7064,-79.3099
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3789


# Part 3

In [36]:
%%capture
!pip install folium
!pip install geocoder
import folium
from geopy.geocoders import Nominatim


In [37]:
# look up the coordinates of Toronto with the Nominatim geocoder
geolocator = Nominatim(user_agent='Tor_loc')
Tor_loc = geolocator.geocode('Toronto, Canada')

# geocode() returns None when the query yields no result; fail here with a
# clear message instead of an AttributeError when the map is created.
if Tor_loc is None:
    raise RuntimeError("Nominatim returned no result for 'Toronto, Canada'")


In [38]:
# open the map on the location of Toronto
map_Toronto = folium.Map(location=[Tor_loc.latitude, Tor_loc.longitude], zoom_start=11, tiles='CartoDB positron')

# create a marker for all the PostalCodes
# Walking the four columns in parallel reads each row's values directly,
# instead of filtering the whole frame by postal code four times per marker
# and calling float() on a one-element Series.
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    folium.CircleMarker(
        [float(lat), float(lng)],
        radius=5, #radius in pixels
        popup='{}: \n{}'.format(borough, neighborhood),
        color='orange',
        fill=True, # a real boolean, not the string 'True'
        fill_color='lightblue',
        fill_opacity=0.75
    ).add_to(map_Toronto)

# show the map
map_Toronto

In [39]:

# define the reduced data frame - only the boroughs with the word 'Toronto' in the name
# regex=False makes this a plain substring test; na=False treats a missing
# borough as "no match". The boolean mask replaces the row-by-row loop that
# used DataFrame.append (removed in pandas 2.0) and compared with `is not -1`
# (an identity test, not a value test).
is_toronto = df['Borough'].str.contains('Toronto', regex=False, na=False)
df_2 = df[is_toronto].reset_index(drop=True)
print('The analyzed number of the Toronto boroughs has been reduced from {} to {}.'.format(df.shape[0], df_2.shape[0]))
df_2.head()

The analyzed number of the Toronto boroughs has been reduced from 103 to 39.


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
