In [1]:
import requests
import pandas as pd
import os

# Extract data from wiki page

In [2]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

# Create soup object

In [4]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(website_url,'lxml')
#print(soup.prettify())

In [5]:
Tor_table = soup.find('table',{'class':'wikitable sortable'})

# Extract the column header from page

In [6]:
columns = ""
for record in soup.findAll('tr'):
     for data in record.findAll('th'):
        if columns == "":
            columns = data.text
        else:
            columns = columns +","+ data.text
tor_col = columns.splitlines()
del(tor_col[1])
#tor_col
column_name = tor_col[0].split(",")
column_name

['Postcode', 'Borough', 'Neighbourhood']

# Define Dataframe with column name only

In [7]:
tor_df = pd.DataFrame(columns=column_name)
tor_df

Unnamed: 0,Postcode,Borough,Neighbourhood


# Extract data from wiki page

In [8]:
row_data = ""
for record in soup.findAll('tr'):
    cell_data = ""
    for data in record.findAll('td'):
        if cell_data == "":
            cell_data = data.text
        else:
            cell_data = cell_data +","+ data.text
    row_data = row_data + cell_data

# Format the data into different lines

In [9]:
row_new = row_data.splitlines()
print(len(row_new))
print(row_new[1])
col_new = row_new[1].split(",")
print(col_new)
print(len(col_new))

415
M2A,Not assigned,Not assigned
['M2A', 'Not assigned', 'Not assigned']
3


# Convert the extracted data into List of List to directly put into Dataframe

In [10]:
table_list = []
for i in range (len(row_new)):
    column_list = []
    column_list = row_new[i].split(",")
    if column_list[0] != "":
        table_list.append(column_list)
table_list[:5]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront']]

# Create the Dataframe with extracted data and column_name

In [11]:
ton_df = pd.DataFrame(table_list,columns=column_name)
ton_df.tail(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
316,V,,
317,X,,
318,Y,,
319,NL,,
320,A,,


# Drop the missing value rows from dataframe

In [12]:
ton_df.dropna(axis=0,inplace=True)
ton_df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
284,M8Z,Etobicoke,Mimico NW
285,M8Z,Etobicoke,The Queensway West
286,M8Z,Etobicoke,Royal York South West
287,M8Z,Etobicoke,South of Bloor
288,M9Z,Not assigned,Not assigned


# Ignore cells with a borough that is "Not assigned"

In [13]:
ton_df_1 = ton_df[ton_df.Borough != "Not assigned"]
ton_df_1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## If cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough. 
## Only one row affected by this.

In [14]:
ton_df_1['Neighbourhood'].replace('Not assigned',ton_df_1['Borough'],inplace=True)
ton_df_1.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [15]:
print(ton_df_1.shape)

(212, 3)


## Combine more than one neighborhood can exist in one postal code area

## ton_df_2 = ton_df_1.groupby(["Postcode", "Borough"], as_index=False).agg("sum") 
### above Group by will group the Neighbourhood, but without comma (","). Not sure how to put that comma so, putting below logic to put comma

In [23]:
ton_df_1.reset_index(drop=True,inplace=True)
ton_df_1.head(3)

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront


In [26]:
ton_df_1.drop(['index'], axis=1,inplace=True)
ton_df_1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [28]:
print(ton_df_1.at[1,"Postcode"])

M4A


In [29]:
row = ton_df_1.Postcode.count()
print(row)
for i in range(row):
    if i > 0:
        if ton_df_1.at[i,"Postcode"] == ton_df_1.at[(i-1),"Postcode"]:
            ton_df_1.at[i,"Neighbourhood"] = ton_df_1.at[i-1,"Neighbourhood"] + ',' + ton_df_1.at[i,"Neighbourhood"]
ton_df_1.drop_duplicates(subset=["Postcode","Borough"], keep="last", inplace=True)
ton_df_1.head(10)

212


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Harbourfront,Regent Park"
5,M6A,North York,"Lawrence Heights,Lawrence Manor"
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Rouge,Malvern"
10,M3B,North York,Don Mills North
12,M4B,East York,"Woodbine Gardens,Parkview Hill"
14,M5B,Downtown Toronto,"Ryerson,Garden District"


# Final Shape

In [30]:
print(ton_df_1.shape)

(103, 3)


## JOIN Latitude and Longitude from CSV file

In [41]:
ton_df_1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Harbourfront,Regent Park"
5,M6A,North York,"Lawrence Heights,Lawrence Manor"
6,M7A,Queen's Park,Queen's Park


In [42]:
path = "http://cocl.us/Geospatial_data/Geospatial_Coordinates-1.csv"
df_lat = pd.read_csv(path)
print(df_lat.shape)
df_lat.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [43]:
df_long = df_lat.rename(columns={"Postal Code": "Postcode"})
df_long.head(3)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


In [44]:
tor_final = ton_df_1.join(df_long.set_index('Postcode'), on='Postcode')
tor_final.head(3)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
3,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636


In [45]:
tor_final.reset_index(drop=True,inplace=True)
tor_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Create Toronto Map

In [48]:
from geopy.geocoders import Nominatim

## Get latitude and Longitude for Toronto City for generating map

In [49]:
address = 'Toronto, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [50]:
import matplotlib.cm as cm
import matplotlib.colors as colors
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

## Get the Borough that contain the word "Toronto"

In [55]:
tor_data = tor_final[tor_final['Borough'].str.contains("Toronto")]
tor_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


## Map for Toronto City

In [56]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(tor_data['Latitude'], tor_data['Longitude'], tor_data['Borough'], tor_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto