# Scraping Postal Codes in Toronto from Wikipedia

In [1]:
# Libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests

Set up the url and send request

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast instead of parsing an error page: the timeout keeps the cell from
# hanging indefinitely, and raise_for_status() turns an HTTP 4xx/5xx response
# into an exception.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text

Use the Beautiful Soup library to parse the HTML retrieved from the URL

In [3]:
# Build the parse tree from the downloaded page using the 'html.parser' backend
soup = BeautifulSoup(markup=html_data, features='html.parser')
# print(soup.prettify())  # uncomment to see html_data in neat format

We want to get the list of postal codes and the various Boroughs and Neighbourhoods listed. There appears to be only one table in the HTML data. Let's see what the table contains.

In [4]:
# Collect every <table> element found in the parsed page
table = soup.find_all(name='table')
# table  # uncomment to see the html table

After inspecting the table on the website as well as the HTML here, the data we want to scrape is contained in various text boxes. The text boxes are contained in the 'td' tag. Each text box contains 2 or more lines. The first line represents the Postal Code, the second line represents the Borough, and the third line, if it exists, contains the Neighbourhood information.

In [5]:
# Every <td> cell on the page, in document order
textboxes = soup.find_all('td')

# View one text box (index 3), pretty-printed
sample_box = textboxes[3]
print(sample_box.prettify())

<td style="width:11%; vertical-align:top;">
 <p>
  <b>
   M4A
  </b>
  <br/>
  <span style="font-size:85%;">
   <a href="/wiki/North_York" title="North York">
    North York
   </a>
   <br/>
   (
   <a href="/wiki/Victoria_Village" title="Victoria Village">
    Victoria Village
   </a>
   )
  </span>
 </p>
</td>



The postal codes are contained in the 'b' tag of the html. Let's take a look at one of them.

In [6]:
# The postal code lives in the <b> tag of the text box
postalcode = textboxes[3].find('b')
print(postalcode)
print('\nIn neat format:')
# Pretty-print the tag that was just found, rather than a separate page-wide
# lookup by index, so both printouts always describe the same element.
print(postalcode.prettify())

<b>M4A</b>

In neat format:
<b>
 M4A
</b>



In [7]:
# Plain-text content of the postal code tag
postalcode.get_text()

'M4A'

The rest of the text that we need, for Boroughs and Neighbourhoods, is contained in the 'span' tag.

In [8]:
# Borough and neighbourhood information lives in the <span> tag of the text box
borough_neigh = textboxes[3].find('span')
print(borough_neigh)
print('\nIn neat format:')
# Pretty-print the tag that was just found, rather than a separate lookup by
# index inside the first table, so both printouts describe the same element.
print(borough_neigh.prettify())

<span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>

In neat format:
<span style="font-size:85%;">
 <a href="/wiki/North_York" title="North York">
  North York
 </a>
 <br/>
 (
 <a href="/wiki/Victoria_Village" title="Victoria Village">
  Victoria Village
 </a>
 )
</span>



The text under the 'span' tag contains both the borough and neighbourhood information. We observe that the information for borough is always the text before the first open bracket, and the text for neighbourhood(s) is within the brackets. We will extract them as such.

In [9]:
# The text for borough: everything in the first <span> before the first '('
span_text = textboxes[3].find_all('span')[0].text
borough = span_text.partition('(')[0]
borough

'North York'

In [10]:
# The text for neighbourhood: the piece after the first '(' with its last
# character (the closing bracket in this text box) dropped
pieces = textboxes[3].find_all('span')[0].text.split('(')
neighbourhood = pieces[1][:-1]
print(neighbourhood)

Victoria Village


Now that we know where all the information is and how to extract it, let us scrape it into a dictionary that we will later convert into a dataframe.

In [11]:
# Initialize dictionary (one list per future dataframe column)
toronto = {'Postal Code': [], 'Borough': [], 'Neighbourhood': []}

# Go through the text boxes and extract the needed information
for textbox in textboxes:
    code_tag = textbox.find('b')     # postal code
    info_tag = textbox.find('span')  # borough and neighbourhood information

    # A cell missing either tag is not a postal-code box. Skipping it
    # explicitly (instead of relying on a bare `except:`) keeps real errors
    # visible and guarantees the three lists never get out of step.
    if code_tag is None or info_tag is None:
        continue

    # Borough is the text before the first '('. The neighbourhoods are the
    # piece that follows that bracket, minus its last character (the ')').
    pieces = info_tag.text.split('(')
    if len(pieces) > 1:
        neighbourhood = pieces[1][:-1].replace(' / ', ', ')
    else:
        # Not all postal codes have neighbourhood information
        neighbourhood = 'NA'

    # Append one value to every column together so the lists stay aligned.
    # get_text() stores a plain str rather than a NavigableString.
    toronto['Postal Code'].append(code_tag.get_text())
    toronto['Borough'].append(pieces[0])
    toronto['Neighbourhood'].append(neighbourhood)

# Check to make sure all the lists are of equal length
for column, values in toronto.items():
    print("The length of {} is {}".format(column, len(values)))

#toronto # uncomment to see dictionary

The length of Postal Code is 180
The length of Borough is 180
The length of Neighbourhood is 180


Convert the dictionary into a dataframe

In [12]:
# Build the dataframe: one column per dictionary key
df_toronto = pd.DataFrame(toronto)

# Make sure all postal codes are extracted
n_rows, n_cols = df_toronto.shape
print("The dataframe contains {} rows and {} columns.".format(n_rows, n_cols))

first_rows = df_toronto.head(2)
last_rows = df_toronto.tail(2)
print("\nThe head:\n{}".format(first_rows))
print("\nThe tail:\n{}".format(last_rows))

The dataframe contains 180 rows and 3 columns.

The head:
  Postal Code       Borough Neighbourhood
0         M1A  Not assigned            NA
1         M2A  Not assigned            NA

The tail:
    Postal Code       Borough  \
178         M8Z     Etobicoke   
179         M9Z  Not assigned   

                                         Neighbourhood  
178  Mimico NW, The Queensway West, South of Bloor,...  
179                                                 NA  


Remove postal codes that are not yet assigned

In [13]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0
is_assigned = df_toronto['Borough'] != 'Not assigned'
df_toronto = df_toronto.loc[is_assigned].reset_index(drop=True)
df_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [14]:
# (rows, columns) of df_toronto at this point in the notebook
df_toronto.shape

(103, 3)

Get latitude and longitude information using either geocoder or this url (https://cocl.us/Geospatial_data)

In [15]:
# Load the postal-code coordinates CSV straight from its URL
geodata_url = 'https://cocl.us/Geospatial_data'
df_latlong = pd.read_csv(geodata_url)
df_latlong.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the postal code data from Wikipedia with the latitude and longitude data.

In [16]:
# Left join on the shared 'Postal Code' column: every row of df_toronto is
# kept, and a postal code with no match in df_latlong gets NaN coordinates.
# The join key only needs to be named once.
df_torontolatlong = pd.merge(df_toronto, df_latlong, how='left', on='Postal Code')
df_torontolatlong.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [17]:
# Report the size of the merged dataframe
row_count, column_count = df_torontolatlong.shape
print("The dataframe contains {} rows and {} columns.".format(row_count, column_count))

The dataframe contains 103 rows and 5 columns.
