In [1]:
# Importing essential libraries to web scrape the data
import requests       # To handle requests
import csv
from bs4 import BeautifulSoup as bs     
import urllib.request
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"


def scrape_table_bs4(cname, cols, page_url=None):
    """Scrape the first HTML table with CSS class ``cname`` into a DataFrame.

    Parameters
    ----------
    cname : str
        CSS class of the ``<table>`` element to extract (e.g. ``"wikitable"``).
    cols : int
        Number of ``<td>`` cells a row must contain to be kept. Rows with any
        other cell count (for example a header row made only of ``<th>``
        cells) are dropped.
    page_url : str, optional
        Address of the page to download. Defaults to the notebook-level
        ``url`` so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per kept table row. Column names are the stripped first text
        node of each ``<th>`` cell in the table; each value is the stripped
        first text node of the corresponding ``<td>`` cell ("" if the cell
        holds no text).

    Raises
    ------
    ValueError
        If the page contains no ``<table>`` with class ``cname``.
    """
    target = url if page_url is None else page_url

    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(target) as response:
        page = response.read()

    soup = bs(page, 'lxml')
    table = soup.find("table", class_=cname)
    if table is None:
        raise ValueError(
            "No <table> with class {!r} found at {}".format(cname, target)
        )

    def first_text(cell):
        # Only the first text node is used (same as before); an empty cell
        # yields "" instead of raising IndexError.
        texts = cell.find_all(string=True)
        return texts[0].strip() if texts else ""

    header = [first_text(th) for th in table.find_all("th")]
    rows = [[first_text(td) for td in tr.find_all("td")]
            for tr in table.find_all("tr")]
    # Keep only well-formed data rows; this also discards the <th>-only row.
    rows = [row for row in rows if len(row) == cols]

    return pd.DataFrame(rows, columns=header)

In [2]:
# Download the page and pull out the 3-column "wikitable" as a raw DataFrame
raw_scraped = scrape_table_bs4(cname="wikitable", cols=3)
raw_scraped

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Downtown Toronto,Queen's Park


In [3]:
# Keep only the rows whose Borough has actually been assigned
has_borough = raw_scraped['Borough'] != 'Not assigned'

# Sort the surviving rows and rebuild a clean 0..n-1 index
new_scraped = (
    raw_scraped[has_borough]
    .sort_values(by=['Postcode', 'Borough', 'Neighborhood'],
                 ascending=[True, True, True])
    .reset_index(drop=True)
)

new_scraped.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,Malvern
1,M1B,Scarborough,Rouge
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Port Union
4,M1C,Scarborough,Rouge Hill


In [4]:
# A neighborhood that is 'Not assigned' falls back to its Borough name
# (e.g. Queen's Park). The literal must match the scraped text exactly:
# the table uses a lowercase 'a' ('Not assigned'), so the previous
# comparison against 'Not Assigned' never matched any row.
neighborhood_missing = new_scraped['Neighborhood'] == 'Not assigned'

# Build the result without mutating the frame shown by the previous cell:
#   1. replace missing neighborhoods with the borough name,
#   2. collapse rows sharing a (Postcode, Borough) pair into one row whose
#      Neighborhood is the comma-separated list of all its neighborhoods.
new_scraped = (
    new_scraped
    .assign(Neighborhood=new_scraped['Neighborhood'].mask(neighborhood_missing,
                                                          new_scraped['Borough']))
    .groupby(['Postcode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)

new_scraped

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [5]:
# (rows, columns) of the grouped frame: one row per (Postcode, Borough) pair
new_scraped.shape

(103, 3)

In [6]:
# Persist the cleaned table; the positional index is not written to the file
new_scraped.to_csv(path_or_buf='Toronto_data.csv', index=False)