In [1]:
import numpy as np # to handle data in a vectorized manner

import pandas as pd # for data analysis
pd.set_option("display.max_columns", None) # to be able to see all columns
pd.set_option("display.max_rows", None) # to be able to see all rows

import json # to handle JSON files
from pandas.io.json import json_normalize # to transform a JSON file into a pandas dataframe

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # to handle requests
import urllib.request
from bs4 import BeautifulSoup # to parse HTML and XML documents

from sklearn.cluster import KMeans # import k-means from clustering stage


In [2]:
# Wikipedia page this notebook scrapes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read the whole body inside a context manager so the HTTP response (and its
# socket) is closed as soon as the download finishes. `page` then holds the raw
# HTML bytes, which BeautifulSoup accepts just like a file-like response.
with urllib.request.urlopen(url) as response:
    page = response.read()

In [3]:
# Build the BeautifulSoup parse tree for the downloaded page with the lxml parser.
soup = BeautifulSoup(markup=page, features="lxml")

In [4]:
# Quick sanity check: display the text of the page's <title> element.
soup.find("title").string

'List of postal codes of Canada: M - Wikipedia'

In [5]:
# soup.find_all('table') would return every table on the page. Instead, pick
# only the first table whose class attribute is "wikitable sortable", which
# leaves out tables we do not need.

table = soup.find('table', attrs={'class': 'wikitable sortable'})

In [6]:
# Column buffers filled while walking the table rows. They are later used as
# the PostalCode (A), Borough (B) and Neighborhood (C) columns.
A = []
B = []
C = []

# NOTE: this selects the first <table> on the page and rebinds `table`.
table = soup.find("table")
table_rows = table.tbody.find_all("tr")

for row in table_rows:
    cells = row.find_all("td")
    # Only rows with exactly three <td> cells are kept. Rows with any other
    # number of cells are skipped, so a table laid out differently leaves the
    # three lists empty.
    if len(cells) == 3:
        for column, cell in zip((A, B, C), cells):
            # `string=` is the current spelling of the deprecated `text=`
            # filter; it returns the first text node inside the cell.
            first_text = cell.find(string=True)
            # A cell without any text yields None: store '' instead of
            # failing on None.rstrip(...).
            column.append('' if first_text is None else first_text.rstrip('\n'))

In [7]:
# Assemble the scraped lists into one frame: start from the postal codes,
# then attach the borough and neighborhood lists as additional columns.
df = pd.DataFrame(A, columns=['PostalCode']).assign(Borough=B, Neighborhood=C)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood


In [8]:
# Turn the 'Not assigned' placeholder into a real missing value. The result is
# assigned back to the column: calling replace(..., inplace=True) on the
# `df['Borough']` selection is a chained in-place operation that is not
# guaranteed to reach `df` (it never does under pandas Copy-on-Write).
df['Borough'] = df['Borough'].replace('Not assigned', np.nan)

# Drop the rows that have no borough and renumber the index from 0.
df = df.dropna(subset=['Borough']).reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood


In [9]:
# Where a neighborhood is "Not assigned", fall back to that row's borough.
# This is done as a vectorized column assignment: writing to the `row` objects
# yielded by df.iterrows() may only change a temporary copy, so it is not
# guaranteed to update `df`.
df["Neighborhood"] = df["Neighborhood"].mask(
    df["Neighborhood"] == "Not assigned", df["Borough"]
)

In [10]:
# Display the (number of rows, number of columns) of `df` at this point.
df.shape

(0, 3)

In [11]:
def get_rows(table):
    """Collect the cell texts of every row of ``table``.

    Args:
        table: an object exposing ``find_all("tr")``; each returned row must
            expose ``find_all("td")`` and each cell a ``text`` attribute.

    Returns:
        A list with one entry per ``<tr>``, in document order. Each entry is
        the list of that row's ``<td>`` texts with surrounding whitespace
        stripped; a row without any ``<td>`` contributes an empty list.
    """
    return [
        [cell.text.strip() for cell in table_row.find_all("td")]
        for table_row in table.find_all("tr")
    ]
# Take the first <table> on the page and convert each of its rows into a
# list of cell texts.
postal_codes_table = soup.find('table')
rows = get_rows(postal_codes_table)

# One DataFrame row per table row, with positional integer column labels.
# Note that this rebinds `df`.
df = pd.DataFrame(data=rows)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7AQueen's Park(Ontario Provincial Government),M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned


In [12]:
# Display the (number of rows, number of columns) of the frame built from the table rows.
df.shape

(20, 9)