In [1]:
# importing the useful libraries for this project. 

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Extracting the data from the Wikipedia page
* Extract the HTML of the webpage using requests. 
* Parse HTML using BeautifulSoup.


In [2]:
# Download the Wikipedia page that lists the "M" postal codes and parse its HTML.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)    # Bound the wait so the cell cannot hang forever.
response.raise_for_status()                 # Fail here on an HTTP error instead of parsing an error page.
website_url = response.text                 # Despite the name, this holds the page's HTML text.
soup = BeautifulSoup(website_url, 'lxml')
# Tip: print(soup.prettify()) shows how the tags are nested.

#### The data we need is in the table with class "wikitable sortable", so we locate that table.

In [5]:
# Locate the table by its class attribute and collect all of its <tr> rows.
my_table = soup.find('table', {'class': 'wikitable sortable'})
if my_table is None:
    # Give a clear message instead of an AttributeError on the next line.
    raise ValueError("No table with class 'wikitable sortable' was found on the page.")
table_rows = my_table.find_all('tr')   # find_all is the current name; findAll is a legacy alias.

In [6]:
# Convert every table row into a list of its stripped <td> texts and load
# those lists into a three-column DataFrame.
column_names = ['PostalCode', 'Borough', 'Neighbourhood']
data = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in table_rows
]

df = pd.DataFrame(data, columns=column_names)
# A row with no <td> cells (presumably the header row) yields an empty list,
# which leaves PostalCode missing; keep only rows that have a postal code.
has_postal_code = ~df['PostalCode'].isnull()
df = df[has_postal_code]
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Keep only rows whose Borough is assigned, then renumber the rows from 0.
# reset_index(drop=True) discards the old index directly, so there is no
# leftover 'index' column to remove and no in-place mutation.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Check whether any postal code appears in more than one row

In [22]:
# Count how many rows carry each postal code, then flag the codes seen more than once.
postal_code_counts = df['PostalCode'].value_counts()
zx = postal_code_counts > 1   # True for each postal code that occurs in more than one row.
zx.values.sum()               # Number of postal codes that are repeated.

# A result of 0 means that no postal code is repeated.

0

### Check whether any neighbourhood is not assigned

In [24]:
# Select the rows whose Neighbourhood is the literal text 'Not assigned'
# and report how many there are as (rows, columns).
is_unassigned = df['Neighbourhood'] == 'Not assigned'
df_na = df.loc[is_unassigned]
df_na.shape

(0, 3)

Hence, there are no rows with a "Not assigned" neighbourhood.

In [25]:
# Dimensions of the cleaned table as (rows, columns).
df.shape

(103, 3)