### Import Required libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import wikipedia as wp
from bs4 import BeautifulSoup


## Method 1 using Beautiful Soup

In [2]:
# Download the Wikipedia page listing Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, `wiki_url` holds the page's HTML text (name kept for compatibility).
wiki_url = response.text
soup = BeautifulSoup(wiki_url, 'html.parser')

In [3]:
# Locate the table whose class attribute is exactly "wikitable sortable".
# soup.find() returns None when nothing matches, so fail with a clear message
# rather than an AttributeError on the next call.
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
items = table.find_all('tr')

# One list of whitespace-stripped cell texts per table row (header and data cells alike).
postal_codes = [
    [cell.text.strip() for cell in item.find_all(['th', 'td'])]
    for item in items
]

# The first row supplies the column names; the remaining rows are the data.
df_postal_codes = pd.DataFrame(postal_codes[1:], columns=postal_codes[0])
df_postal_codes.shape

(180, 3)

## Method 2 using the wikipedia Python library

In [4]:
# Retrieve the page through the wikipedia package, encode its HTML as UTF-8 bytes,
# and let pandas parse the tables; the first parsed table is the one kept.
page = wp.page("List_of_postal_codes_of_Canada:_M")
html = page.html().encode("UTF-8")
tables = pd.read_html(html)
df_postal_codes = tables[0]
df_postal_codes.shape

(180, 3)

### Convert all cells in the dataframe to upper case

In [5]:
def _to_upper(column):
    """Return the given column with every string value upper-cased."""
    return column.str.upper()


# Apply the helper column by column, then preview the first rows.
df_postal_codes = df_postal_codes.apply(_to_upper)
df_postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,NOT ASSIGNED,NOT ASSIGNED
1,M2A,NOT ASSIGNED,NOT ASSIGNED
2,M3A,NORTH YORK,PARKWOODS
3,M4A,NORTH YORK,VICTORIA VILLAGE
4,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT"


In [6]:
# Count the rows for each distinct value in the Borough column.
borough_counts = df_postal_codes['Borough'].value_counts()
borough_counts

NOT ASSIGNED        77
NORTH YORK          24
DOWNTOWN TORONTO    19
SCARBOROUGH         17
ETOBICOKE           12
CENTRAL TORONTO      9
WEST TORONTO         6
YORK                 5
EAST TORONTO         5
EAST YORK            5
MISSISSAUGA          1
Name: Borough, dtype: int64

### Clean up Borough and Neighborhood columns as per instructions
___Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.___


___If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.___

In [7]:
# Keep only rows with an assigned borough. .copy() makes the result an independent
# frame, so the .loc assignment below cannot be flagged as writing into a slice
# (SettingWithCopyWarning).
has_borough = df_postal_codes['Borough'] != 'NOT ASSIGNED'
df_postal_codes = df_postal_codes[has_borough].copy()

# Where the neighborhood is unassigned, reuse the borough name. No extra borough
# check is needed: every remaining row already has an assigned borough.
missing_neighborhood = df_postal_codes['Neighborhood'] == 'NOT ASSIGNED'
df_postal_codes.loc[missing_neighborhood, 'Neighborhood'] = df_postal_codes.loc[missing_neighborhood, 'Borough']
df_postal_codes.shape

(103, 3)

In [8]:
# Per-column summary of the frame, shown as the cell's output.
summary = df_postal_codes.describe()
summary

Unnamed: 0,Postal Code,Borough,Neighborhood
count,103,103,103
unique,103,10,99
top,M4Y,NORTH YORK,DOWNSVIEW
freq,1,24,4


In [9]:
# Show the frame's (rows, columns) tuple.
n_rows, n_cols = df_postal_codes.shape
(n_rows, n_cols)

(103, 3)