In [3]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Wikipedia page listing the Canadian postal codes that begin with "M"
can_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [5]:
# Download the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops the notebook on an HTTP error instead
# of silently parsing an error page further down.
page = requests.get(can_url, timeout=30)
page.raise_for_status()

### Parse the HTML page

In [6]:
# Build a parse tree from the response text with Python's built-in HTML parser
soup = BeautifulSoup(markup=page.text, features='html.parser')

### Find the postal-code table and load it into pandas

In [7]:
# Collect the elements carrying the class string "wikitable sortable";
# the result is a list-like set of matches, not a single tag.
table = soup.find_all(attrs={'class': 'wikitable sortable'})

In [8]:
# Parse the matched HTML into DataFrames and keep the first one, using the
# first table row as the header. The markup is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in recent pandas.
df = pd.read_html(StringIO(str(table)), header=0)[0]

In [9]:
# (rows, columns) of the table as loaded, before any cleaning
df.shape

(288, 3)

### Clean the data

In [10]:
# Turn the 'Not assigned' placeholder into a real missing value so that
# the rows can be dropped with dropna()
df['Borough'] = df['Borough'].replace(to_replace='Not assigned', value=np.nan)

In [11]:
# Remove, in place, every row whose Borough is missing
df.dropna(axis='index', subset=['Borough'], inplace=True)

In [12]:
# Renumber the remaining rows 0..n-1 and discard the old index
df.reset_index(inplace=True, drop=True)

In [13]:
# Where the neighbourhood is the 'Not assigned' placeholder, fall back to the
# borough name. Series.mask does this column-wise, which avoids a Python-level
# call per row and also behaves sensibly on an empty frame.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned',
    df['Borough'],
)

In [14]:
# Preview the first rows of the cleaned table rather than rendering the whole frame
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [15]:
# Collapse to one row per (Postcode, Borough), joining the neighbourhood names
# with commas. Grouping with the default index and calling reset_index() once
# gives the three columns Postcode, Borough, Neighbourhood directly; it avoids
# relying on how as_index=False combines with apply(), which has differed
# between pandas releases and can leave a stray extra column.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)

In [36]:
# Set the final column labels on the aggregated frame, then preview the first
# rows rather than rendering the whole frame
df.columns = ['Postcode','Borough','Neighbourhood']
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### Result

In [37]:
# (rows, columns) after grouping: one row per Postcode/Borough pair
df.shape

(103, 3)

In [28]:
# Load the coordinates CSV; header=0 consumes the file's own header row and
# `names` replaces it with the labels used in the rest of the notebook.
geo_cor_df = pd.read_csv(
    'https://raw.githubusercontent.com/rmvaliev/ds-example/master/Geospatial_Coordinates.csv',
    sep=',',
    names=['Postcode', 'Latitude', 'Longitude'],
    header=0,
)

In [29]:
# Peek at the first five coordinate rows
geo_cor_df.head(n=5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [38]:
# Attach latitude/longitude to each postcode row (inner join on Postcode).
# The result is bound to a name so it is not lost once the cell output is
# cleared, and only the first rows are displayed.
merged_df = pd.merge(df, geo_cor_df, on='Postcode')
merged_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
