### Part I: Scraping Wikipedia and Creating a Pandas Dataframe from Table
#### Import necessary libraries 

In [1]:
from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd

#### Scraping Canadian Postal Codes from Wikipedia

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = requests.get(url, timeout=30)  # bounded wait so the cell cannot hang indefinitely
# Fail here on an HTTP error status instead of parsing an error page as if it held the table
req.raise_for_status()

# Parse the returned HTML with the lxml parser
wiki_soup = BeautifulSoup(req.text, 'lxml')

In [3]:
# Take the first <tbody> found in the soup and gather all of its <tr> rows
pc_table = wiki_soup.find('tbody')
tr = pc_table.find_all('tr')

# One list per row: the row text is split on newlines and the first and
# last pieces of the split are dropped
pc_contents = [row.text.split('\n')[1:-1] for row in tr]

len(pc_contents)  # number of data rows plus the column-header row

289

#### Cleaning the table as per specifications
 - Create a pandas dataframe from table
 - Change neighbourhood to neighborhood, against Canadian wishes
 - Remove all unassigned boroughs
 - Change unassigned neighborhoods to borough name
 - Group by postal code and combine neighborhoods into one column, separated by commas

In [4]:
# Create a pandas df from the scraped rows; the first row supplies the column names
pc_df = pd.DataFrame(pc_contents[1:], columns=pc_contents[0])

# Change to American spelling of neighborhood (renames the third column)
pc_df = pc_df.rename(columns={pc_df.columns[2]: "Neighborhood"})

# Remove all unassigned boroughs and renumber the remaining rows from 0.
# reset_index returns a new frame here (no inplace) so later assignments
# operate on an independent dataframe rather than on a filtered slice.
pc_df = pc_df[pc_df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Change unassigned neighborhoods to borough name.
# A boolean mask with .loc selects by label, so this stays correct
# regardless of how the index is numbered.
na_mask = pc_df['Neighborhood'] == 'Not assigned'
ind_na = pc_df.index[na_mask]
na_borough = pc_df.loc[na_mask, 'Borough']
pc_df.loc[na_mask, 'Neighborhood'] = na_borough
print(pc_df.loc[ind_na])  # show the rows whose neighborhood was filled in

pc_df.head(10)  # Check the dataframe

  Postcode       Borough  Neighborhood
6      M7A  Queen's Park  Queen's Park


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [5]:
# Create a dataframe that groups by postcode
grouped_pc = pc_df.groupby(['Postcode'])

# Concatenate neighborhoods by postcode, separate by commas
grouped_nh = grouped_pc['Neighborhood'].apply(lambda x: x.sum())
grouped_nh = grouped_pc['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))

# Create a dataframe with postcode and borough
grouped_bo = grouped_pc['Borough'].apply(lambda x: set(x).pop())

# Merge grouped_nh and grouped_bo on postcode
pc_data = grouped_bo.to_frame().merge(grouped_nh.to_frame(), on='Postcode')
pc_data = pc_data.reset_index()
pc_data.to_csv('postcode_data.csv', index=False)
pc_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# Display the (rows, columns) dimensions of the grouped postcode dataframe
pc_data.shape

(103, 3)