### Scrape the Wikipedia page

In [17]:
import bs4 as bs
import requests
import lxml.html as lh
import urllib.request
import numpy as np
import pandas as pd

**Implementation**

In [31]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
# Read as a module-level global by scrape().
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
def scrape(cname, cols, source_url=None):
    """Download a page and load one of its HTML tables into a DataFrame.

    BeautifulSoup (html5lib parser) does the parsing; pandas holds the result.

    Parameters
    ----------
    cname : str
        CSS class of the table to extract; the first matching ``<table>`` wins.
    cols : int
        Number of ``<td>`` cells a row must have to be kept. Rows with any
        other cell count (for instance a header row made only of ``<th>``
        cells) are dropped.
    source_url : str, optional
        Page to download. When omitted, the module-level ``url`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per kept table row. Every value is the stripped first text
        node of its cell (an empty string for a cell with no text). Column
        names come from every ``<th>`` found in the table, so the number of
        ``<th>`` cells must equal ``cols`` or the DataFrame constructor raises.

    Raises
    ------
    ValueError
        If the page has no table with class ``cname``.
    """
    target = url if source_url is None else source_url

    # Context manager closes the HTTP response instead of leaking it.
    with urllib.request.urlopen(target) as response:
        page = response.read()

    soup = bs.BeautifulSoup(page, 'html5lib')
    table = soup.find("table", class_=cname)
    if table is None:
        # Fail with a readable message rather than an AttributeError on None.
        raise ValueError(
            "no <table> with class {!r} found at {}".format(cname, target)
        )

    def first_text(cell):
        # First text node of the cell, stripped; '' when the cell has no text
        # at all (indexing [0] unconditionally raised IndexError there).
        strings = cell.find_all(string=True)
        return strings[0].strip() if strings else ''

    header = [first_text(th) for th in table.find_all("th")]
    rows = [[first_text(td) for td in tr.find_all("td")]
            for tr in table.find_all("tr")]
    # Keep only well-formed data rows.
    rows = [row for row in rows if len(row) == cols]
    return pd.DataFrame(rows, columns=header)

    
# Scrape the "wikitable" table, keeping only rows that have exactly 3 cells.
raw_TorontoPostalCodes = scrape("wikitable", 3)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
raw_TorontoPostalCodes.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal code   180 non-null    object
 1   Borough       180 non-null    object
 2   Neighborhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB
None


### Data pre-processing

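*Deal with unwanted entries in the table: drop rows whose borough is "Not assigned", replace a "Not assigned" neighborhood with its borough name, and group the rows by postal code and borough.*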

In [42]:
# Stage 1: keep only rows with an assigned borough, order them, and backfill
# any neighborhood marked 'Not assigned' with the name of its borough.
TorontoPostalCodes = (
    raw_TorontoPostalCodes
    .loc[lambda frame: ~frame['Borough'].isin(['Not assigned'])]
    .sort_values(by=['Postal code', 'Borough', 'Neighborhood'], ascending=True)
    .reset_index(drop=True)
    .assign(
        Neighborhood=lambda frame: frame['Neighborhood'].mask(
            frame['Neighborhood'] == 'Not assigned', frame['Borough']
        )
    )
)

# Snapshot of the Queen's Park rows taken after the backfill, for inspection.
is_queens_park = TorontoPostalCodes['Borough'] == "Queen's Park"
check_unassigned_post_state_sample = TorontoPostalCodes[is_queens_park]

# Stage 2: one row per (postal code, borough), with the neighborhood names of
# each group joined into a single comma-separated string.
TorontoPostalCodes = (
    TorontoPostalCodes
    .groupby(['Postal code', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
TorontoPostalCodes

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


In [43]:
# Sanity check: (row count, column count) of the current TorontoPostalCodes table.
TorontoPostalCodes.shape

(103, 3)

In [53]:
# Write the table to a CSV file in the current working directory;
# index=False leaves the row-index column out of the file.
TorontoPostalCodes.to_csv('Toronto_neighborhoods.csv',index=False)