In [31]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### The next two cells are just to set up the creation of the table. I get the source of the table using the requests package and then just get the headings.

In [38]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

In [39]:
mytable = soup.find('table')
ths = mytable.find_all('th')
headings = [th.text.strip() for th in ths]
print(headings)

['Postcode', 'Borough', 'Neighbourhood']


### First, I get all of the text by stripping it out of the HTML.
### Then, I initialize some lists and fill them with values depending on what the index is, since the "entries" has a whole row in groups of 3. If any of the post codes and borough pairs are the same, then I append the second neighborhood onto the first and remove the second's row.
### Once I have the lists, I create a data frame and then filter it to remove any Borough that is Not Assigned.

In [89]:
tds = mytable.find_all('td')
entries = [td.text.strip() for td in tds]
post = []
borough = []
neighborhood = []
for i in range(0,len(entries)):
    if i%3 == 0:
               post.append(entries[i])
    elif i%3 == 1:
               borough.append(entries[i])
    elif i%3 == 2:
               neighborhood.append(entries[i])

for i in range(0,len(post)):
    for j in range(0,len(post)):
        try:
            if post[i] == post[j] and borough[i] == borough[j]:
                neighborhood[i] = neighborhood[i] + ", " + neighborhood[j]
                del post[j]
                del borough[j]
                del neighborhood[j]
        except: pass

wiki = pd.DataFrame({'Postcode':post,'Borough':borough,'Neighbourhood':neighborhood})
wiki = wiki[wiki["Borough"] != "Not assigned"]
wiki.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M6A,North York,Lawrence Manor
5,M1B,Scarborough,"Rouge, Malvern"
6,M3B,North York,Don Mills North
7,M4B,East York,Parkview Hill
8,M5B,Downtown Toronto,Garden District
10,M9B,Etobicoke,"Martin Grove, Cloverdale, Islington, Princess ..."
11,M1C,Scarborough,"Port Union, Highland Creek, Rouge Hill"
12,M3C,North York,"Flemingdon Park, Don Mills South"


In [50]:
wiki.shape

(74, 3)

In [90]:
geocodes = pd.read_csv('https://cocl.us/Geospatial_data')

### I decided to use the CSV since the geocodes package was not working. Below, I did a join by using the post codes as indices, then bringing it back as a named column

In [113]:
wikijoin = wiki.set_index("Postcode")
wikijoin = wikijoin.join(geocodes.set_index("Postal Code"))
wikijoin = wikijoin.reset_index()
wikijoin.rename(columns={"index":"Postcode"}, inplace=True)
wikijoin.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Highland Creek, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, West Hill",43.763573,-79.188711
3,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
4,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
5,M1K,Scarborough,"Kennedy Park, East Birchmount Park, Ionview",43.727929,-79.262029
6,M1L,Scarborough,"Oakridge, Clairlea, Golden Mile",43.711112,-79.284577
7,M1M,Scarborough,"Scarborough Village West, Cliffcrest, Cliffside",43.716316,-79.239476
8,M1N,Scarborough,Cliffside West,43.692657,-79.264848
9,M1P,Scarborough,"Wexford Heights, Dorset Park, Scarborough Town...",43.75741,-79.273304
