### Find Neighborhoods' Coordinates

In [47]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Create a BeautifulSoup object from the Wikipedia page.

In [48]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# NOTE: despite its name, `html` holds the page URL; the markup itself ends up in `source`.
html = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(html, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In the returned HTML, the information we are interested in lives in the `<tr>` rows and their `<th>`/`<td>` cells; extract the text of each cell and store it in a list, one entry per row.

In [49]:
# Build one list of cell texts per table row: header cells (<th>) when the row
# has any, otherwise its data cells (<td>). Surrounding newlines are stripped.
templist = []
for row in soup.find_all('tr'):
    header_cells = row.find_all('th')
    cells = header_cells if len(header_cells) != 0 else row.find_all('td')
    templist.append([cell.text.strip('\n') for cell in cells])

Convert the templist into a pandas dataframe.

In [50]:
# Row 0 of templist supplies the column names. The data rows are everything after it
# except the last five entries (presumably rows that are not part of the postal-code
# table -- TODO confirm); only the first three cells of each row are kept.
table_rows = [cells[:3] for cells in templist[1:-5]]
df = pd.DataFrame(table_rows)
df.columns = templist[0]
print(df.shape)
df.head()

(289, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Process the dataframe. First, remove all records whose Borough is 'Not assigned'.

In [51]:
# Keep only the rows that have an assigned Borough. The explicit .copy() makes the
# result an independent frame, so later in-place assignments through .loc do not
# trigger pandas' SettingWithCopyWarning.
df = df[df.Borough != 'Not assigned'].copy()
print(df.shape)
df.head()

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Second, for records that have a Borough but no Neighbourhood, use the Borough as the Neighbourhood.

In [52]:
# Rows whose Neighbourhood is missing take their Borough name instead.
# The assignment aligns on the index, so each row receives its own Borough.
unnamed = df['Neighbourhood'] == 'Not assigned'
temp = df.loc[unnamed, 'Borough']
df.loc[unnamed, 'Neighbourhood'] = temp
# Renumber the rows 0..n-1 after the earlier filtering.
df = df.reset_index(drop=True)
print(df.shape)
df.head()

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Third, group records that share the same Postcode and Borough into a single row, with their neighbourhoods joined into one comma-delimited string.

In [53]:
# Collapse each (Postcode, Borough) pair into a single row whose Neighbourhood is the
# comma-joined list of its neighbourhoods. reset_index() turns the group keys back into
# ordinary columns, which yields the Postcode / Borough / Neighbourhood layout directly.
df4 = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)
# Show full cell contents so long neighbourhood lists are not truncated. None is the
# supported "no limit" value (pandas >= 1.0); the old -1 sentinel has been deprecated
# since pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)


To examine whether the rows were combined correctly, check one postcode that has several neighbourhoods: M9V.

In [54]:
# Grouped view: the single combined Neighbourhood string for postcode M9V.
m9v_grouped = df4.loc[df4['Postcode'] == 'M9V', 'Neighbourhood']
print(m9v_grouped)
# Ungrouped view: the individual M9V rows that were combined into it.
df.loc[df['Postcode'] == 'M9V']

101    Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown
Name: Neighbourhood, dtype: object


Unnamed: 0,Postcode,Borough,Neighbourhood
174,M9V,Etobicoke,Albion Gardens
175,M9V,Etobicoke,Beaumond Heights
176,M9V,Etobicoke,Humbergate
177,M9V,Etobicoke,Jamestown
178,M9V,Etobicoke,Mount Olive
179,M9V,Etobicoke,Silverstone
180,M9V,Etobicoke,South Steeles
181,M9V,Etobicoke,Thistletown


Let's see the final data frame size

In [55]:
# Report the (rows, columns) size of the grouped frame, then preview its first five rows.
print(df4.shape)
df4.head(n=5)

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Find the latitude and longitude of each postcode.

In [56]:
# Shell escape: install the geocoder package from the conda-forge channel
# (--yes answers the confirmation prompt automatically).
!conda install -c conda-forge geocoder --yes

Solving environment: done

# All requested packages already installed.



In [57]:
import geocoder

def getCoord(postal_code, max_attempts=None):
    """Look up the latitude and longitude of a Toronto postal code.

    Queries ``geocoder.google`` with "<postal_code>, Toronto, Ontario" and
    retries for as long as the lookup yields no coordinates.

    Parameters
    ----------
    postal_code : str
        Postal code to geocode, e.g. 'M5A'.
    max_attempts : int or None, optional
        Maximum number of lookups before giving up. ``None`` (the default)
        retries indefinitely, which blocks forever if the service never
        returns coordinates.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` as reported by the geocoder.

    Raises
    ------
    RuntimeError
        If ``max_attempts`` is set and that many lookups all came back
        without coordinates.
    """
    lat_lng_coords = None
    attempts = 0
    print(postal_code)
    # Treat both None and an empty result as "no answer yet".
    while not lat_lng_coords:
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                'No coordinates for {} after {} attempts'.format(postal_code, attempts)
            )
        attempts += 1
        print('looping...')
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        print(g, g.latlng)

    # Return the value assigned in the loop; the misspelled name `lat_lng_coord`
    # does not exist and raised NameError on every successful lookup.
    return lat_lng_coords[0], lat_lng_coords[1]
print("Coordinates collector is ready to use!")

Coordinates collector is ready to use!


Because getting the latitude and longitude from `geocoder.google` takes too long even for a single postal code, use the CSV file that already contains the coordinates we are looking for.

In [61]:
# Load the pre-computed postcode coordinates and give the three columns the
# names used by the rest of the notebook.
CoordList = 'Geospatial_Coordinates.csv'
coord_columns = ['Postcode', 'Latitude', 'Longitude']
coord_df = pd.read_csv(CoordList)
coord_df.columns = coord_columns
coord_df.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now combine `coord_df` with the dataframe we scraped from Wikipedia earlier.

In [63]:
# Inner-join the grouped neighbourhood table with the coordinates on the shared
# Postcode column; postcodes missing from either side are dropped.
combined = df4.merge(coord_df, how='inner', on='Postcode')
combined.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
