In [2]:
# Importing required libraries
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Getting data from Wikipedia
* The Requests library is used to download the page, and BeautifulSoup to parse the HTML table

In [3]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the data table.
postal_page = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
postal_page.raise_for_status()
soup = BeautifulSoup(postal_page.text, 'html.parser')

# The postal code table is taken to be the first <table> on the page.
tables = soup.find_all('table')
if not tables:
    raise ValueError('No <table> element found in the downloaded page')
table = tables[0]

# Fall back to the table itself when the markup carries no explicit <tbody>.
table_body = table.find('tbody')
if table_body is None:
    table_body = table
rows = table_body.find_all('tr')

# One list of stripped cell texts per <tr>. A row with no <td> cells yields an
# empty list, which pandas pads to an all-None row; those rows are removed in
# the cleaning step via dropna on PostalCode.
table_data = [
    [col.text.strip() for col in row.find_all('td')]
    for row in rows
]

postal_df = pd.DataFrame(table_data, columns=['PostalCode','Borough','Neighborhood'])
print(postal_df.head())

  PostalCode       Borough      Neighborhood
0       None          None              None
1        M1A  Not assigned      Not assigned
2        M2A  Not assigned      Not assigned
3        M3A    North York         Parkwoods
4        M4A    North York  Victoria Village


# Cleaning and processing dataframe

In [11]:
# Dropping empty cells using "PostalCode"
# (rows scraped without any data cells come through with PostalCode = None).
postal_df = postal_df.dropna(subset=['PostalCode'])

# Ignoring cells with a borough that is Not assigned.
# Exact comparison rather than str.contains: contains() does a substring/regex
# match and would also discard any borough whose name merely includes the text.
postal_df = postal_df[postal_df['Borough'] != 'Not assigned']

# Assigning borough as neighborhood, if a cell has a borough but a Not assigned neighborhood.
# This is applied per row *before* grouping, so one "Not assigned" entry cannot
# overwrite the real neighborhoods that share its postal code after the join.
postal_df = postal_df.assign(
    Neighborhood=postal_df['Neighborhood'].where(
        postal_df['Neighborhood'] != 'Not assigned',
        postal_df['Borough'],
    )
)

# Grouping the Neighborhoods into one row per (PostalCode, Borough),
# joined with commas.
postal_df = (
    postal_df
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)

print(postal_df.head())
print(postal_df.shape)


  PostalCode      Borough                          Neighborhood
0        M1B  Scarborough                         Rouge,Malvern
1        M1C  Scarborough  Highland Creek,Rouge Hill,Port Union
2        M1E  Scarborough       Guildwood,Morningside,West Hill
3        M1G  Scarborough                                Woburn
4        M1H  Scarborough                             Cedarbrae
(103, 3)


# Adding Geospatial coordinates

In [14]:
# Read the coordinates file and rename its 'Postal Code' column to 'PostalCode'
# so it shares a key name with the postal code dataframe.
geo_df = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)
print(geo_df.head())

  PostalCode   Latitude  Longitude
0        M1B  43.806686 -79.194353
1        M1C  43.784535 -79.160497
2        M1E  43.763573 -79.188711
3        M1G  43.770992 -79.216917
4        M1H  43.773136 -79.239476


In [15]:
# Merging postal data and geospatial coordinates.
# Left join on PostalCode: every row of postal_df is kept, and the columns of
# geo_df are attached where the postal code matches.
toronto_df = pd.merge(
    postal_df,
    geo_df,
    how='left',
    on='PostalCode',
)

print(toronto_df.head())

  PostalCode      Borough                          Neighborhood   Latitude  \
0        M1B  Scarborough                         Rouge,Malvern  43.806686   
1        M1C  Scarborough  Highland Creek,Rouge Hill,Port Union  43.784535   
2        M1E  Scarborough       Guildwood,Morningside,West Hill  43.763573   
3        M1G  Scarborough                                Woburn  43.770992   
4        M1H  Scarborough                             Cedarbrae  43.773136   

   Longitude  
0 -79.194353  
1 -79.160497  
2 -79.188711  
3 -79.216917  
4 -79.239476  
