Step 1 - Import the required libraries

In [1]:
#import required libraries
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
from bs4 import BeautifulSoup #import the Beautiful soup functions to parse the data returned from the website
from geopy.geocoders import Nominatim # import Nominatim for lan, lat

Step 2 - Use the requests and BeautifulSoup libraries to scrape the data from the Wikipedia page and identify the table to extract

In [2]:
# Wikipedia page used as the source of the Toronto neighbourhood table
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# a timeout keeps the cell from hanging indefinitely if the server never answers
results = requests.get(url, timeout=30)
# fail here on an HTTP error (4xx/5xx) instead of parsing an error page as if it were the data
results.raise_for_status()
soup = BeautifulSoup(results.text,'html.parser')
all_tables=soup.find_all('table')
# give a readable error if the page layout changed and no table is present
if not all_tables:
    raise ValueError('No <table> element found at ' + url)
# the first table on the page is the one parsed in the next step
right_table = all_tables[0]

Step 3 - Extract the table data from the web page and store it in a pandas DataFrame

In [3]:
# collect every <tr> row of the selected table
rows = right_table.find_all("tr")
rows_count = len(rows)
# one list per output column, filled row by row
postcode = []
borough = []
neighbourhood = []

# rows[0] is skipped, as before (presumably the header row, which has no <td> cells)
for row in rows[1:]:
    cells = row.find_all('td')
    # ignore rows that do not carry the three expected cells instead of raising IndexError
    if len(cells) < 3:
        continue
    # strip surrounding whitespace/newlines from every cell so that later exact
    # comparisons such as == 'Not assigned' behave the same for all three columns
    postcode.append(cells[0].text.strip())
    borough.append(cells[1].text.strip())
    neighbourhood.append(cells[2].text.strip())

toronto_df = pd.DataFrame({'postcode':postcode,
                           'borough':borough,
                           'neighbourhood':neighbourhood})
toronto_df.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# dimensions of toronto_df as a (rows, columns) tuple
toronto_df.shape

(287, 3)

Step 4 - Clean the DataFrame by removing the rows whose borough is 'Not assigned'

In [5]:
# keep only the rows whose borough is not 'Not assigned';
# the result is an independent copy, so toronto_df itself is left untouched
has_borough = toronto_df['borough'] != 'Not assigned'
toronto_df_clean = toronto_df.loc[has_borough].copy()
toronto_df_clean.head()

Unnamed: 0,postcode,borough,neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [6]:
# list the rows whose neighbourhood is still 'Not assigned';
# for these the neighbourhood will be set to the borough name
unnamed = toronto_df_clean['neighbourhood'].eq('Not assigned')
toronto_df_clean.loc[unnamed]

Unnamed: 0,postcode,borough,neighbourhood
7,M7A,Queen's Park,Not assigned


In [7]:
# replace the 'Not assigned' placeholder in neighbourhood with the borough of the same row
# (one vectorised mask assignment instead of a Python-level iterrows() loop)
unnamed = toronto_df_clean['neighbourhood'] == 'Not assigned'
# both sides are selected with the same mask, so values are aligned row by row on the index
toronto_df_clean.loc[unnamed, 'neighbourhood'] = toronto_df_clean.loc[unnamed, 'borough']



In [8]:
# show the rows in which neighbourhood and borough hold the same value
same_as_borough = toronto_df_clean['neighbourhood'].eq(toronto_df_clean['borough'])
toronto_df_clean[same_as_borough]

Unnamed: 0,postcode,borough,neighbourhood
7,M7A,Queen's Park,Queen's Park


Step 5 - Group the neighbourhoods by postcode and borough. More than one neighbourhood can exist in one postal code area. For example, in the table on the Wikipedia page, M5A is listed twice and has two neighbourhoods: Harbourfront and Regent Park. These two rows are combined into one row, with the neighbourhoods separated by a comma, as shown in row 11 of the table above.

In [9]:
# one row per (postcode, borough) pair; the neighbourhood names of each pair
# are concatenated into a single ', '-separated string
by_area = toronto_df_clean.groupby(['postcode','borough'])
toronto_df_grouped = by_area['neighbourhood'].agg(', '.join).reset_index()
toronto_df_grouped.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Step 6 - Print the shape of the final DataFrame

In [10]:
# dimensions of toronto_df_grouped as a (rows, columns) tuple
toronto_df_grouped.shape

(103, 3)

Step 7 - Map each postal code to its geographical coordinates. The geocoding API is not working reliably (it returns a time-out error most of the time), so the coordinates are taken from a CSV file instead.


In [21]:
# read the coordinates file and rename its columns to the names used in this notebook
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists
geo_csv_path = r'C:\DataScience\IBM Data Science\Final_project\Geospatial_Coordinates.csv'
geo_df = pd.read_csv(geo_csv_path).set_axis(['postcode','Latitude','Longitude'], axis=1)
geo_df.head()

Unnamed: 0,postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# dimensions of geo_df as a (rows, columns) tuple
geo_df.shape


(103, 3)

In [24]:
# attach Latitude/Longitude to each grouped row by joining on 'postcode'
# (inner join: only postcodes present in both frames are kept), then display the result
toronto_geo = toronto_df_grouped.merge(geo_df, how='inner', on='postcode')
toronto_geo

Unnamed: 0,postcode,borough,neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
