In [63]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [64]:
# Download the Wikipedia page listing the "M" postal codes and parse it with lxml.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')
print(soup.title)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [65]:
# Locate the table with CSS classes "wikitable sortable" and collect all of its <td> cells.
postcode_table = soup.find('table', class_='wikitable sortable')
if postcode_table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
table_elements = postcode_table.find_all('td')  # find_all is the current name for findAll
print(table_elements[:20])

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
</td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
</td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>]


In [66]:
# Total number of <td> cells collected from the postcode table.
len(table_elements)


867

In [67]:
# Regroup the flat list of <td> cells into rows of three consecutive cells:
# postcode, borough, neighbourhood (text only, surrounding whitespace stripped).
# The loop bound is derived from the data rather than a hard-coded cell count, so
# the cell keeps working when the Wikipedia table gains or loses rows.
cells_per_row = 3
# Ignore a trailing partial row (if any) so the i+1 / i+2 lookups cannot run off the end.
usable_cells = len(table_elements) - len(table_elements) % cells_per_row
data_table = []
for i in range(0, usable_cells, cells_per_row):
    postcode = table_elements[i].text.strip()
    bourough = table_elements[i+1].text.strip()
    neighbourhood = table_elements[i+2].text.strip()
    data_table.append([postcode, bourough, neighbourhood])
data_table

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 

In [68]:
# Turn the scraped rows into a DataFrame, naming the three columns at construction time.
df_CanadaPostcodes = pd.DataFrame(
    data_table,
    columns=['Postcode', 'Bourough', 'Neighbourhood'],
)
df_CanadaPostcodes.head(10)

Unnamed: 0,Postcode,Bourough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [69]:
# Dimensions (rows, columns) of the freshly built postcode DataFrame.
df_CanadaPostcodes.shape


(289, 3)

In [70]:
# Keep only the rows whose borough is assigned, then renumber the index from zero.
has_borough = df_CanadaPostcodes['Bourough'] != 'Not assigned'
df_CanadaPostcodes = df_CanadaPostcodes.loc[has_borough].reset_index(drop=True)
df_CanadaPostcodes.head(10)

Unnamed: 0,Postcode,Bourough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [71]:
# Dimensions (rows, columns) after removing rows with a 'Not assigned' borough.
df_CanadaPostcodes.shape


(212, 3)

In [72]:
# Collapse to one row per (postcode, borough) pair; the neighbourhood names of each
# pair are concatenated into a single ", "-separated string.
neighbourhoods_by_postcode = df_CanadaPostcodes.groupby(['Postcode', 'Bourough'])['Neighbourhood']
df_CanadaPostcodes = neighbourhoods_by_postcode.apply(', '.join).reset_index()
df_CanadaPostcodes.head()

Unnamed: 0,Postcode,Bourough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [73]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# A single .loc assignment is used instead of chained indexing (df.col[mask] = ...):
# the chained form writes through an intermediate Series, which raises
# SettingWithCopyWarning and is not guaranteed to update the DataFrame
# (it does not under pandas Copy-on-Write).
missing_neighbourhoods = df_CanadaPostcodes['Neighbourhood'].values == 'Not assigned'
df_CanadaPostcodes.loc[missing_neighbourhoods, 'Neighbourhood'] = (
    df_CanadaPostcodes.loc[missing_neighbourhoods, 'Bourough']
)
df_CanadaPostcodes.head()

Unnamed: 0,Postcode,Bourough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [74]:
# Dimensions (rows, columns) after grouping neighbourhoods by postcode and borough.
df_CanadaPostcodes.shape


(103, 3)

In [75]:
# Load the CSV of per-postal-code geographical coordinates into a new DataFrame, geo_data.
urlfordata = "http://cocl.us/Geospatial_data"
geo_data = pd.read_csv(filepath_or_buffer=urlfordata)
geo_data.head(n=10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [76]:
# Give the key column the same name as in geo_data ('Postal Code') so both frames share it.
df_CanadaPostcodes = df_CanadaPostcodes.rename(columns={'Postcode': 'Postal Code'})
df_CanadaPostcodes.head()

Unnamed: 0,Postal Code,Bourough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [77]:
# Attach the coordinates from geo_data to each postcode row via the shared 'Postal Code' column.
final_df = df_CanadaPostcodes.merge(geo_data, on='Postal Code')
final_df.head()

Unnamed: 0,Postal Code,Bourough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [78]:
#importing the libraries necessary for analysis

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


In [79]:
# Use the geopy library to get the latitude and longitude values of Toronto City.

address = 'Toronto'
# Nominatim expects an identifying user agent: geopy 1.x emits a deprecation
# warning when it is omitted and geopy 2.x raises ConfigurationError.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute lookups below.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [80]:
# Keep only the rows whose borough name contains "Toronto", with a fresh zero-based index.
is_toronto_borough = final_df['Bourough'].str.contains("Toronto")
toronto_df = final_df.loc[is_toronto_borough].reset_index(drop=True)


In [81]:
# Preview the rows whose borough name contains "Toronto".
toronto_df.head()


Unnamed: 0,Postal Code,Bourough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [82]:
# Dimensions (rows, columns) of the Toronto-only DataFrame.
toronto_df.shape


(38, 5)

In [83]:
# Relabel the borough column as 'Area' and use the neighbourhood name as the row index.
toronto_df = (
    toronto_df
    .rename(columns={'Bourough': 'Area'})
    .set_index('Neighbourhood', drop=True)
)
toronto_df.head()

Unnamed: 0_level_0,Postal Code,Area,Latitude,Longitude
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Beaches,M4E,East Toronto,43.676357,-79.293031
"The Danforth West, Riverdale",M4K,East Toronto,43.679557,-79.352188
"The Beaches West, India Bazaar",M4L,East Toronto,43.668999,-79.315572
Studio District,M4M,East Toronto,43.659526,-79.340923
Lawrence Park,M4N,Central Toronto,43.72802,-79.38879


In [84]:
# Take an independent copy of the 'Rosedale' row, set its Area to 'Rosedale',
# and drop the postal code entry.
rosedale_row = toronto_df.loc['Rosedale', :].copy()
rosedale_row['Area'] = 'Rosedale'
original_loc = rosedale_row.drop(labels=['Postal Code'])
original_loc

Area         Rosedale
Latitude      43.6796
Longitude    -79.3775
Name: Rosedale, dtype: object

In [85]:
# Download the Wikipedia page on Melbourne postal district numbers and parse it with lxml.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/Postal_district_numbers_of_Melbourne',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')
print(soup.title)


<title>Postal district numbers of Melbourne - Wikipedia</title>


In [86]:
# Locate the table with CSS classes "wikitable sortable" and collect all of its <td> cells.
postcode_table = soup.find('table', class_='wikitable sortable')
if postcode_table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
table_elements = postcode_table.find_all('td')  # find_all is the current name for findAll
print(table_elements[:50])

[<td>(Carnegie North)</td>, <td>SE.10</td>, <td></td>, <td></td>, <td>* no Post Office
</td>, <td>(Hawthorn East)</td>, <td>E. 3</td>, <td></td>, <td></td>, <td>* no Post Office
</td>, <td>(Williamstown West)</td>, <td>W.17</td>, <td></td>, <td></td>, <td>* no Post Office
</td>, <td>Abbotsford</td>, <td>N. 9</td>, <td>1928</td>, <td><a href="/wiki/Abbotsford,_Victoria" title="Abbotsford, Victoria">Abbotsford</a></td>, <td>** name of district
</td>, <td><a href="/wiki/Essendon_Airport" title="Essendon Airport">Aerodrome Essendon</a></td>, <td>W. 6</td>, <td>1938</td>, <td><a href="/wiki/Essendon,_Victoria" title="Essendon, Victoria">Essendon</a></td>, <td>.
</td>, <td>Albert Park</td>, <td>SC. 6</td>, <td>1928</td>, <td><a href="/wiki/Albert_Park,_Victoria" title="Albert Park, Victoria">Albert Park</a></td>, <td>*
</td>, <td>Albert Park South</td>, <td>SC. 6</td>, <td>1947</td>, <td><a href="/wiki/Albert_Park,_Victoria" title="Albert Park, Victoria">Albert Park</a></td>, <td>.
</td>, <t

In [87]:
# Total number of <td> cells collected from the Melbourne postal-district table.
len(table_elements)


1790

In [88]:
# Regroup the flat list of <td> cells into rows of five consecutive cells and keep
# two of them per row: the first cell (stored as the locality) and the fourth cell
# (stored as the suburb), text only with surrounding whitespace stripped.
# The loop bound is derived from the data rather than a hard-coded cell count, so
# the cell keeps working when the Wikipedia table gains or loses rows.
cells_per_row = 5
# Ignore a trailing partial row (if any) so the i+3 lookup cannot run off the end.
usable_cells = len(table_elements) - len(table_elements) % cells_per_row
data_table = []
for i in range(0, usable_cells, cells_per_row):
    locality = table_elements[i].text.strip()
    suburb = table_elements[i+3].text.strip()
    data_table.append([locality, suburb])
data_table

[['(Carnegie North)', ''],
 ['(Hawthorn East)', ''],
 ['(Williamstown West)', ''],
 ['Abbotsford', 'Abbotsford'],
 ['Aerodrome Essendon', 'Essendon'],
 ['Albert Park', 'Albert Park'],
 ['Albert Park South', 'Albert Park'],
 ['Alphington', 'Alphington'],
 ['Altona', 'Altona'],
 ['Altona East', 'Altona North'],
 ['Altona North', 'Altona North'],
 ['Argyle Street', 'Footscray West'],
 ['Armadale', 'Armadale'],
 ['Armadale North', 'Armadale'],
 ['Ascot Vale', 'Ascot Vale'],
 ['Ascot Vale East', 'Ascot Vale'],
 ['Ascot Vale RAAF', 'Ascot Vale'],
 ['Ascot Vale West', 'Ascot Vale'],
 ['Ashburton', 'Ashburton'],
 ['Ashburton East', 'Ashburton'],
 ['Ashwood', 'Ashwood'],
 ['Aspendale', 'Aspendale'],
 ['Auburn', 'Hawthorn East'],
 ['Auburn South', 'Hawthorn'],
 ['Balaclava', 'Balaclava'],
 ['Balwyn', 'Balwyn'],
 ['Balwyn East', 'Balwyn'],
 ['Balwyn North', 'Balwyn North'],
 ['Balwyn West', ''],
 ['Barker', 'Hawthorn'],
 ['Batman', 'Coburg North'],
 ['Beaumaris', 'Beaumaris'],
 ['Beaumaris South'

In [89]:
# Turn the scraped (locality, suburb) pairs into a DataFrame, naming the columns at construction time.
df_MelbourneArea = pd.DataFrame(
    data_table,
    columns=['Area', 'Suburb'],
)
df_MelbourneArea.head(10)

Unnamed: 0,Area,Suburb
0,(Carnegie North),
1,(Hawthorn East),
2,(Williamstown West),
3,Abbotsford,Abbotsford
4,Aerodrome Essendon,Essendon
5,Albert Park,Albert Park
6,Albert Park South,Albert Park
7,Alphington,Alphington
8,Altona,Altona
9,Altona East,Altona North


In [94]:
# Where the suburb is empty, fall back to the area name.
# A single .loc assignment is used instead of chained indexing (df.col[mask] = ...):
# the chained form writes through an intermediate Series, which raises
# SettingWithCopyWarning and is not guaranteed to update the DataFrame
# (it does not under pandas Copy-on-Write).
missing_suburbs = df_MelbourneArea['Suburb'].values == ''
df_MelbourneArea.loc[missing_suburbs, 'Suburb'] = df_MelbourneArea.loc[missing_suburbs, 'Area']
# TODO: some Area values contain commas (e.g. 'Were Street, Brighton',
# 'Geelong Road, Footscray'); normalise them here if later steps need comma-free names.
df_MelbourneArea.head()

Unnamed: 0,Area,Suburb
0,(Carnegie North),(Carnegie North)
1,(Hawthorn East),(Hawthorn East)
2,(Williamstown West),(Williamstown West)
3,Abbotsford,Abbotsford
4,Aerodrome Essendon,Essendon
