## Setting up the Environment

In [55]:
# Numerical / tabular data libraries
import numpy as np
import pandas as pd
# Lift pandas' display limits so dataframes are shown without column/row truncation
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import json
# Geocoder class from geopy
from geopy.geocoders import Nominatim
# HTTP client and HTML parser used for scraping
import requests
from bs4 import BeautifulSoup
# NOTE(review): this import path was deprecated in pandas 1.0 in favour of
# pandas.json_normalize — confirm it still resolves on the installed pandas version
from pandas.io.json import json_normalize
# Matplotlib colour-map helpers
import matplotlib.cm as cm
import matplotlib.colors as colors
# K-means clustering
from sklearn.cluster import KMeans
# Map rendering
import folium
print("Libraries imported.")

Libraries imported.


## Building the code to scrape the Wikipedia page html

In [56]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page as if it were the postal-code table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text
# parsing data from HTML into beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')
# creating lists to store the data into a table
postalCode_List = []
borough_List = []
neighborhood_List = []

In [57]:
# Fill the three column lists from the first <table> element on the page.
# The lists are created in an earlier cell, so empty them first: this keeps the
# cell idempotent (re-running it no longer appends a second copy of every row).
postalCode_List.clear()
borough_List.clear()
neighborhood_List.clear()

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. a header row) carry no data, and a row with
    # fewer than three cells cannot supply all three fields — skip both rather
    # than raising IndexError on cells[1] / cells[2].
    if len(cells) >= 3:
        postalCode_List.append(cells[0].text.rstrip('\n'))
        borough_List.append(cells[1].text.rstrip('\n'))
        neighborhood_List.append(cells[2].text.rstrip('\n'))

In [58]:
# Assemble the scraped lists into a dataframe, one column per list
scraped_columns = {
    "PostalCode": postalCode_List,
    "Borough": borough_List,
    "Neighborhood": neighborhood_List,
}
toronto_df = pd.DataFrame(data=scraped_columns)

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Ignoring cells with a borough that is 'Not assigned'

In [59]:
# Keep only the rows whose borough has been assigned, then renumber the index from 0
has_borough = toronto_df["Borough"].ne("Not assigned")
toronto_df_drop_NA = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df_drop_NA.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Combining neighborhoods that share the same postal code and borough

In [60]:
# One row per (PostalCode, Borough) pair; the neighborhoods of each pair are
# joined into a single comma-separated string
postal_groups = toronto_df_drop_NA.groupby(["PostalCode", "Borough"], as_index=False)
toronto_df_grp = postal_groups.agg(", ".join)
toronto_df_grp.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Using the borough name as the neighborhood where the neighborhood is 'Not assigned'

In [61]:
# Where the neighborhood is 'Not assigned', use the borough name instead.
# This is done with a boolean mask and .loc so the change is written to the
# dataframe itself: assigning to the `row` object yielded by iterrows() is not
# guaranteed to write back to the frame it came from.
neighborhood_unassigned = toronto_df_grp["Neighborhood"] == "Not assigned"
toronto_df_grp.loc[neighborhood_unassigned, "Neighborhood"] = toronto_df_grp.loc[neighborhood_unassigned, "Borough"]

toronto_df_grp.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Validating data based on the list in the question

In [62]:
# Spot-check a hand-picked list of postal codes against the grouped table
column_names = ["PostalCode", "Borough", "Neighborhood"]

test_toronto_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect the matching rows for each code (in the order of the list; codes that
# are absent contribute no rows) and concatenate once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it on every
# iteration.
matching_rows = [toronto_df_grp[toronto_df_grp["PostalCode"] == postcode] for postcode in test_toronto_list]
test_toronto_df = pd.concat(matching_rows, ignore_index=True)[column_names]

test_toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


## Printing the number of rows of the dataframe

In [63]:
# Show the dataframe's dimensions: .shape is a (rows, columns) tuple,
# so its first element is the number of rows
toronto_df_grp.shape

(103, 3)

## Loading the geographical coordinates from Geospatial_data csv file

In [77]:
# Load the geographical coordinates from the Geospatial_data CSV file that was uploaded to the Jupyter Notebook.
# NOTE(review): `body` is not defined in any cell visible here — presumably it was created by a
# platform-generated data-access cell that has since been removed; confirm, because as written
# this cell raises NameError on a fresh kernel.

geo_coordinates = pd.read_csv(body)
geo_coordinates.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [74]:
# Rename the column "Postal Code" to "PostalCode" so it matches the key column
# of the neighborhood table. The result is rebound rather than renamed with
# inplace=True, so no existing dataframe object is mutated behind the scenes;
# re-running the cell is harmless because rename ignores labels that are absent.
geo_coordinates = geo_coordinates.rename(columns={"Postal Code": "PostalCode"})
geo_coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Adding the coordinates to the neighborhood table by merging on postal code

In [75]:
# Left-join the coordinates onto the grouped table, matching rows on 'PostalCode';
# every row of the grouped table is kept
toronto_df_merged = pd.merge(
    toronto_df_grp,
    geo_coordinates,
    how="left",
    on="PostalCode",
)
toronto_df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Validating the coordinates based on the assignment

In [76]:
# Spot-check the coordinates of a hand-picked list of postal codes in the merged table
column_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]

test_toronto_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect the matching rows for each code (in the order of the list; codes that
# are absent contribute no rows) and concatenate once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it on every
# iteration.
matching_rows = [toronto_df_merged[toronto_df_merged["PostalCode"] == postcode] for postcode in test_toronto_list]
test_toronto_df = pd.concat(matching_rows, ignore_index=True)[column_names]

test_toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
