# Importing all libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import lxml.html as lxml
import bs4 as bs
import urllib.request

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20200731215157-0000
KERNEL_ID = a46f3b61-59f3-4cd4-b766-d26f152742b2
Libraries imported.


In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
result = requests.get(url)
beaustifulsoup = bs.BeautifulSoup(result.content,'lxml')
table = beaustifulsoup.find_all('table')[0]
df = pd.read_html(str(table))
df_1 = pd.read_json(df[0].to_json(orient='records'))

In [3]:
df_1.head()

Unnamed: 0,Borough,Neighbourhood,Postal Code
0,Not assigned,Not assigned,M1A
1,Not assigned,Not assigned,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,"Regent Park, Harbourfront",M5A


# Processing cells that have an assigned borough and ignore 'Not assigned'

In [4]:
df_1['Borough'].replace('Not assigned', np.nan, inplace=True)
df_1.dropna(subset=['Borough'], inplace=True)
df_1

Unnamed: 0,Borough,Neighbourhood,Postal Code
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,"Regent Park, Harbourfront",M5A
5,North York,"Lawrence Manor, Lawrence Heights",M6A
6,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A
8,Etobicoke,"Islington Avenue, Humber Valley Village",M9A
9,Scarborough,"Malvern, Rouge",M1B
11,North York,Don Mills,M3B
12,East York,"Parkview Hill, Woodbine Gardens",M4B
13,Downtown Toronto,"Garden District, Ryerson",M5B


# Grouping Borough and Postal Code

In [5]:
df_2 = df_1[df_1['Borough'] != 'Not assigned']
df_2 = df_2.groupby(['Borough', 'Postal Code'], as_index=False).agg(','.join)
df_2

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"
5,Central Toronto,M4V,"Summerhill West, Rathnelly, South Hill, Forest..."
6,Central Toronto,M5N,Roselawn
7,Central Toronto,M5P,"Forest Hill North & West, Forest Hill Road Park"
8,Central Toronto,M5R,"The Annex, North Midtown, Yorkville"
9,Downtown Toronto,M4W,Rosedale


# Changing 'Not Assigned' neighborhood to 'Borough' if any present

In [6]:
df_2['Neighbourhood'] = np.where(df_2['Neighbourhood'] == 'Not assigned', df_2['Borough'], df_2['Neighbourhood'])
df_2

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"
5,Central Toronto,M4V,"Summerhill West, Rathnelly, South Hill, Forest..."
6,Central Toronto,M5N,Roselawn
7,Central Toronto,M5P,"Forest Hill North & West, Forest Hill Road Park"
8,Central Toronto,M5R,"The Annex, North Midtown, Yorkville"
9,Downtown Toronto,M4W,Rosedale


# Using .Shape method to print number of rows in dataframe

In [7]:
df_2.shape

(103, 3)

# Get the csv file for geographical coordinates 

In [14]:
geo_url = "https://cocl.us/Geospatial_data"

In [15]:
df_geo = pd.read_csv(geo_url)
df_geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


# Merging dataframe

In [18]:
df_mergeGeo = pd.merge(df_2, df_geo, on=['Postal Code'], how='inner')
df_mergeGeo

Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316
5,Central Toronto,M4V,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
6,Central Toronto,M5N,Roselawn,43.711695,-79.416936
7,Central Toronto,M5P,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
8,Central Toronto,M5R,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
9,Downtown Toronto,M4W,Rosedale,43.679563,-79.377529
