In [1]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
from project_lib import Project
project = Project(project_id='e52cc99b-da11-4f13-8bab-8d751b588c56', project_access_token='p-eb2870c8017d6d43152d4cfe58b6a8539536841a')
pc = project.project_context


## This notebook combines two data frames, one contains coordinates and the other postal codes of Toronto neighborhoods.

### So, let's start by importing related libraries as usual.

In [1]:
import numpy as np 
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  13.09 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  18.93 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  33.35 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  37.01 MB/s
vincent-0.4.4- 100% |###################

### Scrap wikipedia data on Toronto neighborhood location data and create a data frame, say, df.

In [2]:
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header = 0)

df = pd.DataFrame(data[0])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Rename "Postcode" with "PostalCode".

In [3]:
df.rename(columns={'Postcode': 'PostalCode', 'Borough': 'Borough', 'Neighbourhood': 'Neighbourhood'}, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Let's see the number of rows.

In [4]:
df.shape

(289, 3)

### Remove rows having values "Not assigned" in the column "Borough".

In [5]:
df= df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Check again the number of rows.

In [6]:
df.shape

(212, 3)

### Re-indexing.

In [7]:
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Merging all neighborhoods under the same postal codes.

In [10]:
df = df.groupby('Borough').agg({'PostalCode':'first', 
                             'Neighbourhood': ', '.join 
                              }).reset_index()
df.head(20)

Unnamed: 0,Borough,PostalCode,Neighbourhood
0,Central Toronto,M4N,"Lawrence Park, Davisville North, North Toronto..."
1,Downtown Toronto,M4W,"Rosedale, Cabbagetown, St. James Town, Church ..."
2,East Toronto,M4E,"The Beaches, Riverdale, The Danforth West, Ind..."
3,East York,M4B,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,Etobicoke,M8V,"Humber Bay Shores, Mimico South, New Toronto, ..."
5,Mississauga,M7R,Canada Post Gateway Processing Centre
6,North York,M2H,"Hillcrest Village, Fairview, Henry Farm, Oriol..."
7,Queen's Park,M7A,Not assigned
8,Scarborough,M1B,"Malvern, Rouge, Highland Creek, Port Union, Ro..."
9,West Toronto,M6H,"Dovercourt Village, Dufferin, Little Portugal,..."


### Now see the reduced number of rows.

In [11]:
df.shape

(11, 3)

### Shifting "PostalCode" column to the first position.

In [12]:
columnsList=["PostalCode","Borough", "Neighbourhood"]
df=df.reindex(columns=columnsList)
df.head(20)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto..."
1,M4W,Downtown Toronto,"Rosedale, Cabbagetown, St. James Town, Church ..."
2,M4E,East Toronto,"The Beaches, Riverdale, The Danforth West, Ind..."
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto, ..."
5,M7R,Mississauga,Canada Post Gateway Processing Centre
6,M2H,North York,"Hillcrest Village, Fairview, Henry Farm, Oriol..."
7,M7A,Queen's Park,Not assigned
8,M1B,Scarborough,"Malvern, Rouge, Highland Creek, Port Union, Ro..."
9,M6H,West Toronto,"Dovercourt Village, Dufferin, Little Portugal,..."


### Replace the value "Not assigned"  with "Queen's Park" from the "Neighbourhood" column same as Borough's name.

In [13]:
df.replace({'Neighbourhood': 'Not assigned'}, {'Neighbourhood': "Queen's Park"}, regex=True)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto..."
1,M4W,Downtown Toronto,"Rosedale, Cabbagetown, St. James Town, Church ..."
2,M4E,East Toronto,"The Beaches, Riverdale, The Danforth West, Ind..."
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto, ..."
5,M7R,Mississauga,Canada Post Gateway Processing Centre
6,M2H,North York,"Hillcrest Village, Fairview, Henry Farm, Oriol..."
7,M7A,Queen's Park,Queen's Park
8,M1B,Scarborough,"Malvern, Rouge, Highland Creek, Port Union, Ro..."
9,M6H,West Toronto,"Dovercourt Village, Dufferin, Little Portugal,..."


### Now let's download another csv file which contains coordinates of all neighborhoods of Toronto area alongwith corresponding postal codes. Call this dataframe as df2.

In [15]:
data2 = pd.read_csv('http://cocl.us/Geospatial_data', header = 0)

df2 = pd.DataFrame(data2)
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Let's see its shape, i.e., number of rows it contains.

In [16]:
df2.shape

(103, 3)

### Renaming "Postal Code" with "PostalCode" to make identical with that of the previous dataframe df.

In [17]:
df2.rename(columns={'Postal Code': 'PostalCode', 'Latitude': 'Latitude', 'Longitude': 'Longitude'}, inplace=True)
df2.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Now, let's merge the two dataframe keeping the "PostalCode" unique.

In [18]:
df3 = pd.merge(df, df2, on="PostalCode")
df3.head(20)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto...",43.72802,-79.38879
1,M4W,Downtown Toronto,"Rosedale, Cabbagetown, St. James Town, Church ...",43.679563,-79.377529
2,M4E,East Toronto,"The Beaches, Riverdale, The Danforth West, Ind...",43.676357,-79.293031
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig...",43.706397,-79.309937
4,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto, ...",43.605647,-79.501321
5,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
6,M2H,North York,"Hillcrest Village, Fairview, Henry Farm, Oriol...",43.803762,-79.363452
7,M7A,Queen's Park,Not assigned,43.662301,-79.389494
8,M1B,Scarborough,"Malvern, Rouge, Highland Creek, Port Union, Ro...",43.806686,-79.194353
9,M6H,West Toronto,"Dovercourt Village, Dufferin, Little Portugal,...",43.669005,-79.442259
