In [1]:
import pandas as pd
import numpy as np
import json # library to handle JSON files

!pip install geopy

import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

print('Libraries imported.')

Libraries imported.


# First Part
This part contains extracting the data from Wikipedia and processing it into a dataframe by removing 'Not Assigned' Boroughs and formating the Neighborhood column according to specifications.

The only assumption made is that there are no Neighboorhoods that are not named (except for those Not Assigned Boroughs) and that can be seen in the Wikipedia page. 

In [2]:
url_html='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

df = pd.read_html(url_html) #load into a data frame
df = df[0]
df['Borough'].value_counts() #Find how many boroughs are not assigned

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

In [3]:
df.dropna(inplace = True) #drop 'Not Assigned' boroughs

In [4]:
#Format DF
df.reset_index(inplace=True)
df.drop(columns = 'index',inplace =True)

In [5]:
# Create new columns and set them as the dataframe's columns
columns = ['PostalCode', 'Borough', 'Neighborhood']
df.columns = columns

In [6]:
df.dtypes

PostalCode      object
Borough         object
Neighborhood    object
dtype: object

In [7]:
hood = df['Neighborhood']
neighborhood = hood.str.replace('/',',') # Replace parentheses with commas as requested.
neighborhood = pd.DataFrame(neighborhood)

In [8]:
df.drop(columns = 'Neighborhood',inplace =True) #drop original Neighborhood column that contains parentheses to separate neighborhoods

df1 =pd.concat([df,neighborhood], axis = 1, sort=False) #Join orignal dataframe to the neighborhood dataframe, which contains the changes made

Below is the .shape method to print the number of rows of your dataframe.

In [9]:
print("Shape of DataFrame: {}".format(df1.shape)) 
df1.head(10)

Shape of DataFrame: (103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


# Second Part
In this part, the latitudes and longitudes of the different locations are retrieved, by the postal code, using Geocoder. 

In [10]:
!pip install geocoder==1.5. #install geocoder

Collecting geocoder==1.5.
[?25l  Downloading https://files.pythonhosted.org/packages/2d/ea/9554295b2abce67935ae1640ae8d8aa9cadc0f42deb27b3f6fc432a4e541/geocoder-1.5.0-py2.py3-none-any.whl (50kB)
[K     |████████████████████████████████| 51kB 7.2MB/s eta 0:00:011
[?25hCollecting ratelim (from geocoder==1.5.)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.5.0 ratelim-0.1.6


In [11]:
col_names = ['Latitude', 'Longitude']
cols = pd.DataFrame(columns=col_names)

In [12]:
import geocoder # import geocoder


for code in df['PostalCode']: #For loop to go through all the postal code values
    
    zip_code = code #initalizating variable for postal code
    
    lat_lng_coords = None 
    
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(zip_code))
        lat_lng_coords = g.latlng

    
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    cols = cols.append({'Latitude': latitude,
                       'Longitude': longitude}, ignore_index=True) #append all the values into the dataframe

In [13]:
cols.head(20)

Unnamed: 0,Latitude,Longitude
0,43.752935,-79.335641
1,43.728102,-79.31189
2,43.650964,-79.353041
3,43.723265,-79.451211
4,43.66179,-79.38939
5,43.667481,-79.528953
6,43.808626,-79.189913
7,43.7489,-79.35722
8,43.707193,-79.311529
9,43.657491,-79.377529


In [14]:
df2 = pd.concat([df1,cols],axis = 1, sort=False) #concatenate the coordinates dataframe and the locations dataframe

In [15]:
df2.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.66179,-79.38939
5,M9A,Etobicoke,Islington Avenue,43.667481,-79.528953
6,M1B,Scarborough,"Malvern , Rouge",43.808626,-79.189913
7,M3B,North York,Don Mills,43.7489,-79.35722
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.707193,-79.311529
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529


In [16]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1f             |       h516909a_0         2.1 MB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    ------------------------------------------------------------
                       

In [17]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto