In [0]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

print('Libraries imported.')


## Part 1: Parsing HTML data

In [92]:
# pd.read_html returns a list with one DataFrame per table found on the page;
# take the first one and rename its "Postal Code" column to "PostalCode" for
# consistency.
tab = (
    pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
    .rename(columns={"Postal Code": "PostalCode"})
)
tab.head(n=15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [93]:
# Keep only the postal codes that have a borough assigned and renumber the
# rows from 0. reset_index(drop=True) discards the old index directly, so no
# temporary 'index' column has to be removed afterwards and the filtered
# frame is never mutated in place.
tab = tab.loc[tab['Borough'] != 'Not assigned'].reset_index(drop=True)
tab.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### To find out the number of rows contained in our data we use the `.shape` attribute

In [94]:
# The length of the row index is the number of rows in the frame.
print("Our Dataframe has {} rows".format(len(tab.index)))

Our Dataframe has 103 rows


## Part 2: Getting the location coordinates

In [96]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |███▎                            | 10kB 17.8MB/s eta 0:00:01[K     |██████▋                         | 20kB 2.2MB/s eta 0:00:01[K     |██████████                      | 30kB 2.8MB/s eta 0:00:01[K     |█████████████▎                  | 40kB 3.1MB/s eta 0:00:01[K     |████████████████▋               | 51kB 2.6MB/s eta 0:00:01[K     |████████████████████            | 61kB 2.8MB/s eta 0:00:01[K     |███████████████████████▎        | 71kB 3.1MB/s eta 0:00:01[K     |██████████████████████████▋     | 81kB 3.4MB/s eta 0:00:01[K     |██████████████████████████████  | 92kB 3.6MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 2.7MB/s 
Collecting ratelim
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad4

In [110]:
# The geocoder API was not responsive, so the coordinates are loaded from the
# prepared CSV file instead.
cordtable = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
cordtable.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [132]:
# Rename "Postal Code" to "PostalCode" so the key column has the same name as
# in `tab`. Only column labels are remapped: the earlier index=str argument
# additionally converted every row label to a string, which a column rename
# does not need.
cordtable = cordtable.rename(columns={'Postal Code': 'PostalCode'})
cordtable.head(3)


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


In [136]:
# Attach the coordinates to each row of `tab`, matching on the postal code.
# A left join keeps exactly the rows of `tab` in their existing order; an
# outer join would also add rows for codes that appear only in `cordtable`
# and is documented to sort the join keys lexicographically.
cleantable = pd.merge(tab, cordtable, how='left', on='PostalCode')
cleantable.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [137]:
# Show the merged frame's dimensions as (rows, columns). The column count now
# includes the Latitude and Longitude columns added by the merge.
cleantable.shape

(103, 5)