# Clustering Toronto - Part 1

### Load libraries

In [1]:
import html
import re
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

### Data Extraction and Cleaning

In [2]:
# Download the Wikipedia list of Canadian "M" postal codes and parse the HTML

with urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") as fp:
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and emits a warning), so the parse tree
    # could differ between machines. "html.parser" ships with Python itself.
    soup = BeautifulSoup(fp, "html.parser")

In [3]:
# Extract the <tr> rows of the first table on the page

first_table = soup.find('table')
if first_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an opaque AttributeError on the next line.
    raise ValueError("No <table> element found in the downloaded page")
table_html = first_table.find_all('tr')

In [4]:
# Define function for converting html to list

def cleanhtml(raw_html):
    """Strip markup from an HTML fragment and return its non-blank text lines.

    Parameters
    ----------
    raw_html
        Any object whose ``str()`` form is HTML (for example a parsed tag
        or a plain string).

    Returns
    -------
    list of str
        One entry per line of remaining text, with HTML entities decoded
        (``&amp;`` becomes ``&``), surrounding whitespace removed, and
        empty or whitespace-only lines dropped.
    """
    # Non-greedy match so each tag is removed on its own rather than
    # everything between the first "<" and the last ">" on a line.
    without_tags = re.sub(r'<.*?>', '', str(raw_html))
    # Decode entities only after the tags are gone, so an escaped "&lt;"
    # in the text is never mistaken for the start of a tag.
    text = html.unescape(without_tags)
    # Strip before filtering: a line holding only spaces would otherwise
    # survive as a bogus cell and shift the remaining columns.
    stripped_lines = (line.strip() for line in text.splitlines())
    return [line for line in stripped_lines if line]

In [5]:
# Turn every scraped <tr> row into a list of its cell texts

table = list(map(cleanhtml, table_html))

In [6]:
# Build the DataFrame from the scraped rows and clean it in one chain:
#   * the first scraped row supplies the column names,
#   * that same header row is then removed from the data,
#   * rows whose Borough is 'Not assigned' are discarded,
#   * reset_index() renumbers the rows and keeps the previous row labels
#     in a column named 'index'.

table_df = (
    pd.DataFrame(table, columns=table[0])
    .iloc[1:]
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    .reset_index()
)

### Data checking and verifying

In [7]:
# Number of rows remaining after cleaning
table_df.shape[0]

103

In [8]:
# Count the rows whose Neighbourhood is anything other than 'Not assigned'
has_neighbourhood = table_df['Neighbourhood'].ne('Not assigned')
int(has_neighbourhood.sum())

103

The count matches the total number of rows (103), so no remaining row has an unassigned neighbourhood.

In [9]:
# First count how often each postal code occurs, then tally those counts:
# the result maps "number of occurrences" -> "how many postal codes have it".
postal_code_counts = table_df['Postal Code'].value_counts()
postal_code_counts.value_counts()

1    103
Name: Postal Code, dtype: int64

All 103 postal codes occur exactly once, so no postal code is shared between rows.

In [10]:
# Display the cleaned table (pandas abbreviates the middle rows when rendering)
table_df

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,3,M3A,North York,Parkwoods
1,4,M4A,North York,Victoria Village
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,6,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...,...
98,161,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,166,M4Y,Downtown Toronto,Church and Wellesley
100,169,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,170,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [11]:
# (rows, columns) of the cleaned table
table_df.shape

(103, 4)

### Adding latitude and longitude coordinates

The provided `Geospatial_Coordinates.csv` file is read with pandas and merged with the table extracted from Wikipedia on the `Postal Code` column.

In [12]:
# Load the postal-code coordinate lookup from the working directory
# and preview its first five rows
GeoCoords = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
GeoCoords.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Attach coordinates to the cleaned table by postal code. The right join
# keeps every row of GeoCoords, in GeoCoords' order; the 'index' column
# carried by table_df is then discarded.
tor_coord = pd.merge(
    table_df,
    GeoCoords,
    how='right',
    on='Postal Code',
).drop(columns='index')

In [14]:
# Preview the first five rows of the merged table
tor_coord.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
