# Capstone project - analyzing canadian boroughs.

All the code is in this one file. The three subsections tell about the three subproblems.


## Scaping data from Wikipedia

In [63]:
# Importing minimun libraries
import numpy as np
import pandas as pd

In [64]:
# Get the data set from wikipedia
 
postal_codes_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]


# Look at the data
postal_codes_df.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [103]:
# Removing not assigned neighborhoods and checking the outcome
borough_prune_df = postal_codes_df.loc[postal_codes_df.Borough != 'Not assigned']
borough_prune_df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [104]:
# We look at the shape of the result
postal_codes_df.shape

(180, 3)

In [105]:
# Then we check if the shape changes after removing empty elements. It does not, so we are happy.
postal_codes_df.dropna
postal_codes_df.shape

(180, 3)

In [106]:
postal_codes_df.shape

(180, 3)

## Getting the latitudes and longitudes

We could not import geocoder, so we are using the provided csv file.


In [115]:
# Reading the csv from the given site
langlot_df = pd.read_csv("https://cocl.us/Geospatial_data")

In [116]:
# Checking the data
langlot_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [120]:
# We turn the postal code label to a more accessible form and set it as the index for joining
left_df = postal_codes_df.rename(columns={'Postal Code':'Postal_Code'})
right_df = langlot_df.rename(columns={'Postal Code':'Postal_Code'})

left_df.set_index('Postal_Code', inplace=True)
right_df.set_index('Postal_Code', inplace=True)

In [128]:
# we join the data sets along the index

combined_df = left_df.join(right_df)
combined_df = combined_df.loc[combined_df.Borough != 'Not assigned']

combined_df.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postal_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [127]:
combined_df.shape

(180, 4)