This notebook will be mainly used for the Applied Data Science Capstone Project

<h3>Part-1-Built a DataFrame</h3>

In [1]:
import requests
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(url).content
df_list = pd.read_html(html)
df = df_list[0]
print(df)
df.to_csv('toronto_data.csv')

    Postcode           Borough          Neighbourhood
0        M1A      Not assigned           Not assigned
1        M2A      Not assigned           Not assigned
2        M3A        North York              Parkwoods
3        M4A        North York       Victoria Village
4        M5A  Downtown Toronto           Harbourfront
..       ...               ...                    ...
282      M8Z         Etobicoke              Mimico NW
283      M8Z         Etobicoke     The Queensway West
284      M8Z         Etobicoke  Royal York South West
285      M8Z         Etobicoke         South of Bloor
286      M9Z      Not assigned           Not assigned

[287 rows x 3 columns]


In [70]:
df_toronto=pd.read_csv('toronto_data.csv')

In [71]:
df_toronto.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood
0,0,M1A,Not assigned,Not assigned
1,1,M2A,Not assigned,Not assigned
2,2,M3A,North York,Parkwoods
3,3,M4A,North York,Victoria Village
4,4,M5A,Downtown Toronto,Harbourfront


In [72]:
df_toronto.drop("Unnamed: 0", axis = 1, inplace=True)

In [73]:
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [74]:
import numpy as np

# replace "?" to NaN
df_toronto.replace("Not assigned", np.nan, inplace = True)
df_toronto.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [75]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
# simply drop whole row with NaN in "Borough" column
df_toronto.dropna(subset=["Borough"], axis=0, inplace=True)

# reset index, because we droped rows
df_toronto.reset_index(drop=True, inplace=True)

In [76]:
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [77]:
missing_data = df_toronto.isnull()
missing_data.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [78]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
df_toronto['Neighbourhood'].replace("Not assigned", df_toronto["Borough"],inplace=True)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [79]:
# More than one neighborhood can exist in one postal code area, combined these into one row with the neighborhoods separated with a comma
df_toronto["Neighbourhood"] = df_toronto.groupby("Postcode")["Neighbourhood"].transform(lambda neigh: ', '.join(neigh))

#remove duplicates
df_toronto = df_toronto.drop_duplicates()
df_toronto.reset_index(drop=True, inplace=True) 

df_toronto.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [80]:
df_toronto.shape

(103, 3)

<h3>Part-2- Building a DataFrame with Latitude and Longitude</h3>

In [81]:

latlong_df = pd.read_csv("Geospatial_Coordinates.csv")
latlong_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [82]:
df_toronto = df_toronto.join(latlong_df)
df_toronto.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M1B,43.806686,-79.194353
1,M4A,North York,Victoria Village,M1C,43.784535,-79.160497
2,M5A,Downtown Toronto,Harbourfront,M1E,43.763573,-79.188711
3,M6A,North York,"Lawrence Heights, Lawrence Manor",M1G,43.770992,-79.216917
4,M7A,Downtown Toronto,Queen's Park,M1H,43.773136,-79.239476
5,M9A,Etobicoke,Islington Avenue,M1J,43.744734,-79.239476
6,M1B,Scarborough,"Rouge, Malvern",M1K,43.727929,-79.262029
7,M3B,North York,Don Mills North,M1L,43.711112,-79.284577
8,M4B,East York,"Woodbine Gardens, Parkview Hill",M1M,43.716316,-79.239476
9,M5B,Downtown Toronto,"Ryerson, Garden District",M1N,43.692657,-79.264848
