### Toronto Neighborhood - Part 1

In [24]:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

In [2]:
# Download the Wikipedia page and parse it with Beautiful Soup so that the
# postal-code table can later be loaded into a dataframe.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The context manager closes the HTTP response as soon as the parser has
# consumed it, instead of leaving the connection open for the kernel's lifetime.
with urlopen(wiki_url) as wiki_page:
    b_soup = BeautifulSoup(wiki_page, 'html.parser')

In [3]:
# Locate the <tbody> of the first <table> inside <body>; attribute access such
# as `.table` is Beautiful Soup shorthand for `.find('table')`.
wiki_table = b_soup.find('body').find('table').find('tbody')

# Start from an empty frame that only carries the three expected column names.
table_df = pd.DataFrame(
    columns=[
        'Postal Code',
        'Borough',
        'Neighborhood',
    ]
)

In [4]:

# Extract the table values into a list of rows and build the dataframe once.
# Collecting first avoids growing the frame row by row, and it makes the cell
# idempotent: re-running it rebuilds the frame instead of appending duplicates.
rows = []
for tr_cell in wiki_table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    # Keep only rows with exactly three <td> cells (one per column); rows
    # without <td> cells, such as a <th>-only header row, are skipped.
    if len(row_data) == 3:
        rows.append(row_data)

# Reuse the column names of the (empty) frame created earlier.
table_df = pd.DataFrame(rows, columns=table_df.columns)
table_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
# Check the shape (rows, columns) of the freshly scraped dataframe, before any
# cleaning has been applied.

table_df.shape

(180, 3)

In [6]:

# Keep only the rows whose Borough is assigned; the original index labels are
# kept as they are (no reset).

has_borough = table_df['Borough'].ne('Not assigned')
table_df = table_df[has_borough]

In [9]:
# Where the Neighborhood is 'Not assigned', fall back to the Borough name.
# The previously commented-out statement assigned a Series to whole rows; here
# only the Neighborhood column is replaced, and `assign` returns a new frame so
# no SettingWithCopyWarning is raised on the filtered dataframe.

table_df = table_df.assign(
    Neighborhood=table_df['Neighborhood'].mask(
        table_df['Neighborhood'].eq('Not assigned'),
        table_df['Borough'],
    )
)

In [10]:
# Preview the frame after the 'Not assigned' boroughs were filtered out; the
# index labels still come from the unfiltered frame.
table_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [11]:
# Shape (rows, columns) after dropping the rows with an unassigned Borough.
table_df.shape

(103, 3)

In [12]:

# Collapse all neighbourhoods that share a postal code into one
# comma-separated string, stored in a 'Neighborhood_joined' column.

temp_df = (
    table_df.groupby('Postal Code')['Neighborhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
    .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)

In [13]:
# Attach the joined neighbourhood strings to the cleaned table, matching rows
# on 'Postal Code'.

df_merged = table_df.merge(temp_df, on='Postal Code')

In [14]:
# Remove the original per-row Neighborhood column; only the joined one is kept.

df_merged = df_merged.drop(columns=['Neighborhood'])

In [15]:
# Discard rows that became exact duplicates once the per-row column was dropped.

df_merged = df_merged.drop_duplicates()

In [16]:
# Give the joined column its final name and preview the result.
df_merged = df_merged.rename(columns={'Neighborhood_joined': 'Neighborhood'})
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [18]:
# Display the cleaned frame in full (pandas abbreviates long frames).
# NOTE(review): consider .head()/.sample() instead of dumping the whole frame.
df_merged

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [19]:
# Shape (rows, columns) after grouping; one row per postal code is expected,
# assuming each postal code maps to a single borough.
df_merged.shape

(103, 3)

### Toronto Neighborhood - Part 2

In [20]:
# Looking up coordinates through the geocode package takes a long time, so the
# provided CSV file with the coordinates is loaded instead.

geo_df = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# Order the neighbourhood frame by postal code before combining it with the
# geospatial data. The merge matches on key values rather than row position,
# so sorting only controls the row order of the result.

df_merged_sorted = df_merged.sort_values(by="Postal Code")
df_merged_sorted.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [22]:
# Add the Latitude/Longitude columns to each postal code by joining the two
# frames on 'Postal Code' (pandas' default inner join).

geo_merged_df = df_merged_sorted.merge(geo_df, on='Postal Code')
geo_merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [23]:
# Shape (rows, columns) of the final frame. The merge used the default inner
# join, so a postal code missing from geo_df would have been dropped here.
geo_merged_df.shape

(103, 5)