### Part 2: Coordinates Acquisition

### This notebook extends "Part 1: Data Scraping"; its first few cells repeat the code from that notebook.

In [1]:
#Importing of libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
#URL of the Wikipedia page to scrape (list of Canadian postal codes beginning with "M")
path = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
#Download the page and create the soup object used to scrape the data
#A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(path, timeout=30)
#Fail here on an HTTP error instead of silently parsing an error page
response.raise_for_status()
source = response.text

#The page is an HTML document, so parse it with an HTML parser
#('xml' selects an XML parser, which is not meant for HTML pages)
soup = BeautifulSoup(source, 'html.parser')

In [4]:
#First <table> element of the parsed page
table = soup.find('table')
#find() returns None when nothing matches; stop here with a clear message
#rather than failing later with an AttributeError on None
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

In [5]:
#Start from an empty frame that already carries the three target columns
column_names = [
    'Postalcode',
    'Borough',
    'Neighborhood',
]
df = pd.DataFrame(columns=column_names)

In [6]:
#Fetching the data from the Wikipedia table
#Rows are collected in a plain list and the frame is built once at the end.
#Appending with df.loc[len(df)] enlarges the frame one row at a time (slow)
#and, when this cell is re-run, adds every row a second time.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    #Keep only rows with exactly one value per column; anything else is skipped
    if len(row_data) == len(column_names):
        rows.append(row_data)

df = pd.DataFrame(rows, columns=column_names)

In [7]:
#Preview the first rows of the scraped table
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
#Keep only the rows whose borough is something other than 'Not assigned'
has_borough = df['Borough'].ne('Not assigned')
df_new = df[has_borough]
df_new.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
#Collapse rows that share a postcode and borough into a single row whose
#Neighborhood value is the comma-separated list of the grouped neighborhoods
df_new = (
    df_new
    .groupby(['Postalcode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

In [10]:
# Assigning the borough for postcodes without any assigned neighborhood
# The assignment goes through .loc on the frame itself. Writing to the row
# objects yielded by iterrows() is not guaranteed to change the frame,
# because each row may be a copy.
unassigned = df_new['Neighborhood'] == 'Not assigned'
df_new.loc[unassigned, 'Neighborhood'] = df_new.loc[unassigned, 'Borough']

### To get the coordinates of each neighborhood:

In [11]:
#Load the coordinates from the provided CSV file instead of the unreliable geocoder
url = 'http://cocl.us/Geospatial_data'
df_geodata = pd.read_csv(filepath_or_buffer=url)
df_geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
#Dimensions of the coordinates table as (rows, columns)
df_geodata.shape

(103, 3)

In [13]:
#Attach the coordinate columns to each row by looking its Postalcode up
#in the coordinates table, which is indexed by 'Postal Code' for the join
coords_by_postcode = df_geodata.set_index('Postal Code')
df_new = df_new.join(coords_by_postcode, on='Postalcode')
df_new.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
