# IBM Applied Data Science Capstone Project (Week 3, Part 2)
AIM: Get the latitude and longitude of the neighborhoods

# Import libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium



# Get the data from the Wikipedia page and save the table in three lists


In [2]:
# Download the HTML of the Wikipedia page that lists the "M" postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as the table.
response.raise_for_status()
result = response.text

In [3]:
# Parse the downloaded HTML text with Python's built-in 'html.parser' backend
soup = BeautifulSoup(markup=result, features='html.parser')

In [4]:
# One empty accumulator list per table column
postalcode = []
borough = []
neighborhood = []

In [5]:
# Locate the first <table> on the page and collect its rows.
table = soup.find('table')
if table is None:
    # Without this check the failure would be an opaque AttributeError on None.
    raise ValueError('No <table> element found in the downloaded page')
rows = table.find_all('tr')

# Empty the accumulators first so re-running this cell does not append
# every record a second time.
for column in (postalcode, borough, neighborhood):
    column.clear()

for row in rows:
    data = row.find_all('td')
    # Skip rows with fewer than three <td> cells (e.g. rows with no <td> at
    # all) so the positional lookups below cannot raise IndexError.
    if len(data) < 3:
        continue
    # Strip surrounding whitespace from every cell, not only the last one:
    # these values are later compared with 'Not assigned' and used as a
    # merge key, where a stray newline would silently prevent a match.
    postalcode.append(data[0].text.strip())
    borough.append(data[1].text.strip())
    neighborhood.append(data[2].text.strip())

In [6]:
# Assemble the three parallel lists into one table, one column per list
table_columns = {
    'Postal Code': postalcode,
    'Borough': borough,
    'Neighborhood': neighborhood,
}
df = pd.DataFrame(table_columns)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Drop the "Not assigned" rows

In [7]:
# Filter on Borough rather than Neighborhood: a row that has a borough but no
# neighborhood must survive this step, because a later cell replaces a
# 'Not assigned' neighborhood with the borough name. Filtering on
# Neighborhood would discard those rows before they could be filled in.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)

# Combine the neighborhoods that share the same postal code and borough

In [8]:
# Collapse rows sharing a postal code and borough into a single row whose
# remaining column holds the comma-joined values of the group.
group_keys = ['Postal Code', 'Borough']
grouped = df.groupby(group_keys, as_index=False)
df = grouped.agg(','.join)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Where the Neighborhood is "Not assigned", use the Borough name instead


In [9]:
# The rows yielded by iterrows() are copies, so assigning to them never
# reaches df. Select the affected rows with a boolean mask and write the
# borough name through .loc so the change lands in df itself.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']


# Print the shape of dataframe

In [10]:
# Bare expression so the notebook displays the (rows, columns) tuple of df
df.shape

(103, 3)

 <h1> Get the longitude and latitude </h1>


# Load the geospatial coordinates file into a DataFrame

In [11]:
# Read the per-postal-code latitude/longitude table from the local CSV file
coordinates_csv = 'Geospatial_Coordinates.csv'
coordinates = pd.read_csv(coordinates_csv)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Merge the two DataFrames into a single one

In [12]:
# Left join on the postal code: every row of df is kept and gains the
# Latitude/Longitude columns of its matching row in coordinates.
toronto_data = pd.merge(df, coordinates, how='left', on='Postal Code')
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
