# Problem 2
#### In part 1, a dataframe containing the postal code, borough name, and neighbourhood name of each Toronto neighbourhood was populated by scraping Wikipedia. To use Foursquare location data to cluster and segment the neighbourhoods in Toronto, the latitude and longitude coordinates of each neighbourhood must also be included in the dataframe.

In [1]:
#Import necessary libraries and modules
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
!pip install pgeocode
import pgeocode
print('Libraries are imported.')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting pgeocode
  Downloading pgeocode-0.3.0-py3-none-any.whl (8.5 kB)
Installing collected packages: pgeocode
Successfully installed pgeocode-0.3.0
Libraries are imported.


### 1.a. Scraping location data for Toronto from Wikipedia

In [2]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page later.
response.raise_for_status()
source = response.text

In [3]:
# Parse the downloaded HTML text in `source` into a BeautifulSoup tree that the
# following cells search for the postal-code table.
soup = BeautifulSoup(source, 'html5lib') #choose html5lib parser

### 1.b. Creating and populating a dataframe using the scraped data

In [4]:
# Locate the first <table> element within the scraped data.
table = soup.find('table')
# soup.find returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None.
if table is None:
    raise ValueError('No <table> element found in the scraped page.')

In [5]:
#Create and populate the dataframe
# For every <td> cell, the postal code is read from the first three characters
# of the cell's <p> text, and the borough / neighbourhood are read from the
# cell's <span> text, which is split on '('.
table_contents=[]
for row in table.find_all('td'):
    # Skip cells that do not carry both tags instead of raising AttributeError.
    if row.span is None or row.p is None:
        continue
    span_text = row.span.text
    if span_text=='Not assigned':
        continue
    parts = span_text.split('(')
    # Text between the first and second '('; empty when the cell has no '('.
    neighborhood_raw = parts[1] if len(parts) > 1 else ''
    table_contents.append({
        'PostalCode': row.p.text[:3],
        'Borough': parts[0],
        # Drop closing parentheses, turn ' /' separators into commas, trim spaces.
        'Neighborhood': neighborhood_raw.strip(')').replace(' /',',').replace(')',' ').strip(' '),
    })

# Fixing the columns keeps the frame well-formed even if no rows were scraped.
location_df=pd.DataFrame(table_contents, columns=['PostalCode','Borough','Neighborhood'])
# Normalise borough strings where extra text ran together with the borough name.
location_df['Borough']=location_df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [6]:
# Preview the first rows of the scraped dataframe
location_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [7]:
# Shape of the final processed dataframe as (number of rows, number of columns)
location_df.shape

(103, 3)

In [11]:
# fetch geographical coordinates for all postal codes
geolocator = pgeocode.Nominatim('ca')
postal_codes = location_df['PostalCode'].tolist()
latitudes = []
longitudes = []
for postal_code in postal_codes:
    g = geolocator.query_postal_code(postal_code)
    # Always append exactly one value per postal code so that latitudes and
    # longitudes stay aligned with postal_codes. For a single code pgeocode
    # returns a Series whose fields are NaN when the code is unknown, so an
    # `.empty` check never filters anything, and skipping an entry would
    # leave the three lists with different lengths.
    latitudes.append(g.latitude)
    longitudes.append(g.longitude)

In [17]:
# Assemble the postal codes and their looked-up coordinates into one dataframe
d = dict(PostalCode=postal_codes, Latitudes=latitudes, Longitudes=longitudes)
geocode_df = pd.DataFrame(data=d)
geocode_df

Unnamed: 0,PostalCode,Latitudes,Longitudes
0,M3A,43.7545,-79.3300
1,M4A,43.7276,-79.3148
2,M5A,43.6555,-79.3626
3,M6A,43.7223,-79.4504
4,M7A,43.6641,-79.3889
...,...,...,...
98,M8X,43.6518,-79.5076
99,M4Y,43.6656,-79.3830
100,M7Y,43.7804,-79.2505
101,M8Y,43.6325,-79.4939


In [20]:
# Join the coordinates onto the location data by postal code, then keep the
# columns in a reader-friendly order
merged_df = geocode_df.merge(location_df, on='PostalCode')
column_order = ['PostalCode','Borough','Neighborhood','Latitudes','Longitudes']
geospatial_data = merged_df[column_order]
geospatial_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitudes,Longitudes
0,M3A,North York,Parkwoods,43.7545,-79.3300
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6518,-79.5076
99,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830
100,M7Y,East Toronto Business,Enclave of M4L,43.7804,-79.2505
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.6325,-79.4939
