The project is to get data for neighborhoods in Toronto from Wikipedia

Scrape Table from Wikipedia

In [1]:
import urllib.request

from bs4 import BeautifulSoup

# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Parse inside a `with` block so the HTTP response is closed as soon as the
# document has been read, instead of staying open for the kernel's lifetime.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")


In [2]:
# Collect every <table> element on the page. This list is not referenced again
# in the visible cells; it appears to be kept for manual inspection only.
all_tables=soup.find_all("table")

In [3]:
# Pick the first table whose class attribute is "wikitable sortable"; the rows
# scraped below come from this table.
right_table = soup.find('table', attrs={'class': 'wikitable sortable'})

Data frame construction 

We have 3 columns, so we construct 3 lists to contain the data

In [4]:
# One list per table column: A = postal code, B = borough, C = neighborhood.
A = []
B = []
C = []

# Walk every table row and keep only rows with exactly three <td> cells
# (presumably this skips the header row -- TODO confirm against the page HTML).
# `find_all` / `string=True` are the current Beautiful Soup spellings of the
# older `findAll` / `text=True`, and match the `find_all` call used earlier.
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) != 3:
        continue
    # Take the first text node of each cell; the trailing "\n" is kept here
    # and stripped in a later cell.
    postal_code, borough, neighborhood = (cell.find(string=True) for cell in cells)
    A.append(postal_code)
    B.append(borough)
    C.append(neighborhood)

In [5]:
import pandas as pd

# Assemble the three scraped lists into a single frame, one column per list,
# in the order PostalCode, Borough, Neighborhood.
df = pd.DataFrame({'PostalCode': A, 'Borough': B, 'Neighborhood': C})

In [6]:
# Preview the raw scraped table; the text cells still end in "\n" at this point.
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
...,...,...,...
175,M5Z\n,Not assigned\n,Not assigned\n
176,M6Z\n,Not assigned\n,Not assigned\n
177,M7Z\n,Not assigned\n,Not assigned\n
178,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


In [7]:
# Select the three data columns, then delete every newline character from the
# string cells (the scraped text nodes end in "\n").
columns = ['PostalCode', 'Borough', 'Neighborhood']
selected = df[columns]
new_df = selected.replace(to_replace='\\n', value='', regex=True)

In [8]:
# Preview the frame after the newline characters have been removed.
new_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Remove 'Not assigned' entries from the dataframe

In [9]:
# Look up the index labels of the rows whose Neighborhood is 'Not assigned',
# then remove those rows from new_df in place.
unassigned_labels = new_df.index[new_df['Neighborhood'] == 'Not assigned']
new_df.drop(index=unassigned_labels, inplace=True)

In [10]:
# Renumber the remaining rows from 0 in place; drop=True discards the old index
# instead of keeping it as a column.
new_df.reset_index(drop=True, inplace=True)

In [11]:
# Preview the frame after the 'Not assigned' rows were dropped and the index reset.
new_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Check number of rows and columns in the dataframe

In [12]:
# (number of rows, number of columns) of the cleaned frame.
new_df.shape

(103, 3)

Get the latitude and the longitude coordinates of each neighborhood

In [80]:
# NOTE: abandoned attempt, kept commented out for reference only. It tried to
# geocode each postal code, first with geopy's Nominatim and then with
# geocoder.google. The coordinates are loaded from a CSV file further down instead.

#!pip install geocoder

#import geocoder # import geocoder
#from geopy.geocoders import Nominatim 

# initialize your variable to None

#for postal_code in new_df['PostalCode']:
    
#    address ='{}, Toronto, Ontario'.format(postal_code)
#    address
#    geolocator = Nominatim(user_agent="ny_explorer")
#    location = geolocator.geocode(address)
#    latitude = location.latitude
#    longitude = location.longitude
    
 #   lat_lng_coords = None
 # loop until you get the coordinates
 #      while(lat_lng_coords is None):
 #       g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
 #       lat_lng_coords = g.latlng

 #   latitude = lat_lng_coords[0]
 #   longitude = lat_lng_coords[1]

I was not able to get the coordinates using geolocator

I imported the coordinates using the provided CSV file

In [13]:
import os

# Location of the provided coordinates CSV. The original absolute path remains
# the default so existing runs are unaffected; set the environment variable
# GEOSPATIAL_COORDINATES_CSV to point at the file on any other machine.
COORD_CSV_PATH = os.environ.get(
    'GEOSPATIAL_COORDINATES_CSV',
    '/Users/shayan/Desktop/IBM Data Science/Geospatial_Coordinates.csv',
)

# Load one row per postal code with its latitude and longitude.
coord = pd.read_csv(COORD_CSV_PATH)

In [14]:
# Preview the coordinates table (columns: Postal Code, Latitude, Longitude).
coord

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [15]:
# Rename the key column in place so it matches new_df's 'PostalCode' column,
# which the merge below joins on.
postal_code_rename = {'Postal Code': 'PostalCode'}
coord.rename(columns=postal_code_rename, inplace=True)

In [16]:
# Attach latitude/longitude to each neighborhood row. This is an inner join on
# 'PostalCode', so only postal codes present in both frames are kept.
df_final = new_df.merge(coord, how='inner', on='PostalCode')

In [17]:
# Preview the merged frame: neighborhood data plus Latitude and Longitude.
df_final

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


Number of boroughs and neighborhoods

In [18]:
# Count the distinct boroughs and neighborhoods, then report both in one line.
borough_count = len(df_final['Borough'].unique())
neighborhood_count = len(df_final['Neighborhood'].unique())

summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, neighborhood_count))

The dataframe has 10 boroughs and 99 neighborhoods.


In [23]:
#Install geocoder 

!pip install geocoder

import geocoder # import geocoder
from geopy.geocoders import Nominatim 



In [24]:
# Use the geopy library to get the latitude and longitude values of Toronto.
# To create the geocoder instance we pass a user_agent; this notebook names
# its agent "canada_explorer".

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when no match is found; fail with a clear message
    # instead of an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [25]:
# Create a map of Toronto with every neighborhood in df_final drawn on top.

import folium

# Centre the map on the Toronto coordinates computed earlier.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row; each popup reads "<neighborhood>, <borough>".
marker_rows = zip(df_final['Latitude'], df_final['Longitude'], df_final['Borough'], df_final['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is a Popup option. CircleMarker's documented parameters do not
    # include it, so the stray parse_html=False previously passed here is gone.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

In [26]:
# Simplify the map above: keep only the rows whose Borough name contains
# "Toronto"; these are the neighborhoods to segment and cluster.
toronto_mask = df_final['Borough'].str.contains('Toronto')
toronto_data = df_final[toronto_mask]
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [27]:
# Create a second map showing only the neighborhoods in toronto_data, i.e. the
# rows whose Borough name contains "Toronto".

import folium

# Same centre and zoom as the full map.
map_toronto_borough = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per filtered row; each popup reads "<neighborhood>, <borough>".
for lat, lng, borough, neighborhood in zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Borough'],
    toronto_data['Neighborhood'],
):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is a Popup option. CircleMarker's documented parameters do not
    # include it, so the stray parse_html=False previously passed here is gone.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto_borough)

map_toronto_borough