# Scrape wiki page

In [10]:
import urllib
import urllib.request
from io import StringIO

import numpy as np
import pandas as pd

Read the content from our url and store it into a variable

In [11]:
# Download the raw HTML of the Wikipedia page. The context manager closes the
# HTTP response as soon as the body has been read, so no connection is leaked.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") as wikipage:
    content = wikipage.read()

Decode the raw bytes into a string using UTF-8

In [12]:
# Turn the downloaded bytes into text so it can be searched with str methods.
scontent = str(content, encoding="UTF-8")

Extract the table markup by locating the `<table>` and `</table>` tags

In [13]:
# Slice out the first <table> element. The closing tag is searched for only
# *after* the opening tag, so a "</table>" that happens to appear earlier in
# the page cannot produce an empty or truncated slice.
table_start = scontent.find("<table")
table_end = scontent.find("</table>", table_start)
if table_start == -1 or table_end == -1:
    # str.find signals "not found" with -1; slicing with it would silently
    # return the wrong text, so fail loudly instead.
    raise ValueError("No complete <table>...</table> element found in the page content")
wikitable = scontent[table_start:table_end + len("</table>")]

Now we are able to read the table with Pandas

In [14]:
# read_html returns a list with one DataFrame per table found; the markup we
# kept holds a single table, so take the first entry. The first row is used as
# the header. Wrapping the markup in StringIO passes it as a file-like object,
# which avoids the deprecated "literal HTML string" input of recent pandas.
mydata = pd.read_html(StringIO(wikitable), header=0)[0]

We have to process the cells that have an assigned borough and ignore the cells with a borough that is 'Not assigned'.

In [15]:
# Keep only the rows whose borough is assigned. The explicit copy makes the
# result an independent frame, so assigning to its columns afterwards does not
# trigger pandas' SettingWithCopyWarning.
mydata = mydata[mydata.Borough != "Not assigned"].copy()

If a cell has a borough but a 'Not assigned' neighborhood, then the neighborhood will be the same as the borough.

In [16]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# Series.where keeps the value wherever the condition is True and takes the
# borough elsewhere. assign() returns a new frame, replacing the chained
# `frame.col[mask] = ...` write, which may act on a temporary copy (and is a
# no-op under pandas copy-on-write).
has_neighbourhood = mydata.Neighbourhood != "Not assigned"
mydata = mydata.assign(
    Neighbourhood=mydata.Neighbourhood.where(has_neighbourhood, mydata.Borough)
)

Let's see how our dataframe looks so far

In [17]:
# Preview the first five rows of the cleaned frame.
mydata.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Group by Postcode and Borough and combine the Neighbourhoods into one row, separated by commas

In [18]:
# Collapse all neighbourhoods that share a (Postcode, Borough) pair into a
# single comma-joined string, then turn the group keys back into columns.
gdata = (
    mydata
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
)
mydata = gdata.reset_index()

Below we can see that Neighbourhoods with the same postcode and borough are now combined in one row

In [19]:
# Preview the first five rows after grouping.
mydata.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Finally we print the size of our dataframe

In [20]:
# Displays the frame's dimensions as a (rows, columns) tuple.
mydata.shape

(103, 3)

# Add geographical coordinates to my dataframe

In [21]:
# URL of the CSV file that maps each postal code to a latitude/longitude pair.
csv_url = 'https://cocl.us/Geospatial_data'

# pandas can read a CSV straight from a URL; preview the first rows afterwards.
geodata = pd.read_csv(filepath_or_buffer=csv_url)
geodata.head(n=5)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# Attach coordinates to each postcode row by matching the key columns by value
# (inner join, the pd.merge default). Naming the columns via left_on/right_on
# replaces the earlier `on=[Series, Series]` form, which produced synthetic
# key_0/key_1 columns and only lined rows up because both frames happened to
# have the same length and ordering.
mydata_geo = pd.merge(mydata, geodata, left_on="Postcode", right_on="Postal Code")
# After the join "Postcode" duplicates "Postal Code"; keep only the latter.
mydata_geo = mydata_geo.drop(columns=["Postcode"])
mydata_geo.head()

Unnamed: 0,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [23]:
# Rename first, then select the columns in their final order. rename() ignores
# labels that are not present, so this cell can be re-run after "Postal Code"
# has already become "PostalCode" (the previous select-then-rename order
# raised KeyError on a second run). No inplace mutation is used.
mydata_geo = mydata_geo.rename(columns={"Postal Code": "PostalCode"})
mydata_geo = mydata_geo[["PostalCode", "Borough", "Neighbourhood", "Latitude", "Longitude"]]
mydata_geo.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


# Explore and cluster the neighborhoods in Toronto

In [6]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

In [7]:
# Look up the coordinates of Toronto; they are used to centre the map below.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the attribute access below.
    raise ValueError("Geocoding returned no result for address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.653963, -79.387207.


Show all Toronto postal codes on the map

In [25]:
# Base map centred on the coordinates obtained from the geocoder.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Walk the four columns in lock-step: one tuple per postal code.
marker_fields = zip(
    mydata_geo['Latitude'],
    mydata_geo['Longitude'],
    mydata_geo['Borough'],
    mydata_geo['PostalCode'],
)
for lat, lng, borough, postal_code in marker_fields:
    # Popup text reads "<postal code>, <borough>".
    popup = folium.Popup('{}, {}'.format(postal_code, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto