<h1>Part 1: Data Scraping and Cleaning</h1>

In [2]:
#Import Libraries
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from urllib.request import urlopen

<h2>Step 1: Scrape the Wikipedia page data</h2>

In [3]:
# Read every HTML table tagged with the "wikitable" CSS class from the
# Wikipedia page and keep the first one as the working table.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikitable_attrs = {'class': 'wikitable'}
df = pd.read_html(url, attrs=wikitable_attrs)
dfs = df[0]

<h2>Step 2: Remove Rows where Borough = 'Not assigned'</h2>

In [4]:
# Find the rows whose Borough is the placeholder 'Not assigned', remove them
# from the table, then renumber the remaining rows from 0.
idx = dfs.index[dfs['Borough'] == 'Not assigned']
dfs.drop(index=idx, inplace=True)
dfs = dfs.reset_index(drop=True)

<h2>Step 3: Assign Borough values to Unassigned Neighbourhoods</h2>

In [14]:
# Rows whose Neighbourhood is the placeholder 'Not assigned' take the value of
# their Borough column instead (one label-aligned assignment for all of them).
idx1 = dfs.index[dfs['Neighbourhood'] == 'Not assigned']
dfs.loc[idx1, 'Neighbourhood'] = dfs.loc[idx1, 'Borough']

<h2>Step 4: Combine Neighbourhoods of Duplicate Postal Codes</h2>

In [15]:
# Rows that repeat an already-seen Postcode (every occurrence after the first).
dups = dfs[dfs['Postcode'].duplicated()]

# Build the comma-separated neighbourhood list for every Postcode in a single
# grouped pass. sort=False keeps the groups in order of first appearance, and
# rows inside a group keep their original order, so the first occurrence's
# neighbourhood comes first followed by the duplicates'.
neighbourhoods_by_postcode = (
    dfs.groupby('Postcode', sort=False)['Neighbourhood'].agg(', '.join)
)

# Keep only the first row of each Postcode (all other columns and the index
# labels are untouched) and attach the combined neighbourhood list to it.
dfs = dfs.drop_duplicates(subset='Postcode').copy()
dfs['Neighbourhood'] = dfs['Postcode'].map(neighbourhoods_by_postcode)

<h2>Step 5: Print Shape</h2>

In [16]:
# Report the (rows, columns) dimensions of the cleaned table.
dfs.shape

(103, 5)

<h1>Part 2</h1>
<h2>Step 1: Get Geographical Coordinates Data</h2>

<p>
    Note: The Geocoder package did not work; therefore, the provided CSV file was used to retrieve the geographical coordinates.
</p>

In [17]:
# Load the CSV of latitude/longitude values per postal code and show it.
geo_csv_url = "https://cocl.us/Geospatial_data"
geo_data = pd.read_csv(geo_csv_url)
geo_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


<h2>Step 2: Sort and Merge Data</h2>

<p>
    Both dataframes appear to have the same number of rows and equivalent Postal Code values. It is assumed that the data is already clean and that there is a 1:1 relationship between the Postal Codes in the two tables.
</p>

<p>
    The following steps will sort the data by Postal Code and join accordingly. 
</p>

In [18]:
# Order the neighbourhood table by postal code and renumber the rows so the
# index reflects that order.
dfs = dfs.sort_values(by=['Postcode']).reset_index(drop=True)
geo_data = geo_data.sort_values(by=['Postal Code'])

# Attach the coordinates by matching on the postal code itself. A plain column
# assignment (dfs['Latitude'] = geo_data['Latitude']) aligns on index labels,
# not on row position, so sorting both frames does not pair the right rows.
# NOTE(review): the lookup relies on each 'Postal Code' appearing once in
# geo_data; a Postcode with no match gets NaN coordinates.
coords_by_postcode = geo_data.set_index('Postal Code')[['Latitude', 'Longitude']]
dfs['Latitude'] = dfs['Postcode'].map(coords_by_postcode['Latitude'])
dfs['Longitude'] = dfs['Postcode'].map(coords_by_postcode['Longitude'])

display(dfs)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


<h1>Part 3</h1>

<h2>Step 1: Install Additional Libraries</h2>

In [19]:
import json

# NOTE(review): the conda install below is active and runs on every execution
# of this cell; the trailing "uncomment this line" remark on it is stale.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# NOTE(review): newer pandas releases expose this as pd.json_normalize - confirm against the installed version
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# folium is installed with pip right before it is imported
!pip install folium
import folium

print('Libraries imported.')

Collecting package metadata: done
Solving environment: | 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/linux-64::altair==2.4.1=py36_0
  - conda-forge/noarch::branca==0.3.1=py_0
  - conda-forge/linux-64::certifi==2019.3.9=py36_0
  - conda-forge/linux-64::conda==4.6.14=py36_0
  - conda-forge/noarch::folium==0.5.0=py_0
  - conda-forge/linux-64::geopy==1.11.0=py36_0
  - conda-forge/linux-64::giflib==5.1.7=h516909a_1
  - defaults/linux-64::grpcio==1.16.1=py36hf8bcb03_1
  - conda-forge/linux-64::ipython-sql==0.3.9=py36_1000
  - conda-forge/noarch::ipywidgets==7.4.2=py_0
  - conda-forge/linux-64::jpeg==9c=h14c3975_1001
  - conda-forge/linux-64::leptonica==1.76.0=h7f84942_1005
  - conda-forge/linux-64::libgcc==7.2.0=h69d50b8_2
  - conda-forge/linux-64::libwebp==1.0.2=h99fbfcb_2
  - conda-forge/linux-64::nodejs==9.11.0=0
  - conda-forge/linux-64::openjpeg==2.3.1=h58a6597_0
  - conda-forge/linux-64

<h2>Step 2: Use geopy library to get the latitude and longitude values of Toronto</h2>

In [20]:
# Look up the coordinates of the city; they are used to centre the map.
address = 'Toronto'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [29]:
# Select the rows whose Borough contains "Toronto" here, so this cell does not
# depend on a cell further down the notebook having been run first.
toronto_df = dfs[dfs['Borough'].str.contains("Toronto")]

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add one circle marker per row, with a "<neighbourhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# last expression of the cell: render the map
map_toronto

<h2>Step 3: Select the neighbourhoods in Toronto</h2>

In [24]:
# Keep only the rows whose Borough name contains "Toronto".
in_toronto = dfs['Borough'].str.contains("Toronto")
toronto_df = dfs[in_toronto]

<h2>Step 4: Display a map of the neighbourhoods in Toronto</h2>