## Explore and cluster the neighborhoods in Toronto

### Import packages

In [1]:
pip install BeautifulSoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
# library to handle data in a vectorized manner
import numpy as np

# library for data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# library to handle JSON files
import json

from bs4 import BeautifulSoup

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# library to handle requests
import requests

# transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


### Get the data - Scrape the Wiki Page

In [3]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes and
# parse it into a BeautifulSoup tree (`soup`) for the cells that follow.
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the parsing
# code silently walk an error page.
r = requests.get(URL, timeout=30)
r.raise_for_status()

# The 'html5lib' parser is a separate package that must be installed.
soup = BeautifulSoup(r.content, 'html5lib')

# NOTE(review): the parsing loop walks `soup` directly and does not read
# `table`; it is kept only in case a later cell relies on it.
table = soup.find('div', attrs={'id': 'container'})

print('Page Scrapped.')

Page Scrapped.


In [4]:
# Walk every <td> on the page and rebuild the table as three parallel lists.
# `columnNum` records which column the next accepted text belongs to:
# 1 = postal code, 2 = borough, 3 = neighbourhood.
NOT_ASSIGNED = 'Not assigned\n'

PC = []
Boroughs = []
neighbors = []
columnNum = 1

for row in soup.find_all('td'):
    for value in row:
        text = value.string
        # Only text longer than two characters that starts with a letter is
        # considered; everything else is skipped without changing the state.
        if not (text and text[0].isalpha() and len(text) > 2):
            continue

        if columnNum == 1:
            # Accept the text as a postal code only when its second character
            # is a digit; otherwise keep waiting for one.
            if text[1].isdigit():
                PC.append(text)
                columnNum = 2
        elif columnNum == 2:
            if text == NOT_ASSIGNED:
                # No borough for this code: discard the postal code that was
                # just recorded and start over with the next cell.
                del PC[-1]
                columnNum = 1
            else:
                Boroughs.append(text)
                columnNum = 3
        elif columnNum == 3:
            # A neighbourhood that is not assigned falls back to the borough
            # name. (The original referenced the undefined name `boroughs`
            # here, which raised NameError whenever this branch ran.)
            if text == NOT_ASSIGNED:
                neighbors.append(Boroughs[-1])
            else:
                neighbors.append(text)
            columnNum = 1

print('Data Collected.')

Data Collected.


### Create the dataframe

In [6]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
neighbors_tab = pd.DataFrame(columns=column_names)

neighbors_tab

Unnamed: 0,PostalCode,Borough,Neighborhood


In [7]:
# Build the table with one constructor call instead of appending row by row.
# DataFrame.append was deprecated and later removed from pandas, and growing a
# frame inside a loop copies it on every iteration. Building it fresh also
# makes this cell safe to re-run: rows are no longer added onto whatever
# `neighbors_tab` already held.
#
# As before, one row is produced per entry of `neighbors`, so the other two
# lists are cut to that length. Values are converted to plain `str`.
row_count = len(neighbors)

neighbors_tab = pd.DataFrame({
    'PostalCode': [str(code) for code in PC[:row_count]],
    'Borough': [str(borough) for borough in Boroughs[:row_count]],
    'Neighborhood': [str(name) for name in neighbors],
})

neighbors_tab.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [8]:
# Sanity check: (rows, columns) of the scraped table.
neighbors_tab.shape

(103, 3)

In [9]:
# The scraped cell text still carries newline characters; remove them from
# each of the three text columns.
for column in ("Neighborhood", "Borough", "PostalCode"):
    neighbors_tab[column] = neighbors_tab[column].str.replace("\n", "")

neighbors_tab.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [10]:
# Collapse rows that share a postal code and borough into a single row whose
# neighbourhood names are joined with ", ".
neighborhoods_by_code = neighbors_tab.groupby(["PostalCode", "Borough"])["Neighborhood"]
neighbors_tab = neighborhoods_by_code.apply(", ".join).reset_index()

neighbors_tab.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
# Sanity check: (rows, columns) of the table after grouping by postal code and borough.
neighbors_tab.shape

(103, 3)

## Get the latitude and longitude coordinates of each neighborhood

In [19]:
# Load the postal-code coordinate table (CSV served over HTTP).
GEOSPATIAL_CSV_URL = "http://cocl.us/Geospatial_data"
g_coor = pd.read_csv(GEOSPATIAL_CSV_URL)

g_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Attach the coordinates to every neighbourhood row. The two tables name the
# key differently ('PostalCode' vs 'Postal Code'), so after the left join the
# right-hand key column is redundant and is dropped.
toronto_dataset = (
    neighbors_tab
    .merge(g_coor, how='left', left_on='PostalCode', right_on='Postal Code')
    .drop(columns="Postal Code")
)

toronto_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
