## Introduction

This Notebook builds the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the neighborhoods in Toronto in a form of DataFrame


In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

## 1. Scrape the Wikipedia page and convert data into a DataFrame

Find the first table in the webpage, scrape all the data and store data in a DataFrame

In [2]:
source  = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
neigh = soup.find_all('table')[0]
new_table = pd.DataFrame(columns = ['PostalCode', 'Borough', 'Neighborhood'])

temp = pd.DataFrame([('','','')])
temp.columns = ['PostalCode', 'Borough', 'Neighborhood']

for row in neigh.find_all('tr'):
    j = 0
    columns = row.find_all('td')
    for column in columns:
        temp.iloc[0,j] = column.get_text().strip()
        j+=1
    new_table = new_table.append(temp)

#remove the first redandunt row
new_table = new_table[1:]
new_table.reset_index(drop=True, inplace=True)

## 2. Clean the data

Remove row where Boroughs are "Not assigned". If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [3]:
# remove rows where Borough not assigned
new_table = new_table[new_table.Borough != 'Not assigned']
new_table.sort_values(by=['PostalCode'], axis=0, inplace=True)

# if Neighborhood is unassigned, set it to Borough
not_assigned = new_table[new_table['Neighborhood']=="Not assigned"]
new_table.iloc[not_assigned.index, 2] = new_table.iloc[not_assigned.index, 1]

new_table

Unnamed: 0,PostalCode,Borough,Neighborhood
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern
29,M1C,Scarborough,Port Union
28,M1C,Scarborough,Rouge Hill
27,M1C,Scarborough,Highland Creek
42,M1E,Scarborough,Guildwood
43,M1E,Scarborough,Morningside
44,M1E,Scarborough,West Hill
53,M1G,Scarborough,Scarborough
62,M1H,Scarborough,Cedarbrae


## 3. Build the structured per instruction

Find the unique Postal Codes and then combine the Neighborhood if Borough is the same 

In [4]:
neighborhoods = new_table.groupby(['PostalCode', 'Borough']).size().reset_index(name='Neighborhood')

#for matching
codes = new_table['PostalCode'].unique().tolist()

for i in range(len(neighborhoods)):
    cut = new_table[new_table['PostalCode']==codes[i]]
    neighborhoods.iloc[i,2] = cut.iloc[0,2]
    for j in range(len(cut)-1):
        neighborhoods.iloc[i,2] = neighborhoods.iloc[i,2] + ', ' + cut.iloc[j+1,2]

print(neighborhoods)

    PostalCode           Borough  \
0          M1B       Scarborough   
1          M1C       Scarborough   
2          M1E       Scarborough   
3          M1G       Scarborough   
4          M1H       Scarborough   
5          M1J       Scarborough   
6          M1K       Scarborough   
7          M1L       Scarborough   
8          M1M       Scarborough   
9          M1N       Scarborough   
10         M1P       Scarborough   
11         M1R       Scarborough   
12         M1S       Scarborough   
13         M1T       Scarborough   
14         M1V       Scarborough   
15         M1W       Scarborough   
16         M1X       Scarborough   
17         M2H        North York   
18         M2J        North York   
19         M2K        North York   
20         M2L        North York   
21         M2M        North York   
22         M2N        North York   
23         M2P        North York   
24         M2R        North York   
25         M3A        North York   
26         M3B        North 

In [5]:
print(neighborhoods.shape)

(103, 3)


## 4. Download the geographical coordinates of a given postal code

In [6]:
!conda install -c conda-forge geocoder --yes

Solving environment: done

# All requested packages already installed.



The while loop does not stop. Read the geographical coordinates from the .csv file directly

In [7]:
"""
import geocoder # import geocoder
# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates

while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
print(latitude, longitude)
"""

"\nimport geocoder # import geocoder\n# initialize your variable to None\nlat_lng_coords = None\n\n# loop until you get the coordinates\n\nwhile(lat_lng_coords is None):\n  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))\n  lat_lng_coords = g.latlng\n\nlatitude = lat_lng_coords[0]\nlongitude = lat_lng_coords[1]\nprint(latitude, longitude)\n"

# 5. Add the geographical coordinates of the neighborhoods by using the downloaded file

Use merge and inner join to add the latitude and longitude

In [8]:
coordinates_df = pd.read_csv('https://cocl.us/Geospatial_data')
coordinates_df.head()
coordinates_df.rename(columns={'Postal Code': 'PostalCode'}, inplace = True)
coordinates_df.head()
neighborhoods = pd.merge(neighborhoods, coordinates_df, on='PostalCode', how='inner')
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Scarborough,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Oakridge, Clairlea",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Scarborough Village West, Cliffside",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.692657,-79.264848


# 6. Start segmenting and clustering neighborhoods 

Download the required libraries

In [9]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         560 KB

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge


Downloading and Extracting Packages
altair-2.2.2         | 462 K

Use geopy library to get the latitude and longitude values of Toronta

In [10]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronta are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronta are 43.653963, -79.387207.


Create a map of Toronto with neighborhoods superimposed on top.

In [11]:
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

# 7. Start exploring neighborhoods 

Define Foursquare Credentials and Version

In [12]:
# The code was removed by Watson Studio for sharing.