# Segmenting and Clustering Neighborhoods in Toronto - IBM Applied Data Science Capstone Course by Coursera

## Week 3 - Part 1

### Import Libraries

In [2]:
# Install the scraping, geocoding and mapping packages from the conda-forge channel.
!conda install -c conda-forge beautifulsoup4 --yes
!conda install -c conda-forge geopy --yes
# folium is pinned to 0.5.0; the other packages are installed unpinned.
!conda install -c conda-forge folium=0.5.0 --yes
!conda install -c conda-forge lxml --yes
# NOTE(review): beautifulsoup4 and lxml are installed a second time with pip below even
# though the conda commands above already install them — confirm whether both are needed.
!pip install beautifulsoup4
!pip install lxml
!pip install requests
print('Libraries installed!')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.8.2       |           py36_0         157 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    openssl-1.1.1e             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.6 MB

The following NEW packages will be INSTALLED:

    python_abi:      3.6-1_cp36m       conda-forge

The following packages will be UPDATED:

    beautifulsoup4:  4.7.1-py36_1                

In [3]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from bs4 import BeautifulSoup
import requests

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import folium # map rendering library

# Matplotlib and associated plotting modules
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import matplotlib.colors as colors

from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

from sklearn.preprocessing import StandardScaler, normalize, scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

print('Libraries imported!')

Libraries imported!


### Scrape Data into a DataFrame, Drop Not Assigned and Group

In [26]:
# Send the GET request. The timeout stops the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

In [27]:
# Parse the downloaded HTML text with BeautifulSoup, using the lxml parser
soup = BeautifulSoup(source, 'lxml')

In [28]:
#Capture the data table (selected by an exact match on its HTML attributes)
table = soup.find("table", attrs={"rules":"all", "cellspacing":"0", "cellpadding":"2", "style":"width:100%; border-collapse:collapse; border:1px solid #ccc;"})
# soup.find returns None when nothing matches; stop here with a clear message
# rather than with an AttributeError in the cells that iterate over the table.
if table is None:
    raise ValueError("Postal code table not found: no <table> with the expected attributes on the page")

In [29]:
# Read the postal code and the borough out of every table cell
cells = table.find_all('td')
Neighborhood = []
# The postal code is the text of the cell's first <b> element
PostalCode = [cell.find('b').text for cell in cells]
# The borough is the part of the small-font span's text before the first '('
Borough = [
    cell.find('span', style='font-size:80%;').text.split('(')[0]
    for cell in cells
]

In [30]:
#Clean and get the Neighborhood column
# For each table cell, skip the first link and join the text of the remaining
# links with commas. A cell with fewer than two links yields an empty string.
Neighborhood = []
for cell in table.find_all('td'):
    links = cell.find_all('a')
    Neighborhood.append(",".join(link.text for link in links[1:]))

In [72]:
# Zip the three scraped lists into (postal code, borough, neighborhood) rows
# and build the dataframe from them
rows = list(zip(PostalCode, Borough, Neighborhood))
df = pd.DataFrame(rows, columns=['PostalCode', 'Borough', 'Neighborhood'])

In [73]:
# Preview the first rows of the dataframe to check the scraped columns
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park,Harbourfront"


In [76]:
#Ignore cells with a borough that is Not assigned
# .copy() makes the filtered frame independent of the frame it was sliced from, so
# the column assignment in a later cell does not raise pandas' SettingWithCopyWarning.
df = df.loc[df['Borough'] != 'Not assigned', :].copy()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park,Harbourfront"
5,M6A,North York,"Lawrence Manor,Lawrence Heights"
6,M7A,Queen's Park / Ontario Provincial Government,Ontario Provincial Government


In [77]:
#Assign Neighborhood as the same as Borough where Neighborhood is not present
# Use an explicit boolean mask: rows whose Neighborhood is empty (or missing) take
# the Borough value from the same row; every other row is left untouched.
# Passing a Series as the replacement value of Series.replace is not a documented
# form and may be rejected by newer pandas versions, so it is avoided here.
neighborhood_missing = df['Neighborhood'].isna() | (df['Neighborhood'] == '')
df['Neighborhood'] = df['Neighborhood'].mask(neighborhood_missing, df['Borough'])

In [83]:
#Same as required by the question
# Column names of the frame at this point in the notebook (kept for reference)
column_names = ["PostalCode", "Borough", "Neighborhood"]
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Select the rows for each postal code in test_list order and concatenate them once.
# DataFrame.append is deprecated (and removed in newer pandas), and appending inside
# a loop copies the growing frame on every iteration.
matches = [df[df["PostalCode"] == postcode] for postcode in test_list]
test_df = pd.concat(matches, ignore_index=True)

test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill,Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,East Toronto
6,M1R,Scarborough,"Wexford,Maryvale"
7,M9V,Etobicoke,"South Steeles,Silverstone,Jamestown,Mount Oliv..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower,King and Spadina,Railway Lands,Harbou..."


In [84]:
#Shape of the dataframe as (number of rows, number of columns)
df.shape

(103, 3)

## Week 3 - Part 2: Get the latitude and the longitude coordinates of each neighborhood

In [85]:
# load the coordinates from the URL
# `coordinates` holds the URL string itself; pd.read_csv fetches and parses it directly.
coordinates = "http://cocl.us/Geospatial_data"
coordinates_df = pd.read_csv(coordinates)
coordinates_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [87]:
# Give the key column the same name as in df ("Postal Code" -> "PostalCode")
coordinates_df = coordinates_df.rename(columns={"Postal Code": "PostalCode"})

# Join the coordinate columns onto df by PostalCode (merge's default inner join)
df = df.merge(coordinates_df, on='PostalCode')

In [90]:
# NOTE(review): test_df was built before the merge above, so it carries no
# Latitude/Longitude columns; displaying df (or rebuilding test_df from the merged
# df) may be what was intended here — confirm.
test_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill,Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,East Toronto
6,M1R,Scarborough,"Wexford,Maryvale"
7,M9V,Etobicoke,"South Steeles,Silverstone,Jamestown,Mount Oliv..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower,King and Spadina,Railway Lands,Harbou..."


## Week 3 - Part 3: Explore and cluster the neighborhoods in Toronto

In [91]:
#Geopy library to get Latitude and Longitude of Toronto
address = 'Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; stop with a clear
# message instead of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [97]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row of df, with a "neighborhood, borough" popup
marker_rows = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [93]:
# Keep only the rows whose borough is Downtown Toronto and renumber them from 0
is_downtown = df['Borough'].eq('Downtown Toronto')
df_Toronto = df.loc[is_downtown].reset_index(drop=True)
df_Toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District,Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M5E,Downtown Toronto,Downtown Toronto,43.644771,-79.373306
4,M5G,Downtown Toronto,Bay Street,43.657952,-79.387383


In [94]:
# Toronto was already geocoded earlier in the notebook (address, geolocator, location,
# latitude and longitude are still in scope), so those values are reused here instead
# of sending a second, identical request to the Nominatim service.
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [98]:
# Map of the neighborhoods located in Downtown Toronto only (rows of df_Toronto)
map_toronto_v2 = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per Downtown Toronto row, labelled "neighborhood, borough"
downtown_points = zip(
    df_Toronto['Latitude'],
    df_Toronto['Longitude'],
    df_Toronto['Borough'],
    df_Toronto['Neighborhood'],
)
for lat, lng, borough, neighborhood in downtown_points:
    popup_text = '{}, {}'.format(neighborhood, borough)
    circle = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    circle.add_to(map_toronto_v2)

map_toronto_v2

### Maps are saved and shared as images separately.