# Toronto Neighborhood Segmenting and Clustering

This project will segment and cluster the neighborhoods in Toronto based on postal code and borough information.  The neighborhood information will be taken from a Wikipedia page listing the postal codes together with their boroughs and neighborhoods.

In [1]:
import pandas as pd
import requests

#Web-Scraping the Wikipedia page
from bs4 import BeautifulSoup 

#Import the CSV file
import csv

# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alpha

#### The first step of the project scrapes the data from the Wikipedia table and converts it into a pandas dataframe.  The code uses the BeautifulSoup library to read each td element within the table element and turn it into one row.  Borough values that are read incorrectly because of extra content in the td are then corrected.

In [2]:
# Download the Wikipedia page. The code below reads the first <table> on the
# page and treats every <td> in it as one postal-code entry.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# an HTTP error page from being parsed as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

soup = BeautifulSoup(data, 'html5lib')

table_contents = []
postal_table = soup.find("table")
if postal_table is None:
    raise ValueError('No <table> element found at {}'.format(url))

for row in postal_table.find_all('td'):
    # Each usable cell carries the postal code in <b> and the
    # "Borough(Neighbourhood / Neighbourhood)" text in <span>; cells without
    # that markup cannot be parsed and are skipped.
    if row.b is None or row.span is None:
        continue

    details = row.span.text
    if details == 'Not assigned':
        continue

    # Text before the first '(' is the borough; text up to the next '(' is the
    # neighbourhood list.
    parts = details.split('(')

    cell = {}
    cell['PostalCode'] = row.b.text
    cell['Borough'] = parts[0]
    if len(parts) > 1:
        # Drop the closing parenthesis and turn " /" separators into commas.
        cell['Neighborhood'] = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised neighbourhood list: fall back to the borough name
        # instead of failing with an IndexError.
        cell['Neighborhood'] = parts[0]
    table_contents.append(cell)

if not table_contents:
    raise ValueError('No postal code entries could be parsed from {}'.format(url))

df = pd.DataFrame(table_contents)

# A few cells have extra text fused onto the borough name; map those exact
# strings to readable borough labels.
df['Borough'] = df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### In this step, the latitude and longitude are added from the Geospatial_Coordinates.csv file.  The file is read into a dataframe and left-merged with the borough dataframe on the postal code.  The duplicate Postal Code column from the CSV file is then excluded from the resulting table.

In [3]:
# Load the postal-code coordinates and attach them to the scraped table.
geo_coord = pd.read_csv('Geospatial_Coordinates.csv')

# Left join: every scraped postal code is kept, matched on its code.
pc_merged = pd.merge(
    df,
    geo_coord,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)

# Keep the scraped columns plus the coordinates; this leaves out the CSV's own
# 'Postal Code' key column.
kept_columns = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
postal_coord = pc_merged[kept_columns]
postal_coord.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


#### The data will be summarized to look at each borough and the number of neighborhoods in Toronto.

In [4]:
# Count the rows (one per postal code) that fall in each borough.
boroughs_count = postal_coord[['Borough','Neighborhood']]
# Group by the column label rather than by a Series pulled from the same frame:
# this is the documented form and does not depend on pandas recognising the
# Series as the frame's own 'Borough' column.
boroughs = boroughs_count.groupby('Borough', as_index=False).count()
print(boroughs)

# Overall totals: distinct borough labels and total rows in the table.
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(postal_coord['Borough'].unique()),
        postal_coord.shape[0]
    )
)

                   Borough  Neighborhood
0          Central Toronto             9
1         Downtown Toronto            17
2   Downtown Toronto Stn A             1
3             East Toronto             4
4    East Toronto Business             1
5                East York             4
6   East York/East Toronto             1
7                Etobicoke            11
8      Etobicoke Northwest             1
9              Mississauga             1
10              North York            24
11            Queen's Park             1
12             Scarborough            17
13            West Toronto             6
14                    York             5
The dataframe has 15 boroughs and 103 neighborhoods.


#### The geographical coordinates of Toronto are obtained by passing the address "Toronto, ONT" to the Nominatim geocoder.

In [5]:
# Look up the coordinates used to centre the first map.
address = 'Toronto, ONT'

geolocator = Nominatim(user_agent="tor_explorer")
# An explicit timeout gives the public Nominatim service more time to answer
# than geopy's short default.
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, ONT are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, ONT are 43.678523999999996, -79.62912913064454.


#### A map is centered on these coordinates, and a marker is added for each postal code using the latitude and longitude stored in the dataframe.

In [6]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# The coordinates come from a left merge, so a postal code missing from the CSV
# would carry NaN here; such rows cannot be placed on the map and are skipped.
postal_coord_located = postal_coord.dropna(subset=['Latitude', 'Longitude'])

# add one circle marker per postal code, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(postal_coord_located['Latitude'],
                                           postal_coord_located['Longitude'],
                                           postal_coord_located['Borough'],
                                           postal_coord_located['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium treat the label as plain text in the popup.
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

#### To look at a smaller area of Toronto, we keep only those boroughs whose name contains the keyword "Toronto".  This concentrates the analysis on the main limits of the city rather than on the surrounding suburbs.

In [7]:
# Keep only the rows whose borough name contains "Toronto", renumbered from 0.
is_toronto_borough = postal_coord['Borough'].str.contains('Toronto')
Toronto_boroughs = postal_coord.loc[is_toronto_borough].reset_index(drop=True)
Toronto_boroughs.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
9,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


#### The new Toronto dataframe is summarized to show the number of boroughs with the "Toronto" keyword and the total number of neighborhoods.

In [8]:
# Report how many distinct boroughs and how many rows the filtered table holds.
toronto_borough_total = len(Toronto_boroughs['Borough'].unique())
toronto_row_total = Toronto_boroughs.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(toronto_borough_total, toronto_row_total))

The dataframe has 7 boroughs and 39 neighborhoods.


#### To focus on the boroughs selected, a midpoint is established on which a new map will be centered.  In this instance, the midpoint is the mean of the latitudes and longitudes in the dataframe.

In [9]:
# Centre of the selected boroughs: the column-wise mean of their coordinates.
Toronto_point = Toronto_boroughs[['Latitude','Longitude']].mean()

# The result is a Series indexed by column name, so read the values by label;
# positional integer lookup on a label-indexed Series is deprecated in recent
# pandas releases.
latitude = Toronto_point['Latitude']
longitude = Toronto_point['Longitude']
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6677258974359, -79.38855562564103.


#### The new coordinates are used for the new map and the Toronto boroughs are added to it.

In [10]:
# create map of mid-Toronto using latitude and longitude values
map_mid_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Rows without coordinates cannot be placed on the map, so they are skipped.
Toronto_boroughs_located = Toronto_boroughs.dropna(subset=['Latitude', 'Longitude'])

# add one circle marker per postal code, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(Toronto_boroughs_located['Latitude'],
                                           Toronto_boroughs_located['Longitude'],
                                           Toronto_boroughs_located['Borough'],
                                           Toronto_boroughs_located['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium treat the label as plain text in the popup.
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_mid_toronto)

map_mid_toronto