# Segmenting and Clustering Neighborhoods in Toronto

## Part 1

#### Import libraries

In [1]:
import pandas as pd
import numpy as np

#### Read the web source

In [2]:
# pd.read_html returns a list with one DataFrame per <table> found on the page.
# Bind the result to a name so the download is not thrown away and the parsed
# tables can be inspected or reused.
wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
print("Done")

Done


#### Convert the table to a dataframe

In [3]:
# read_html yields one DataFrame per HTML table on the page; take the first one.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
parsed_tables = pd.read_html(wiki_url)
df_orig = parsed_tables[0]
df_orig.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Delete lines with "Not assigned" in column _Borough_

In [4]:
# Keep only the rows whose Borough does not contain "Not assigned".
# - regex=False matches the text literally instead of as a regular expression.
# - reset_index is chained (not inplace) so we never mutate a filtered slice of
#   df_orig in place, which risks SettingWithCopyWarning, and so the cell gives
#   the same result every time it is re-run.
df = df_orig[~df_orig['Borough'].str.contains('Not assigned', regex=False)].reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Check if there are any multiple entries for M5A

In [5]:
# Show every row whose postal code is M5A; more than one row would mean duplicates.
df[df['Postal Code'].eq('M5A')]

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Only one

#### Check if there are any entries with _Not assigned_

In [6]:
# Show every row whose Neighbourhood is still the placeholder "Not assigned".
df[df['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postal Code,Borough,Neighbourhood


None

#### Let's compare the number of rows before and after deleting the rows with "Not assigned" in column _Borough_

In [7]:
# (rows, columns) of the table as scraped, before any filtering.
df_orig.shape

(180, 3)

In [8]:
# (rows, columns) after the "Not assigned" boroughs were removed.
df.shape

(103, 3)

The number of rows decreased from 180 to 103.

## Part 2

#### Import libraries

In [9]:
import geocoder # import geocodera
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#### Load the CSV file with Canadian postal codes and their coordinates

In [10]:
# Lookup table of postal code -> latitude/longitude, read from a local CSV file.
toronto_data = pd.read_csv('Indexes_CA.csv')
toronto_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,T0A,54.766,-111.7174
1,T0B,53.0727,-111.5816
2,T0C,52.1431,-111.6941
3,T0E,53.6758,-115.0948
4,T0G,55.6993,-114.4529


#### Merge two dataframes by Postal code

In [11]:
# Left join: every row of df is kept and gains the Latitude/Longitude of its
# postal code where the lookup table has a match.
df1 = pd.merge(df, toronto_data, on='Postal Code', how='left')
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


## Part 3

#### Import libraries

In [12]:
import folium
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Let's look up the coordinates of Toronto to use as the starting point of the map visualization

In [13]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail here with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### For the clustering analysis, only the boroughs whose name contains the word Toronto are selected

In [14]:
# Keep only boroughs whose name contains the literal text "Toronto".
# reset_index is chained (not inplace) so df2 is a fresh, independent frame:
# a later cell adds a column to it, and mutating a filtered slice of df1 in
# place risks SettingWithCopyWarning.
df2 = df1[df1['Borough'].str.contains('Toronto', regex=False)].reset_index(drop=True)
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
3,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
4,M4E,East Toronto,The Beaches,43.6784,-79.2941
5,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754
6,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386
7,M6G,Downtown Toronto,Christie,43.6683,-79.4205
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6496,-79.3833
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.6655,-79.4378


#### Let's visualize these boroughs on the map

In [15]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per row of df2; the popup text is "<neighbourhood>, <borough>".
marker_rows = zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighbourhood'])
for lat, lng, borough, neighbourhood in marker_rows:
    popup = folium.Popup(f'{neighbourhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)
map_toronto



#### Using KMeans, each row is assigned to one of 5 clusters and the `Cluster Names` column holding the cluster label is added to the data frame

In [16]:
kclusters = 5

# Cluster on the coordinates only. Selecting the two columns by name replaces
# drop([...], 1): passing the axis positionally is rejected by pandas >= 2.0,
# and an explicit selection also keeps a previously added label column out of
# the features when the cell is re-run.
toronto_clusters = df2[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clusters)

# Remove labels left over from an earlier run so insert() does not raise
# "already exists"; on the first run there is nothing to drop.
df2 = df2.drop(columns='Cluster Names', errors='ignore')
df2.insert(0, 'Cluster Names', kmeans.labels_)

In [17]:
# Preview the first rows to confirm the cluster label column was added.
df2.head()

Unnamed: 0,Cluster Names,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
1,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
2,4,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
3,4,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
4,2,M4E,East Toronto,The Beaches,43.6784,-79.2941


#### The 5 clusters on the map of Toronto

In [19]:
# create map
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(df2['Latitude'], df2['Longitude'], df2['Cluster Names']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to kclusters - 1, so they index the palette
    # directly; the former `cluster - 1` only worked for label 0 because a
    # negative index wraps around to the last colour.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

To see the rendered maps, open the notebook in nbviewer: https://nbviewer.jupyter.org/github/povorot/Learning/blob/main/Toronto.ipynb