# Part 1
In this section, we build the code to scrape the Wikipedia page (https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M) in order to obtain the data in its table of postal codes, and then transform that data into a Pandas dataframe.

In [23]:
# Import the required libraries
import pandas as pd #Library for data analysis
import numpy as np #Library for handling data in vectors
import requests #Library to handle requests

# !pip install beautifulsoup4 #Uncomment this installation if needed
from bs4 import BeautifulSoup #this package is used for web scraping

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium #Library for plotting maps

import matplotlib.cm as cm
import matplotlib.colors as colors

print("Libraries imported")

Solving environment: done

# All requested packages already installed.

Libraries imported


## Scraping the web page and reading the table

In [2]:
# Scrape the list of postal codes in Canada from the Wikipedia page and read the table
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging indefinitely, and raise_for_status() surfaces
# HTTP errors instead of letting an error page be parsed as if it were the article.
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()
source = response.text

# The page is HTML, so parse it with an HTML parser rather than the XML parser.
soup = BeautifulSoup(source, 'html.parser')

# The first <table> on the page is taken as the postal code table.
postalcode_table = soup.find('table')
if postalcode_table is None:
    raise ValueError("No <table> element found at " + wiki_url)

In [3]:
# Start with an empty dataframe that has the three target columns:
# PostalCode, Borough and Neighborhood
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
df_postalcode = pd.DataFrame(columns=column_names)
df_postalcode

Unnamed: 0,PostalCode,Borough,Neighborhood


In [4]:
# Iterate through postalcode_table and populate df_postalcode.
# The rows are collected in a plain list and the dataframe is built once at the end.
# This avoids the quadratic cost of growing a DataFrame one row at a time, and it makes
# the cell idempotent: re-running it rebuilds the frame instead of appending duplicates.
table_rows = []
for tr_cell in postalcode_table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    # Keep only rows that provide exactly one value per column; rows with a different
    # number of <td> cells (presumably the header row) are skipped.
    if len(row_data) == len(column_names):
        table_rows.append(row_data)

df_postalcode = pd.DataFrame(table_rows, columns=column_names)

df_postalcode.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Pre-processing and Cleaning

In [5]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df_postalcode['Borough'] != 'Not assigned'
df1 = df_postalcode[has_borough]
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Collapse rows that share the same PostalCode/Borough pair into a single row whose
# Neighborhood is the ', '-joined list of the original values (original order is kept)
grouped_by_code = df1.groupby(['PostalCode', 'Borough'], sort=False)
df2 = grouped_by_code.agg(', '.join).reset_index()

# Where the neighbourhood is 'Not assigned', fall back to the name of the Borough
neighborhood_missing = df2['Neighborhood'] == 'Not assigned'
df2['Neighborhood'] = np.where(neighborhood_missing, df2['Borough'], df2['Neighborhood'])

df2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
# Report the size of the cleaned dataframe as (rows, columns)
df2.shape

(103, 3)

# Part 2
In this section, we merge the latitudes and longitudes of the corresponding neighborhoods into the dataframe.

## Importing the Lat & Long for various neighborhoods in Canada

In [8]:
# Load the geospatial CSV into a dataframe and preview its first rows
lat_lon = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging the lat_lon dataframe with the PostalCode dataframe

In [9]:
# Give the key column the same name in both frames, then join the coordinates
# onto the postal codes (inner join on PostalCode)
lat_lon.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df_postalcode_latlon = df2.merge(lat_lon, how='inner', on='PostalCode')
df_postalcode_latlon.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Part 3
In this section, we shall explore and cluster the neighborhoods in Toronto

## Get all the rows that contain Toronto in their Borough

In [10]:
# Boolean mask: True where the Borough contains the literal text 'Toronto'
in_toronto = df_postalcode_latlon['Borough'].str.contains('Toronto', regex=False)
df_toronto = df_postalcode_latlon[in_toronto]
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Visualizing all the Neighborhoods of Toronto using Folium

In [17]:
# Coordinates on which the map is initially centred
lat_toronto = 43.651070
lon_toronto = -79.347015
map_toronto = folium.Map(location=[lat_toronto, lon_toronto], zoom_start=12)

# Add one circle marker per row of df_toronto, with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Borough'],
    df_toronto['Neighborhood'],
):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is a Popup argument, so it is set here and not on the CircleMarker,
    # whose extra keyword arguments are styling options for the drawn circle.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

## Using K-Means Clustering, cluster the neighborhoods of Toronto

In [20]:
k = 5  # Number of clusters

# Cluster on the coordinates only. Selecting the two columns by name (rather than dropping
# the other columns) keeps the cell re-runnable: a 'Cluster Labels' column left behind by a
# previous run is never fed into the model. It also avoids passing the axis to
# DataFrame.drop positionally, which pandas 2.0 no longer accepts.
toronto_clusters = df_toronto[['Latitude', 'Longitude']]

# n_init is pinned to 10, the long-standing scikit-learn default, so the clustering does
# not change with the installed version's default; random_state makes it reproducible.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(toronto_clusters)

# Put the labels in the first column, replacing any labels from an earlier run
# (DataFrame.insert raises if the column already exists).
df_toronto = df_toronto.drop(['Cluster Labels'], axis=1, errors='ignore')
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)

In [21]:
# Display the Toronto rows; 'Cluster Labels' was inserted as the first column
df_toronto

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [24]:
# Create the map for displaying neighborhood clusters
map_clusters = folium.Map(location=[lat_toronto, lon_toronto], zoom_start=12)

# Set the color scheme: k evenly spaced colours from the rainbow colormap, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per row of df_toronto, coloured by its cluster label
for lat, lon, cluster in zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Cluster Labels'],
):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # The label indexes `rainbow` directly (cluster 0 -> first colour), so cluster 0
    # no longer wraps around to the last colour through a negative index.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters