@@@@@@SECTION 1 SUBMISSION@@@@@@
In this section we read the Wikipedia list of Canadian postal codes beginning with M and then tidy up the dataframe to:
1. remove rows where the borough is 'Not assigned'
2. merge rows where postcode and borough are the same, joining their neighbourhoods into one comma-separated string
3. for remaining rows where the neighbourhood is 'Not assigned', set it equal to the borough

In [1]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# pd.read_html returns a list with one DataFrame per HTML <table> found on the page.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
table = pd.read_html(wiki_url)

In [3]:
# Work with the first table parsed from the page.
# NOTE(review): assumes the postcode table is the first <table> on the page — confirm if the page layout changes.
df = table[0]

In [4]:
# Spot-check a postcode before cleaning: M1A is expected to show an unassigned borough.
df[df['Postcode'] == 'M1A']

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned


In [5]:
# Keep only rows whose borough has been assigned.
# A boolean filter builds a new frame instead of dropping in place, so the
# original parsed table (which `df` aliased) is left untouched and the cell
# can be re-run safely.
df = df[df['Borough'] != 'Not assigned']

In [6]:
# Same spot-check after filtering: M1A should no longer be present.
df[df['Postcode'] == 'M1A']

Unnamed: 0,Postcode,Borough,Neighbourhood


In [7]:
# Before merging, a postcode can occupy several rows (one per neighbourhood).
df[df['Postcode'] == 'M6A']

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [8]:
# Collapse rows that share (Postcode, Borough) into one row whose
# Neighbourhood is the comma-separated list of the group's neighbourhoods.
neighbourhoods_by_code = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
df = neighbourhoods_by_code.agg(', '.join).reset_index()

In [9]:
# After merging, M6A should be a single row with its neighbourhoods joined.
df[df['Postcode'] == 'M6A']

Unnamed: 0,Postcode,Borough,Neighbourhood
71,M6A,North York,"Lawrence Heights, Lawrence Manor"


In [10]:
# M9A has a borough but its neighbourhood is still 'Not assigned'.
df[df['Postcode'] == 'M9A']

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Not assigned


In [11]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# Assigning through .loc writes into `df` itself; assigning to the row object
# yielded by iterrows() is not guaranteed to do so, because that row may be a
# copy rather than a view of the frame.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

In [12]:
# Confirm the neighbourhood for M9A now matches its borough.
df[df['Postcode'] == 'M9A']

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Queen's Park


In [13]:
# (rows, columns) of the cleaned frame: one row per (Postcode, Borough) pair.
df.shape

(103, 3)

@@@@@@END OF SECTION 1 SUBMISSION@@@@@@

@@@@@@SECTION 2 SUBMISSION@@@@@@
This section downloads the latitude and longitude of each postal code and adds them to the dataframe

In [14]:
import io
import requests

# Download the postal-code -> latitude/longitude lookup table.
url="https://cocl.us/Geospatial_data"
response = requests.get(url, timeout=30)  # do not hang forever on a stalled connection
response.raise_for_status()               # fail loudly instead of parsing an error page as CSV
s=response.content
c=pd.read_csv(io.StringIO(s.decode('utf-8')))

In [15]:
# Display the coordinate lookup table (columns: Postal Code, Latitude, Longitude).
c

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [16]:
# Attach coordinates to each postcode row; the join key is named differently
# in the two frames, so the redundant right-hand key is dropped afterwards.
dfnew = (
    df.merge(c, left_on='Postcode', right_on='Postal Code')
      .drop(columns=['Postal Code'])
)

In [17]:
# Display the merged frame: postcode, borough, neighbourhood(s) and coordinates.
dfnew

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


@@@@@@END OF SECTION 2 SUBMISSION@@@@@@

@@@@@@SECTION 3 SUBMISSION@@@@@@
In this section we will cluster the neighbourhoods and then visualise

Keep only the rows whose borough name contains "Toronto":

In [18]:
# Boolean mask: True for rows whose Borough contains the substring "Toronto".
is_toronto = dfnew['Borough'].str.contains("Toronto")
dfnew = dfnew[is_toronto]

Import the plotting and clustering packages, install geopy, and look up the coordinates of Toronto

In [20]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [21]:
# set number of clusters
kclusters = 5

# Cluster on coordinates only. Selecting the two feature columns by name
# (rather than dropping the others with a positional axis argument) works on
# current pandas and keeps the features stable if extra columns, such as the
# cluster labels, are added to dfnew and this cell is re-run.
dfnew_clustering = dfnew[['Latitude', 'Longitude']]
dfnew_clustering

Unnamed: 0,Latitude,Longitude
37,43.676357,-79.293031
41,43.679557,-79.352188
42,43.668999,-79.315572
43,43.659526,-79.340923
44,43.72802,-79.38879
45,43.712751,-79.390197
46,43.715383,-79.405678
47,43.704324,-79.38879
48,43.689574,-79.38316
49,43.686412,-79.400049


RUN CLUSTERING

In [22]:
# run k-means clustering
# n_init is pinned so the number of random restarts (and therefore the chosen
# solution) does not depend on the installed scikit-learn's default.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(dfnew_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

ADD CLUSTER DATA TO DATAFRAME

In [23]:
# add clustering labels
dfnew.insert(0, 'Cluster Labels', kmeans.labels_)

In [24]:
#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library



CREATE MAP OF CLUSTERS

In [25]:
# create map centred on Toronto
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per postcode, coloured by its cluster
for lat, lon, poi, cluster in zip(dfnew['Latitude'], dfnew['Longitude'], dfnew['Neighbourhood'], dfnew['Cluster Labels']):
    # Labels start at 0, so label 0 maps to index -1 (the last colour) and
    # every cluster still receives a distinct colour.
    marker_color = rainbow[cluster - 1]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

@@@@@@END OF SECTION 3 SUBMISSION@@@@@@