## Week 3 - Segmenting and Clustering Neighborhoods in Toronto

In [62]:
!pip install lxml

import pandas as pd 
import numpy as np 



In [77]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [9]:
import requests
!pip install beautifulsoup4
from bs4 import BeautifulSoup

source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source,'lxml')
print(soup.title)

from IPython.display import display_html
tab = str(soup.table)
# display_html(tab,raw=True)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [10]:
# print(tab)

### Convert html table into Pandas DataFrame

In [41]:
dfs = pd.read_html(tab)
df_temp=dfs[0]
# df.head()
df_list = df_temp.values.tolist()
new_list = []
for row in df_list:
#     print(row, '\n\n')
    new_list.extend(row)
new_list

['M1ANot assigned',
 'M2ANot assigned',
 'M3ANorth York(Parkwoods)',
 'M4ANorth York(Victoria Village)',
 'M5ADowntown Toronto(Regent Park / Harbourfront)',
 'M6ANorth York(Lawrence Manor / Lawrence Heights)',
 "M7AQueen's Park(Ontario Provincial Government)",
 'M8ANot assigned',
 'M9AEtobicoke(Islington Avenue)',
 'M1BScarborough(Malvern / Rouge)',
 'M2BNot assigned',
 'M3BNorth York(Don Mills)North',
 'M4BEast York(Parkview Hill / Woodbine Gardens)',
 'M5BDowntown Toronto(Garden District, Ryerson)',
 'M6BNorth York(Glencairn)',
 'M7BNot assigned',
 'M8BNot assigned',
 'M9BEtobicoke(West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale)',
 'M1CScarborough(Rouge Hill / Port Union / Highland Creek)',
 'M2CNot assigned',
 'M3CNorth York(Don Mills)South(Flemingdon Park)',
 'M4CEast York(Woodbine Heights)',
 'M5CDowntown Toronto(St. James Town)',
 'M6CYork(Humewood-Cedarvale)',
 'M7CNot assigned',
 'M8CNot assigned',
 'M9CEtobicoke(Eringate / Bloordale Gardens / Old Bu

In [42]:
table_contents = []
for new_row in new_list:
#     print(new_row)
    cell = {}
    cell['Postcode'] = new_row[:3]
#     print(cell['Postcode'])
#     print(new_row[3:])
    if new_row[3:] == 'Not Assigned':
        cell['Borough'] = 'Not Assigned'
        cell['Neighbourhood'] = 'Not Assigned'
    else:
        cell['Borough'] = (new_row[3:]).split('(')[0]
        cell['Neighbourhood'] = (((test).split('(')[1]).strip(')')).replace(' /',',').replace(')',' ')
        table_contents.append(cell)

print(table_contents[:5])

[{'Postcode': 'M1A', 'Borough': 'Not assigned', 'Neighbourhood': 'Regent Park, Harbourfront'}, {'Postcode': 'M2A', 'Borough': 'Not assigned', 'Neighbourhood': 'Regent Park, Harbourfront'}, {'Postcode': 'M3A', 'Borough': 'North York', 'Neighbourhood': 'Regent Park, Harbourfront'}, {'Postcode': 'M4A', 'Borough': 'North York', 'Neighbourhood': 'Regent Park, Harbourfront'}, {'Postcode': 'M5A', 'Borough': 'Downtown Toronto', 'Neighbourhood': 'Regent Park, Harbourfront'}]


In [35]:
# test
# test = 'Downtown Toronto(Regent Park / Harbourfront)'
# (((test).split('(')[1]).strip(')')).replace(' /',',').replace(')',' ')

'Regent Park / Harbourfront)'

In [43]:
df=pd.DataFrame(table_contents)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,"Regent Park, Harbourfront"
1,M2A,Not assigned,"Regent Park, Harbourfront"
2,M3A,North York,"Regent Park, Harbourfront"
3,M4A,North York,"Regent Park, Harbourfront"
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data preprocessing and cleaning

In [45]:
# Dropping the rows where Borough is 'Not assigned'
df1 = df[df.Borough != 'Not assigned']

# Combining the neighbourhoods with same Postalcode
df2 = df1.groupby(['Postcode','Borough'], sort=False).agg(', '.join)
df2.reset_index(inplace=True)

# Replacing the name of the neighbourhoods which are 'Not assigned' with names of Borough
df2['Neighbourhood'] = np.where(df2['Neighbourhood'] == 'Not assigned',df2['Borough'], df2['Neighbourhood'])

df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,"Regent Park, Harbourfront"
1,M4A,North York,"Regent Park, Harbourfront"
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Regent Park, Harbourfront"
4,M7A,Queen's Park,"Regent Park, Harbourfront"


In [47]:
print("df: ", df.shape)
print("df1: ", df1.shape)
print("df2: ", df2.shape)

df:  (180, 3)
df1:  (103, 3)
df2:  (103, 3)


In [48]:
lat_lon = pd.read_csv('https://cocl.us/Geospatial_data')
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge two tables, for getting the Latitudes and Longitudes for different neighbourhoods

In [49]:
lat_lon.rename(columns={'Postal Code':'Postcode'},inplace=True)
df3 = pd.merge(df2,lat_lon,on='Postcode')
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,"Regent Park, Harbourfront",43.753259,-79.329656
1,M4A,North York,"Regent Park, Harbourfront",43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Regent Park, Harbourfront",43.718518,-79.464763
4,M7A,Queen's Park,"Regent Park, Harbourfront",43.662301,-79.389494


In [51]:
print("df3: ", df3.shape)

df3:  (103, 5)


## Clustering and Plotting

### Keep only boroughs contain "Toronto"

In [131]:
df4 = df3[df3['Borough'].str.contains('Toronto',regex=False)]
df4.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Regent Park, Harbourfront",43.657162,-79.378937
15,M5C,Downtown Toronto,"Regent Park, Harbourfront",43.651494,-79.375418
19,M4E,East Toronto,"Regent Park, Harbourfront",43.676357,-79.293031
20,M5E,Downtown Toronto,"Regent Park, Harbourfront",43.644771,-79.373306


In [80]:
print("df4: ", df4.shape)

df4:  (39, 5)


### Visualizing function - Folium

In [87]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [130]:
## test
# num_color = 10
# x = np.arange(num_color)
# ys = [i + x + (i*x)**2 for i in range(num_color)]
# colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
# rainbow = [colors.rgb2hex(i) for i in colors_array]
# rainbow

In [127]:
def plot_toronto(df, k=1):
    
    if k > 1:
        # create map
        map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

        # set color scheme for the clusters
        x = np.arange(k)
        ys = [i + x + (i*x)**2 for i in range(k)]
        colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
        rainbow = [colors.rgb2hex(i) for i in colors_array]

        # add markers to the map
        # markers_colors = []
        for lat, lon, neighbourhood, cluster in zip(df4['Latitude'], df4['Longitude'], df4['Neighbourhood'], df4['Cluster Labels']):
            label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
            folium.CircleMarker(
                [lat, lon],
                radius=5,
                popup=label,
                color=rainbow[cluster-1],
                fill=True,
                fill_color=rainbow[cluster-1],
                fill_opacity=0.7).add_to(map_clusters)
        return map_clusters

    else:
        # initialte the map
        map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)
        
        for lat,lng,borough,neighbourhood in zip(df['Latitude'],df['Longitude'],df['Borough'],df['Neighbourhood']):
            label = '{}, {}'.format(neighbourhood, borough)
            label = folium.Popup(label, parse_html=True)
            folium.CircleMarker([lat,lng],
                                radius=5,
                                popup=label,
                                color='blue',
                                fill=True,
                                fill_color='#3186cc',
                                fill_opacity=0.7,
                                parse_html=False).add_to(map_toronto)

        return map_toronto


### Plot Toronto map with Borough contains Toronto

In [128]:
plot_toronto(df4, )

### KMeans clsutering of the neighbourhoods

In [132]:
from sklearn.cluster import KMeans

k=8
toronto_clustering = df4.drop(['Postcode','Borough','Neighbourhood'],1)
k_means = KMeans(n_clusters = k,random_state=0).fit(toronto_clustering)
print(k_means.labels_)
df4.insert(0, 'Cluster Labels', k_means.labels_)

[0 0 0 3 0 0 1 0 2 5 0 7 5 0 7 3 0 5 4 4 4 4 2 4 1 2 4 1 2 6 0 1 0 6 0 6 0
 6 3]


In [133]:
df4.head()

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,0,M5B,Downtown Toronto,"Regent Park, Harbourfront",43.657162,-79.378937
15,0,M5C,Downtown Toronto,"Regent Park, Harbourfront",43.651494,-79.375418
19,3,M4E,East Toronto,"Regent Park, Harbourfront",43.676357,-79.293031
20,0,M5E,Downtown Toronto,"Regent Park, Harbourfront",43.644771,-79.373306


In [134]:
plot_toronto(df4, k)