In [37]:

import pandas as pd  
import numpy as np  
import random  
from bs4 import BeautifulSoup
import requests

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Folium installed
Libraries imported.


In [46]:
# Download the Wikipedia page listing the "M" postal codes and parse it.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of letting an
# error page be handed to the parser.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source,'lxml')


In [7]:
# Empty accumulators, one per column of the scraped table
postalCodeList, boroughList, neighborhoodList = [], [], []

In [8]:
# Fill the three lists from the rows of the first <table> on the page.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Start from empty lists so re-running this cell does not append duplicate rows.
for column_values in (postalCodeList, boroughList, neighborhoodList):
    column_values.clear()

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows with fewer than three <td> cells cannot supply all three columns,
    # so skip them instead of raising IndexError on cells[2].
    if len(cells) >= 3:
        # Postal code and borough keep their raw text (including any trailing
        # newline); only the neighborhood text is stripped of trailing newlines.
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighborhoodList.append(cells[2].text.rstrip('\n'))

In [9]:
# Assemble the scraped lists into one table, one column per list
toronto_df = pd.DataFrame(
    dict(
        PostalCode=postalCodeList,
        Borough=boroughList,
        Neighborhood=neighborhoodList,
    )
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,
1,M2A\n,Not assigned\n,
2,M3A\n,North York\n,Parkwoods
3,M4A\n,North York\n,Victoria Village
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"


In [10]:
# Discard rows whose borough is "Not assigned" (the scraped value still
# carries its trailing newline) and renumber the remaining rows
has_borough = toronto_df["Borough"].ne("Not assigned\n")
toronto_df = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods
1,M4A\n,North York\n,Victoria Village
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"


In [11]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood value is the comma-separated list of their neighborhoods
postal_groups = toronto_df.groupby(["PostalCode", "Borough"], as_index=False)
toronto_df_grouped = postal_groups.agg({"Neighborhood": ", ".join})
toronto_df_grouped.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B\n,Scarborough\n,"Malvern, Rouge"
1,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
2,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
3,M1G\n,Scarborough\n,Woburn
4,M1H\n,Scarborough\n,Cedarbrae


In [12]:
# (rows, columns) of the grouped table
toronto_df_grouped.shape

(103, 3)

In [13]:
# Load the CSV of per-postal-code coordinates from its hosted location
csv_path = "https://cocl.us/Geospatial_data"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Rename the key column so it matches the "PostalCode" column of the scraped table
df = df.rename(columns={"Postal Code": "PostalCode"})
df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
# Tidy the scraped text: turn " /" separators in the neighborhood list into
# commas, then remove the newline characters left in the key columns
toronto_df_grouped['Neighborhood'] = toronto_df_grouped['Neighborhood'].str.replace(' /', ',')
for column in ('PostalCode', 'Borough'):
    toronto_df_grouped[column] = toronto_df_grouped[column].str.replace('\n', '')
toronto_df_grouped.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [16]:
# Spot check: compare the first value of the first column in both tables
# (only the first row is compared, not the whole column)
toronto_df_grouped.iat[0, 0] == df.iat[0, 0]

True

In [17]:
# Attach the coordinates to each postal code by joining on the shared key column
toronto_df_new = toronto_df_grouped.merge(df, on='PostalCode')
toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [18]:
# (rows, columns) of the merged table
toronto_df_new.shape

(103, 5)

In [35]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [20]:
# Restrict the analysis to the four boroughs whose name contains "Toronto"
areas = ['Downtown Toronto','East Toronto','Central Toronto','West Toronto']
in_toronto = toronto_df_new['Borough'].isin(areas)
# reset_index() keeps the previous row labels in a new "index" column
toronto_data = toronto_df_new.loc[in_toronto].reset_index()
toronto_data.head()
        
        

Unnamed: 0,index,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [21]:
# Remove the leftover "index" column created by reset_index().
# errors='ignore' makes the cell safe to re-run: a second run finds no
# "index" column and leaves the frame unchanged instead of raising KeyError.
toronto_data = toronto_data.drop(columns=['index'], errors='ignore')
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [23]:
# Find the latitude and longitude of Toronto, used to centre the maps
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [24]:
# create a map centred on the geocoded Toronto latitude and longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, with the neighborhood name as its popup
neighborhood_points = zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'])
for point_lat, point_lng, name in neighborhood_points:
    popup = folium.Popup(name, parse_html=True)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

In [33]:
# Count the rows per borough; the number of distinct boroughs guides the choice of cluster count
toronto_data.Borough.value_counts()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

In [50]:
# one-hot encode the borough of every row (column names are the bare borough names)
toronto_onehot = pd.get_dummies(toronto_data['Borough'], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_data['Neighborhood']

# rotate the last column (the neighborhood name) to the front
*leading_columns, last_column = toronto_onehot.columns
fixed_columns = [last_column, *leading_columns]
toronto_onehot = toronto_onehot.loc[:, fixed_columns]

toronto_onehot.head()


Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,The Beaches,0,0,1,0
1,"The Danforth West, Riverdale",0,0,1,0
2,"India Bazaar, The Beaches West",0,0,1,0
3,Studio District,0,0,1,0
4,Lawrence Park,1,0,0,0


In [51]:
# set number of clusters
kclusters = 4

# Feature matrix: the one-hot columns only.
# - `columns=` replaces the positional axis argument, which pandas 2.0 removed.
# - 'Cluster Labels' is also excluded (ignored when absent) so that re-running
#   this cell after the labels were inserted does not feed them back in as a feature.
toronto_grouped_clustering = toronto_onehot.drop(columns=['Neighborhood', 'Cluster Labels'], errors='ignore')

# run k-means clustering; n_init is pinned to the historical default of 10 so
# results do not change with the newer scikit-learn default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows
kmeans.labels_[0:10]

array([0, 0, 0, 0, 2, 2, 2, 2, 2, 2], dtype=int32)

In [71]:
# add clustering labels as the second column.
# Any existing 'Cluster Labels' column is dropped first so the cell can be
# re-run: DataFrame.insert raises ValueError when the column already exists.
toronto_onehot = toronto_onehot.drop(columns='Cluster Labels', errors='ignore')
toronto_onehot.insert(1, 'Cluster Labels', kmeans.labels_)
toronto_onehot.head()

Unnamed: 0,Neighborhood,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,The Beaches,0,0,0,1,0
1,"The Danforth West, Riverdale",0,0,0,1,0
2,"India Bazaar, The Beaches West",0,0,0,1,0
3,Studio District,0,0,0,1,0
4,Lawrence Park,2,1,0,0,0


In [77]:
# Attach the cluster label and one-hot columns to the coordinate table.
# toronto_onehot was built row-for-row from toronto_data, so the two frames
# share the same index; joining on the index keeps exactly one output row per
# input row, whereas joining on the neighborhood name would multiply rows if
# a name ever appeared more than once.
cluster_columns = toronto_onehot.drop(columns='Neighborhood')
toronto_merged = toronto_data.join(cluster_columns)
assert len(toronto_merged) == len(toronto_data), "join changed the number of rows"
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,0,0,1,0
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,0,0,1,0
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,0,0,0,1,0
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,0,0,1,0
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,1,0,0,0


In [80]:
# map of Toronto with every neighborhood colored by its cluster label
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex color per cluster, evenly spaced
# along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # Labels run from 0 to kclusters-1, so index the palette directly;
    # `cluster - 1` only worked for label 0 through negative-index wraparound.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters