# Importing required libraries:

In [1]:
!pip install beautifulsoup4
!pip install lxml
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import random # library for random number generation

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML
from IPython.display import display_html

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')
print('folium installed.')

Solving environment: done

# All requested packages already installed.

Libraries imported.
folium installed.


# Data acquisition and cleaning:

In [2]:
## Data acuisation from wikipedia page
Data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
BS=BeautifulSoup(Data,'lxml')
print(BS.title)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [3]:
#table = str(BS.table)
table = BS.find('table',{'class':'wikitable sortable'})
#display_html(table,raw=True)

#### Change the Html table to Panda DataFrame

In [4]:
table_rows = table.find_all('tr')  # to read table rows

In [5]:
dataset = []
for row in table_rows:
    dataset.append([t.text.strip() for t in row.find_all('td')])

df = pd.DataFrame(dataset, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df = df[~df['PostalCode'].isnull()]  # to filter out bad rows
print(df.head())
print(df.tail())

  PostalCode           Borough     Neighbourhood
1        M1A      Not assigned      Not assigned
2        M2A      Not assigned      Not assigned
3        M3A        North York         Parkwoods
4        M4A        North York  Victoria Village
5        M5A  Downtown Toronto      Harbourfront
    PostalCode       Borough          Neighbourhood
283        M8Z     Etobicoke              Mimico NW
284        M8Z     Etobicoke     The Queensway West
285        M8Z     Etobicoke  Royal York South West
286        M8Z     Etobicoke         South of Bloor
287        M9Z  Not assigned           Not assigned


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 287 entries, 1 to 287
Data columns (total 3 columns):
PostalCode       287 non-null object
Borough          287 non-null object
Neighbourhood    287 non-null object
dtypes: object(3)
memory usage: 9.0+ KB


In [8]:
df.shape

(287, 3)

#### To Ignore "Not assigned" celles from "Borough" column

In [9]:
df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


In [10]:
# Reset the index
df1= df.reset_index()
df2=df1[["PostalCode","Borough","Neighbourhood"]]
df2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [11]:
# Combining the neighbourhoods with same Postalcode
df3 = df2.groupby(['PostalCode','Borough'], sort=False).agg(', '.join)
df3.reset_index(inplace=True)
df3.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Downtown Toronto,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [12]:
# Replacing the name of the neighbourhoods which are 'Not assigned' with names of Borough
df3['Neighbourhood'] = np.where(df3['Neighbourhood'] == 'Not assigned',df3['Borough'], df3['Neighbourhood'])

df3.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [13]:
df3.shape

(103, 3)

# Lattitud and Longtiude coordinates of Toronto's Neighborhoods

In [14]:
#### Latitude and Longitude of Toronto's Neighborhoof is find from "http://cocl.us/Geospatial_data"
lat_lon = pd.read_csv('https://cocl.us/Geospatial_data')
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging df3 and lat_long tables

In [15]:
lat_lon.rename(columns={'Postal Code':'PostalCode'}, inplace=True) #similirize the 'PostalCode' column name
df4= pd.merge(df3,lat_lon,on='PostalCode')
df4.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


# Exploring and Clustering the Borough having the word 'Toronto'

In [16]:
df5 = df4[df4['Borough'].str.contains('Toronto',regex=False)]
df5.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
9,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259


In [17]:
df5.shape

(39, 5)

#### Let's get the geographical coordinates of Toronto,Canada

In [18]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          91 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geographiclib-1.50   | 34 KB     | ##################################### | 100% 
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [19]:
# finding Latitude and Longitude of Toronto, CA for Folium map
address = 'Toronto, CA'
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Visualizing the Dataframe of Toronto Neighborhood Borough name containe 'Toronto'

In [20]:
Toronto_map = folium.Map(location=[43.653963,-79.387207],zoom_start=12)

for lat,lng,borough,neighbourhood in zip(df5['Latitude'],df5['Longitude'],df5['Borough'],df5['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='red',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.8,
    parse_html=False).add_to(Toronto_map)
Toronto_map

# Clustering Toronto's Neighborhoods using K-mean

In [21]:
# set number of clusters
kclusters = 6
toronto_clustering = df5.drop(['PostalCode','Neighbourhood','Borough'], 1)
Kmeans = KMeans(n_clusters = kclusters,random_state=0).fit(toronto_clustering)
# check cluster labels generated for each row in the dataframe
Kmeans.labels_ 

array([0, 5, 0, 0, 3, 0, 0, 4, 0, 1, 0, 4, 3, 0, 4, 3, 0, 3, 2, 2, 2, 2,
       1, 2, 4, 1, 2, 4, 1, 2, 4, 2, 0, 0, 0, 0, 0, 0, 3], dtype=int32)

In [22]:
#insert the created cluster labels on the dataframe
df6=df5[['PostalCode','Neighbourhood','Borough','Latitude','Longitude']]
df6.insert(0,'Cluster Labels', Kmeans.labels_)

In [23]:
df6.head()

Unnamed: 0,Cluster Labels,PostalCode,Neighbourhood,Borough,Latitude,Longitude
2,0,M5A,Harbourfront,Downtown Toronto,43.65426,-79.360636
5,5,M9A,Queen's Park,Downtown Toronto,43.667856,-79.532242
9,0,M5B,"Ryerson, Garden District",Downtown Toronto,43.657162,-79.378937
15,0,M5C,St. James Town,Downtown Toronto,43.651494,-79.375418
19,3,M4E,The Beaches,East Toronto,43.676357,-79.293031


# Visualize the resulting Clusters

In [24]:
# create map
Toronto_clusters = folium.Map(location= [43.653963,-79.387207],zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df6['Latitude'], df6['Longitude'], df6['Neighbourhood'], df6['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(Toronto_clusters)
       
Toronto_clusters