# Scraping data from the source and creating a dataframe

In [1]:
import pandas as pd
import numpy as np

In [2]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 19.5MB/s eta 0:00:01     |██████████████████████████▌     | 4.8MB 19.5MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
df=pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


In [4]:
df.dropna(inplace=True)
neighborhood_data=df.reset_index(drop=True)

In [5]:
neigh=neighborhood_data['Neighborhood'].str.replace('/', ',', regex=False)

In [6]:
neighborhood_data=pd.concat([neighborhood_data, neigh], axis=1)

In [7]:
neighborhood_data.drop(['Neighborhood'], inplace=True, axis=1)

In [8]:
neighborhood_data=pd.concat([neighborhood_data, neigh], axis=1)
neighborhood_data

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [9]:
neighborhood_data.sort_values(by=['Postal code'], ascending=True, inplace=True)

In [13]:
neigh=neighborhood_data.reset_index(drop=True)
neigh

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov..."
101,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam..."


In [14]:
neighborhood_data.shape

(103, 3)

# Adding the Latitudes and Longitudes to the dataframe

In [15]:
df_coords=pd.read_csv("http://cocl.us/Geospatial_data")
df_coords.set_index('Postal Code')

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476
...,...,...
M9N,43.706876,-79.518188
M9P,43.696319,-79.532242
M9R,43.688905,-79.554724
M9V,43.739416,-79.588437


In [16]:
coordinates=df_coords[['Latitude', 'Longitude']]

In [17]:
neigh_coordinates=pd.concat([neigh, coordinates], axis=1)

In [18]:
neigh_coordinates

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam...",43.739416,-79.588437


# Clustering of the neighborhood and plotting on the map

In [19]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.3

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    pandas-0.24.2              |   py36hf484d3e_0        11.1 MB  conda-forge
    requests-2.23.0            |     pyh8c360ce_2          47 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    jsonschema-3.2.0           |   py36h9f0ad1d_1          89 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    chardet-3.0.4              |py36h9f0ad1d_1006         188 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    toolz-0.10.0        

In [20]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.3

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.21.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.21.0         | 58 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###

In [21]:
address = 'Toronto City, Ontario'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [25]:
toronto_map=folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(neigh_coordinates['Latitude'], neigh_coordinates['Longitude'], neigh_coordinates['Borough'], neigh_coordinates['Neighborhood']):
    label= '{}, {}'.format(neighborhood, borough)
    label= folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat, lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.65,
    parse_html=False
    ).add_to(toronto_map)
    
toronto_map

In [30]:
borough=neigh_coordinates[['Borough', 'Latitude', 'Longitude']]
borough

Unnamed: 0,Borough,Latitude,Longitude
0,Scarborough,43.806686,-79.194353
1,Scarborough,43.784535,-79.160497
2,Scarborough,43.763573,-79.188711
3,Scarborough,43.770992,-79.216917
4,Scarborough,43.773136,-79.239476
...,...,...,...
98,York,43.706876,-79.518188
99,Etobicoke,43.696319,-79.532242
100,Etobicoke,43.688905,-79.554724
101,Etobicoke,43.739416,-79.588437


In [47]:
boroughs=pd.get_dummies(borough['Borough'])
boroughs=pd.concat([borough, boroughs], axis=1)
boroughs

Unnamed: 0,Borough,Latitude,Longitude,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,Scarborough,43.806686,-79.194353,0,0,0,0,0,0,0,1,0,0
1,Scarborough,43.784535,-79.160497,0,0,0,0,0,0,0,1,0,0
2,Scarborough,43.763573,-79.188711,0,0,0,0,0,0,0,1,0,0
3,Scarborough,43.770992,-79.216917,0,0,0,0,0,0,0,1,0,0
4,Scarborough,43.773136,-79.239476,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,York,43.706876,-79.518188,0,0,0,0,0,0,0,0,0,1
99,Etobicoke,43.696319,-79.532242,0,0,0,0,1,0,0,0,0,0
100,Etobicoke,43.688905,-79.554724,0,0,0,0,1,0,0,0,0,0
101,Etobicoke,43.739416,-79.588437,0,0,0,0,1,0,0,0,0,0


In [48]:
boroughs=boroughs[['Latitude','Longitude','Central Toronto', 'Downtown Toronto','East Toronto','East York','Etobicoke','Mississauga','North York','Scarborough','West Toronto','York']]

In [56]:
kclusters=4

toronto_clusters=KMeans(n_clusters=kclusters, random_state=0).fit(boroughs)

toronto_clusters.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 3, 3, 2, 2, 2, 2, 0, 3, 3, 2, 2, 2, 2, 2, 2,
       3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3], dtype=int32)

In [57]:
borough.insert(0, 'Cluster labels', toronto_clusters.labels_)

In [58]:
borough

Unnamed: 0,Cluster labels,Borough,Latitude,Longitude
0,1,Scarborough,43.806686,-79.194353
1,1,Scarborough,43.784535,-79.160497
2,1,Scarborough,43.763573,-79.188711
3,1,Scarborough,43.770992,-79.216917
4,1,Scarborough,43.773136,-79.239476
...,...,...,...,...
98,3,York,43.706876,-79.518188
99,3,Etobicoke,43.696319,-79.532242
100,3,Etobicoke,43.688905,-79.554724
101,3,Etobicoke,43.739416,-79.588437


In [61]:
toronto_merged=borough
toronto_merged


Unnamed: 0,Cluster labels,Borough,Latitude,Longitude
0,1,Scarborough,43.806686,-79.194353
1,1,Scarborough,43.784535,-79.160497
2,1,Scarborough,43.763573,-79.188711
3,1,Scarborough,43.770992,-79.216917
4,1,Scarborough,43.773136,-79.239476
...,...,...,...,...
98,3,York,43.706876,-79.518188
99,3,Etobicoke,43.696319,-79.532242
100,3,Etobicoke,43.688905,-79.554724
101,3,Etobicoke,43.739416,-79.588437


In [63]:
toronto_map_clusters= folium.Map(location=[latitude, longitude], zoom_start=10)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


markers_colors = []
for lat, lng, borough, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster labels']):
    label = folium.Popup(str(borough) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(toronto_map_clusters)
       
toronto_map_clusters