In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests
from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          82 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.17.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.17.0         | 49 KB     | ##################################### | 100% 
geographiclib-1.49   | 32 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environme

In [2]:
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page = requests.get(wikipedia_link)
page= raw_wikipedia_page.text
soup = BeautifulSoup(page, 'lxml')


table = soup.find("table")
df  = pd.DataFrame( columns = ['Postal Code', 'Borough', 'Neighborhood'])
data = {}

i = 0
for row in table.findAll("tr"):
    cells = row.findAll("td")
    ls= []
    for cell in cells:
        ls.append(cell.find(text=True))
        
    data[i]= ls
    i = i+1

for x in range (1, len(data)):
    if data[x][1] == 'Not assigned':
        continue
    else:  
        if data[x][2].strip() == 'Not assigned':
            data[x][2] = data[x][1]
            
        
        df = df.append(
        {
          'Postal Code': data[x][0],
          'Borough': data[x][1],
          'Neighborhood': data[x][2]
        }, ignore_index=True)  
       

df = df.reset_index(drop=True)
#print (df.head(20))
modified_data = {}
for index, row in df.iterrows():
    if row['Postal Code'] in modified_data.keys():
        modified_data[row['Postal Code']][2] = modified_data[row['Postal Code']][2].replace('\n','') + ',' + row['Neighborhood'].replace('\n','')
    else:    
        modified_data[row['Postal Code']] = [row['Postal Code'], row['Borough'].replace('\n',''), row['Neighborhood'].replace('\n','')]

#print ( modified_data)
dfObj = pd.DataFrame(modified_data).T 
dfObj = dfObj.reset_index(drop=True)
dfObj.columns =  ['Postal Code', 'Borough', 'Neighborhood']
print(dfObj.head(10))

  Postal Code           Borough                     Neighborhood
0         M3A        North York                        Parkwoods
1         M4A        North York                 Victoria Village
2         M5A  Downtown Toronto         Harbourfront,Regent Park
3         M6A        North York  Lawrence Heights,Lawrence Manor
4         M7A      Queen's Park                     Queen's Park
5         M9A         Etobicoke                 Islington Avenue
6         M1B       Scarborough                    Rouge,Malvern
7         M3B        North York                  Don Mills North
8         M4B         East York   Woodbine Gardens,Parkview Hill
9         M5B  Downtown Toronto          Ryerson,Garden District


In [3]:
dfObj.shape

(103, 3)

In [8]:
url = 'http://cocl.us/Geospatial_data'
data = pd.read_csv(url)
df_geocode  = pd.DataFrame( columns = ['Postal Code', 'Borough', 'Neighborhood', 'Latitude',  'Longitude'])
df_geocode = pd.merge(dfObj, data, on='Postal Code')
print(df_geocode.head(2))
df_geocode_grouped=  df_geocode.groupby('Neighborhood').mean().reset_index()
print(df_geocode_grouped.head())

  Postal Code     Borough      Neighborhood   Latitude  Longitude
0         M3A  North York         Parkwoods  43.753259 -79.329656
1         M4A  North York  Victoria Village  43.725882 -79.315572
                                        Neighborhood   Latitude  Longitude
0                             Adelaide,King,Richmond  43.650571 -79.384568
1                                          Agincourt  43.794200 -79.262029
2  Agincourt North,L'Amoreaux East,Milliken,Steel...  43.815252 -79.284577
3  Albion Gardens,Beaumond Heights,Humbergate,Jam...  43.739416 -79.588437
4                              Alderwood,Long Branch  43.602414 -79.543484


In [9]:
address = 'Toronto, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_geocode['Latitude'],df_geocode['Longitude'], df_geocode['Borough'], df_geocode['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [10]:
# set number of clusters
kclusters = 5
print(df_geocode_grouped)
df_geocode_grouped_clustering = df_geocode_grouped.drop('Neighborhood', 1)
print(df_geocode_grouped_clustering.head())
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_geocode_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

                                          Neighborhood   Latitude  Longitude
0                               Adelaide,King,Richmond  43.650571 -79.384568
1                                            Agincourt  43.794200 -79.262029
2    Agincourt North,L'Amoreaux East,Milliken,Steel...  43.815252 -79.284577
3    Albion Gardens,Beaumond Heights,Humbergate,Jam...  43.739416 -79.588437
4                                Alderwood,Long Branch  43.602414 -79.543484
5        Bathurst Manor,Downsview North,Wilson Heights  43.754328 -79.442259
6                                      Bayview Village  43.786947 -79.385975
7                     Bedford Park,Lawrence Manor East  43.733283 -79.419750
8                                          Berczy Park  43.644771 -79.373306
9                           Birch Cliff,Cliffside West  43.692657 -79.264848
10   Bloordale Gardens,Eringate,Markland Wood,Old B...  43.643515 -79.577201
11          Brockton,Exhibition Place,Parkdale Village  43.636847 -79.428191

array([0, 1, 1, 3, 3, 2, 2, 2, 0, 4], dtype=int32)

In [11]:
df_geocode_merged = df_geocode_data

# add clustering labels
df_geocode_merged['Cluster Labels'] = kmeans.labels_


manhattan_merged.head() # check the last columns!

NameError: name 'df_geocode_data' is not defined