## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
print('Libraries imported.')

Libraries imported.


In [2]:
#get the html data from the wiki link

data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(data, 'html.parser')

In [3]:
# Define the arrays for the colums and fill it with the data extracted from html
postalCodeList = []
boroughList = []
neighborhoodList = []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')    
 
    if(len(cells) > 0):
        for cell in cells:
            postalCodeList.append(cell.find('b').text.strip())
            if cell.find('i'):
                boroughList.append(cell.find('i').text.strip())
                neighborhoodList.append('Not assigned')
            else:
                text_str = cell.find('span').text    
                if text_str.find('(') != -1:
                    separatorindex = text_str.index('(')
                    boroughList.append(cell.find('span').text[:separatorindex].replace('(','').replace(')','').strip())
                    neighborhoodList.append(cell.find('span').text[separatorindex:].replace('(','').replace(')','').strip())
                else:
                    boroughList.append(text_str.strip())
                    neighborhoodList.append('Not assigned')
                
            
            
print(len(boroughList))
print(len(postalCodeList))
print(len(neighborhoodList))

180
180
180


In [4]:
# assign the array data to dictionary and then convert it to data frame

toronto_neighorhood = [('PostalCode', postalCodeList),
                      ('Borough', boroughList),
                      ('Neighborhood', neighborhoodList)]
toronto_df = pd.DataFrame.from_dict(dict(toronto_neighorhood))
toronto_df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Queen's Park / Ontario Provincial Government,Not assigned
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


In [5]:
#remove the non assigned row

toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Queen's Park / Ontario Provincial Government,Not assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge
11,M3B,North York,Don MillsNorth
12,M4B,East York,Parkview Hill / Woodbine Gardens
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
#replace the / with , 
toronto_df = toronto_df.replace('/',',', regex=True)
toronto_df = toronto_df.reset_index(drop=True)
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,"Queen's Park , Ontario Provincial Government",Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don MillsNorth
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
#shape of the dataframe
print(toronto_df.shape)

(103, 3)


# Part 1

## Geospatial data


In [10]:

url2="http://cocl.us/Geospatial_data"
geo_data=pd.read_csv(url2)
geo_data.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [13]:
print(list(toronto_df))
print(list(geo_data))

full_table = toronto_df.set_index('PostalCode').join(geo_data.set_index('Postal Code'))
full_table = full_table.sample(frac=1).reset_index(drop=True)
full_table.head(20)

['PostalCode', 'Borough', 'Neighborhood']
['Postal Code', 'Latitude', 'Longitude']


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,York,"Del Ray , Mount Dennis , Keelsdale and Silvert...",43.691116,-79.476013
1,Scarborough,"Wexford , Maryvale",43.750072,-79.295849
2,North York,DownsviewWest,43.739015,-79.506944
3,North York,"Bathurst Manor , Wilson Heights , Downsview North",43.754328,-79.442259
4,Central Toronto,"The Annex , North Midtown , Yorkville",43.67271,-79.405678
5,"Queen's Park , Ontario Provincial Government",Not assigned,43.662301,-79.389494
6,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509
7,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
8,North York,Don MillsSouthFlemingdon Park,43.7259,-79.340923
9,North York,"Bedford Park , Lawrence Manor East",43.733283,-79.41975


# Part 2

## Creating a Map of Geo location findings

In [9]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.1.1e             |       h516909a_0         2.1 MB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                       

In [14]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [17]:
map_geo = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(full_table['Latitude'], full_table['Longitude'], full_table['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_geo)  
    
map_geo