## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
print('Libraries imported.')

Libraries imported.


In [2]:
#get the html data from the wiki link

data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(data, 'html.parser')

In [3]:
# Define the arrays for the colums and fill it with the data extracted from html
postalCodeList = []
boroughList = []
neighborhoodList = []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')    
 
    if(len(cells) > 0):
        for cell in cells:
            postalCodeList.append(cell.find('b').text.strip())
            if cell.find('i'):
                boroughList.append(cell.find('i').text.strip())
                neighborhoodList.append('Not assigned')
            else:
                text_str = cell.find('span').text    
                if text_str.find('(') != -1:
                    separatorindex = text_str.index('(')
                    boroughList.append(cell.find('span').text[:separatorindex].replace('(','').replace(')','').strip())
                    neighborhoodList.append(cell.find('span').text[separatorindex:].replace('(','').replace(')','').strip())
                else:
                    boroughList.append(text_str.strip())
                    neighborhoodList.append('Not assigned')
                


In [4]:
# assign the array data to dictionary and then convert it to data frame

toronto_neighorhood = [('PostalCode', postalCodeList),
                      ('Borough', boroughList),
                      ('Neighborhood', neighborhoodList)]
toronto_df = pd.DataFrame.from_dict(dict(toronto_neighorhood))
toronto_df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Queen's Park / Ontario Provincial Government,Not assigned
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


In [5]:
#remove the non assigned row

toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Queen's Park / Ontario Provincial Government,Not assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge
11,M3B,North York,Don MillsNorth
12,M4B,East York,Parkview Hill / Woodbine Gardens
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
#replace the / with , 
toronto_df = toronto_df.replace('/',',', regex=True)
toronto_df = toronto_df.reset_index(drop=True)
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,"Queen's Park , Ontario Provincial Government",Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don MillsNorth
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
#shape of the dataframe
print(toronto_df.shape)

(103, 3)


# Part 1

## Geospatial data


In [8]:

url2="http://cocl.us/Geospatial_data"
geo_data=pd.read_csv(url2)
geo_data.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [9]:
print(list(toronto_df))
print(list(geo_data))

full_table = toronto_df.set_index('PostalCode').join(geo_data.set_index('Postal Code'))
full_table = full_table.sample(frac=1).reset_index(drop=True)
full_table.head(20)

['PostalCode', 'Borough', 'Neighborhood']
['Postal Code', 'Latitude', 'Longitude']


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
1,Downtown Toronto,Berczy Park,43.644771,-79.373306
2,North York,"Northwood Park , York University",43.76798,-79.487262
3,North York,"Bedford Park , Lawrence Manor East",43.733283,-79.41975
4,Scarborough,Agincourt,43.7942,-79.262029
5,EtobicokeNorthwest,"Clairville , Humberwood , Woodbine Downs , Wes...",43.706748,-79.594054
6,North York,Glencairn,43.709577,-79.445073
7,Central Toronto,Forest Hill North & West,43.696948,-79.411307
8,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam...",43.739416,-79.588437
9,East YorkEast Toronto,The Danforth East,43.685347,-79.338106


# Part 2

## Creating a Map of Geo location findings

In [10]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

## finding toronto coordinates

In [11]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.653963, -79.387207.


 ## Explore and cluster the neighborhoods in Toronto

In [12]:

CLIENT_ID = '2MY4LJEFD4NRY2J21OJNWS20AZU4FNF0IOXXYR5ACBZXVQ5L' # your Foursquare ID
CLIENT_SECRET = 'TYQ2P0LOWAAKGGOE4R4EZOHWNMRE0J4GZTLJEPJDO4IEMKJJ' # your Foursquare Secret

VERSION = '20180605' # Foursquare API version
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 2MY4LJEFD4NRY2J21OJNWS20AZU4FNF0IOXXYR5ACBZXVQ5L
CLIENT_SECRET:TYQ2P0LOWAAKGGOE4R4EZOHWNMRE0J4GZTLJEPJDO4IEMKJJ


In [13]:
full_table.loc[0, 'Neighborhood']

'Church and Wellesley'

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:

toronto_venues = getNearbyVenues(names=full_table['Neighborhood'],
                                   latitudes=full_table['Latitude'],
                                   longitudes=full_table['Longitude']
                                  )
print(toronto_venues.shape)
toronto_venues.head()

Church and Wellesley
Berczy Park
Northwood Park , York University
Bedford Park , Lawrence Manor East
Agincourt
Clairville , Humberwood , Woodbine Downs , West Humber , Kipling Heights , Rexdale , Elms , Tandridge , Old Rexdale
Glencairn
Forest Hill North & West
South Steeles , Silverstone , Humbergate , Jamestown , Mount Olive , Beaumond Heights , Thistletown , Albion Gardens
The Danforth  East
Not assigned
Mimico NW , The Queensway West , South of Bloor , Kingsway Park South West , Royal York South West
Upper Rouge
Runnymede , Swansea
Humber Summit
Bathurst Manor , Wilson Heights , Downsview North
Kennedy Park , Ionview , East Birchmount Park
WillowdaleWest
New Toronto , Mimico South , Humber Bay Shores
Del Ray , Mount Dennis , Keelsdale and Silverthorn
Regent Park , Harbourfront
Steeles West , L'Amoreaux West
Brockton , Parkdale Village , Exhibition Place
Milliken , Agincourt North , Steeles East , L'Amoreaux East
Willowdale , Newtonbrook
Clarks Corners , Tam O'Shanter , Sullivan
Law

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Church and Wellesley,43.66586,-79.38316,Storm Crow Manor,43.66684,-79.381593,Theme Restaurant
1,Church and Wellesley,43.66586,-79.38316,DanceLifeX Centre,43.666956,-79.385297,Dance Studio
2,Church and Wellesley,43.66586,-79.38316,The Alley,43.665922,-79.385567,Bubble Tea Shop
3,Church and Wellesley,43.66586,-79.38316,Sansotei Ramen 三草亭,43.666735,-79.385353,Ramen Restaurant
4,Church and Wellesley,43.66586,-79.38316,Smith,43.666927,-79.381421,Breakfast Spot


## Drawing a Map using found co-ordinates

In [16]:
map_geo = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(full_table['Latitude'], full_table['Longitude'], full_table['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_geo)  
    
map_geo