# Segmenting and Clustering Neighborhoods in Toronto

In [46]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import bs4 as bs
import urllib.request

print('Libraries imported.')

Libraries imported.


# Part 1

### Scrap Toronto neighborhood data from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [47]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'


source = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()
soup = bs.BeautifulSoup(source,'html.parser')
table = soup.find('table',{'class':'wikitable sortable'})


### Extract column headers

In [48]:
headers=table.findAll('th')
for i, head in enumerate(headers): 
    headers[i]=str(headers[i]).replace("<th>","").replace("</th>","").replace('\n', "")
print(headers)

['Postal code', 'Borough', 'Neighborhood']


### Extract data rows

In [49]:
rows=table.findAll('tr')
rows=rows[1:]
l = []
for tr in rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        l.append(row)

### Assign rows to dataframe

In [50]:
df=pd.DataFrame(l, columns = headers)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Ignore cells with a borough that is "Not assigned"

In [51]:
df = df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### Combine rows of same the PostalCode and Borough into one row with the neighborhoods separated with a comma.

In [52]:
df = df.groupby(['Postal code', 'Borough']).agg(', '.join)
df = df.reset_index()
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### if a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.

In [53]:
df.loc[df['Neighborhood']=='Not assigned', ['Neighborhood']] = 'Queen\'s Park'
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [54]:
df.shape

(103, 3)

# Part 2

### Get the longitude and latitude of the neighborhoods

In [58]:
df_geo= pd.read_csv("http://cocl.us/Geospatial_data")
df_geo.rename(columns={'Postal Code':'Postal code'}, inplace=True)
toronto_data=pd.merge(df, df_geo)
toronto_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3

### Define Foursquare Credentials and Version

In [60]:
CLIENT_ID = '135TWBFTCYNYOUUIHSHAD0KVAAVBCNQWGL0VMRVZOFGZ1DB5' # your Foursquare ID
CLIENT_SECRET = 'QQAA5TBAVT3FEOY1ELHKPSCIKBB5YO123FIJWLOANYPGZUM3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 135TWBFTCYNYOUUIHSHAD0KVAAVBCNQWGL0VMRVZOFGZ1DB5
CLIENT_SECRET:QQAA5TBAVT3FEOY1ELHKPSCIKBB5YO123FIJWLOANYPGZUM3


### Let's explore the first Borough in our dataframe.

In [61]:
toronto_data.loc[0, 'Borough']

'Scarborough'

### Get the neighborhood's latitude and longitude values.

In [62]:
borough_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
borough_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

borough_name = toronto_data.loc[0, 'Borough'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(borough_name, 
                                                               borough_latitude, 
                                                               borough_longitude))

Latitude and longitude values of Scarborough are 43.806686299999996, -79.19435340000001.


### Now, let's get the top 50 venues that are in Marble Hill within a radius of 1000 meters.

In [74]:
LIMIT = 50
radius = 1000

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    borough_latitude, 
    borough_longitude, 
    radius, 
    LIMIT)

results = requests.get(url).json()

In [75]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [78]:
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Harvey's,Restaurant,43.80002,-79.198307
1,Wendy's,Fast Food Restaurant,43.802008,-79.19808
2,Wendy’s,Fast Food Restaurant,43.807448,-79.199056
3,RBC Royal Bank,Bank,43.798782,-79.19709
4,Caribbean Wave,Caribbean Restaurant,43.798558,-79.195777


In [79]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

18 venues were returned by Foursquare.
