# Capstone Project - Segmenting and Clustering Neighborhoods in Toronto


### Required imports

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# json_normalize transforms a JSON structure into a pandas dataframe.
# It is exposed at the top level of pandas since pandas 1.0; the older
# pandas.io.json location is deprecated and is not importable on recent pandas
# releases, so the new location is tried first and the old one kept as a fallback.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Converting the data from the Wikipedia page to a CSV file

In [3]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the cell on an HTTP error instead of letting
# read_html try to parse an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()

# read_html returns a list with one DataFrame per table found in the page
tables = pd.read_html(response.text)

# write the first table of the page (the postal code table) into `Canada_data.csv`.
# The DataFrame index is written as well, so it shows up as an 'Unnamed: 0'
# column when the file is read back.
tables[0].to_csv("Canada_data.csv")


In [4]:
# Read back the table that was saved to Canada_data.csv
df = pd.read_csv("Canada_data.csv")
# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,Postal Code,Borough,Neighborhood
0,0,M1A,Not assigned,
1,1,M2A,Not assigned,
2,2,M3A,North York,Parkwoods
3,3,M4A,North York,Victoria Village
4,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Cleaning

In [5]:
# 'Unnamed: 0' is the old index that was written into the CSV; it carries no information.
# errors='ignore' keeps this cell safe to re-run after the column is already gone
# (a plain drop would raise KeyError on the second run).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
# Keep only the rows whose borough is assigned, then renumber the rows from 0
df = (
    df.loc[df['Borough'] != 'Not assigned']
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Total number of rows in the dataframe

In [7]:
# (number of rows, number of columns) of the cleaned dataframe
df.shape

(103, 3)

# Adding Geospatial data to the data frame

In [8]:
# Load the table that maps each postal code to a latitude and longitude
df_geodata = pd.read_csv('http://cocl.us/Geospatial_data')
# Preview the first rows
df_geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Attach the coordinates to each postal code; only codes present in both tables are kept
geospatial_data = pd.merge(df, df_geodata, how='inner', on='Postal Code')
geospatial_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [10]:
# (number of rows, number of columns) after merging in the coordinates
geospatial_data.shape

(103, 5)

In [20]:
# Keep the rows whose borough name contains "Toronto" and renumber them from 0
Toronto_data = (
    geospatial_data
    .loc[geospatial_data['Borough'].str.contains("Toronto")]
    .reset_index(drop=True)
)
Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


#### Defining Foursquare Credentials

In [26]:
import os
from getpass import getpass

# Foursquare credentials must not be stored in the notebook. They are read from
# the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables,
# falling back to an interactive prompt that does not echo the typed value.
# NOTE: the key pair that used to be hardcoded here has been published and
# should be regenerated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare CLIENT_ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare CLIENT_SECRET: ')
VERSION = '20180605' # value sent as the `v` parameter of the API request

# Confirm that the credentials are set without writing them into the cell output
print('Credentials loaded (values not displayed).')

Credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


## Exploring the first neighborhood in the dataframe

In [23]:
# Neighborhood name stored in the first row of Toronto_data
Toronto_data.loc[0, 'Neighborhood']

'Regent Park, Harbourfront'

In [24]:
# Coordinates and name of the neighborhood in the first row of Toronto_data
neighborhood_latitude = Toronto_data.at[0, 'Latitude']
neighborhood_longitude = Toronto_data.at[0, 'Longitude']
neighborhood_names = Toronto_data.at[0, 'Neighborhood']

print(
    'Latitude and longitude values of {} are {}, {}.'.format(
        neighborhood_names, neighborhood_latitude, neighborhood_longitude
    )
)

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


In [40]:
LIMIT = 100 # value sent as the `limit` parameter of the request
radius = 500 # value sent as the `radius` parameter (presumably meters - confirm against the API docs)

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)
# The URL embeds CLIENT_ID and CLIENT_SECRET, so it is deliberately not displayed.

# Query the venues around the neighborhood. The timeout avoids hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors (bad
# credentials, quota exceeded, ...) here rather than as a confusing KeyError
# when the response body is indexed later.
explore_response = requests.get(url, timeout=30)
explore_response.raise_for_status()
results = explore_response.json()

In [28]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a pandas Series)
        Must provide the list of categories under the key 'categories' or,
        when that key is missing, under 'venue.categories'. Each category is
        expected to be a mapping with a 'name' entry.

    Returns
    -------
    The 'name' of the first category, or None when the category list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [39]:
# items of the first group in the Foursquare response
venues = results['response']['groups'][0]['items']

# flatten the nested JSON and keep only the columns of interest
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# replace each row's list of categories with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the last dotted component of each column name ('venue.location.lat' -> 'lat')
nearby_venues.columns = [label.rsplit('.', 1)[-1] for label in nearby_venues.columns]

print('{} venues were returned by Foursquare.'.format(len(nearby_venues)))

nearby_venues.head()

48 venues were returned by Foursquare.


Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149
3,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
4,Body Blitz Spa East,Spa,43.654735,-79.359874


In [38]:
# Center the map on the neighborhood whose venues were fetched.
# (`latitude` / `longitude` are never defined in this notebook; the coordinates
# of the explored neighborhood live in neighborhood_latitude / neighborhood_longitude.)
map_nearby_venues = folium.Map(location=[neighborhood_latitude, neighborhood_longitude], zoom_start=11)

# add markers to map: one circle per venue, with its category as the popup text
for category, lat, lng in zip(nearby_venues['categories'], nearby_venues['lat'], nearby_venues['lng']):
    label = folium.Popup(category, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_nearby_venues)

map_nearby_venues