##                Project Capstone - Torronto Neighborhood
##### Import all the libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


##### Data Acquistion

In [2]:
#importing the data by scrubbing the wiki page of canada postal codes
df_Canada  = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df_Torronto = pd.DataFrame(df_Canada[0])
df_Torronto.shape

(287, 3)

##### Data Wrangling

In [3]:
#Data wrangling

#remove unassigned boroughs
indexnamesTodrop = df_Torronto[ df_Torronto['Borough'] == 'Not assigned' ].index
df_Torronto.drop( indexnamesTodrop, inplace=True)
print(df_Torronto.shape)

(210, 3)


In [4]:
#mark the borough name for the unassigned neighborhood
df_Torronto.loc[df_Torronto['Neighbourhood'] == ('Not assigned'), 'Neighbourhood'] = df_Torronto['Borough']
print(df_Torronto[df_Torronto['Neighbourhood'] == 'Not assigned'])

Empty DataFrame
Columns: [Postcode, Borough, Neighbourhood]
Index: []


In [5]:
#merge the rows which has same postal code
df_Torronto = df_Torronto.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df_Torronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
#get the final shape of the dataframe
df_Torronto.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [7]:
#get the final record count in the dataframe
df_Torronto.shape

(103, 3)

### Get the geo co-ordinates

In [8]:
import geocoder # import geocoder

In [9]:
%%time
g = geocoder.google('M1B, Toronto, Ontario')
print('M4Y', g.latlng)

#from google lat and longitude is not returning any, so reading from csv file

M4Y None
CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 65.9 ms


In [10]:
df_Coordinates = pd.read_csv("http://cocl.us/Geospatial_data")
df_Coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
#embed the lat and longitude to the Torronto postal code
df_TorrontoFinal = df_Torronto.merge(df_Coordinates,how ='left',left_on='Postcode',right_on='Postal Code')
df_TorrontoFinal.drop('Postal Code',axis=1,inplace=True)
df_TorrontoFinal.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [12]:
#get the lat and long for Torronto(whole city)
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
TR_latitude = location.latitude
TR_longitude = location.longitude
print('The geograpical coordinate of Torronto are {}, {}.'.format(TR_latitude, TR_longitude))


The geograpical coordinate of Torronto are 43.653963, -79.387207.


In [13]:
# create map of New York using latitude and longitude values
map_Torronto = folium.Map(location=(TR_latitude, TR_longitude), zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_TorrontoFinal['Latitude'], df_TorrontoFinal['Longitude'], df_TorrontoFinal['Borough'], df_TorrontoFinal['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Torronto)  
    
map_Torronto

### Segementation of Neighborhood

In [14]:
#lets explore the neighborhood which starts with Torronto
df_TR_Neigh = (df_TorrontoFinal[df_TorrontoFinal['Borough'].str.contains('East Tor')]).reset_index(drop=True)
df_TR_Neigh.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558


In [15]:
#with foursquare lets explore more!!!!!

#credentials
CLIENT_ID = 'FXAKR2VKXED2PPGI2ZPFMEJD5LAXQ3M4KI01THJWPMWAQHDC' # your Foursquare ID
CLIENT_SECRET = 'EJR3IXSZBX2KX0TAP1FYJBZLLPEXPCWV2WEW5XL4ZPM31JD3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT=100


In [16]:
#function definition... lets loop through all boroughs and explore the venues surrounding it
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [17]:
#Get all the nearby venues by calling the function that we expored using foursquare
Torronto_venues = getNearbyVenues( names=df_TR_Neigh['Neighbourhood'],
                                   latitudes=df_TR_Neigh['Latitude'],
                                   longitudes=df_TR_Neigh['Longitude']
                                 )


In [52]:
Torronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
3,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


In [23]:
Torronto_grouped = Torronto_venues.groupby('Venue Category').count()
Torronto_grouped.head()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
American Restaurant,3,3,3,3,3,3
Auto Workshop,1,1,1,1,1,1
Bakery,3,3,3,3,3,3
Bank,1,1,1,1,1,1
Bar,1,1,1,1,1,1


##### one hot coding to cluster the neighborhood

In [42]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Torronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Torronto_venues['Neighborhood'] 

# move neighborhood column to the first column
#print([Toronto_onehot.columns[0]])
#list(Toronto_onehot.columns[:-1])
fixed_columns = [Toronto_onehot.columns[48]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Bookstore,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Butcher,Café,Caribbean Restaurant,Cheese Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Food,Food & Drink Shop,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden Center,Gastropub,Gay Bar,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Health Food Store,Ice Cream Shop,Italian Restaurant,Juice Bar,Latin American Restaurant,Light Rail Station,Liquor Store,Middle Eastern Restaurant,Movie Theater,Neighborhood.1,Other Great Outdoors,Park,Pet Store,Pizza Place,Pub,Recording Studio,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Sports Bar,Stationery Store,Steakhouse,Sushi Restaurant,Thai Restaurant,Trail,Wine Bar
0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
# set number of clusters
kclusters = 5

Cluster_TorontoNeighbor = Toronto_onehot.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Cluster_TorontoNeighbor)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 4, 2, 0, 0, 2], dtype=int32)

In [57]:
# add clustering labels
Torronto_venues.insert(0, 'Cluster Labels', kmeans.labels_)
Torronto_venues_Cluster = Torronto_venues
Torronto_venues_Cluster.head(15) # check the last columns!

Unnamed: 0,Cluster Labels,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,0,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
3,0,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
4,0,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
5,4,The Beaches,43.676357,-79.293031,Dip 'n Sip,43.678897,-79.297745,Coffee Shop
6,2,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
7,0,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop
8,0,"The Danforth West, Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
9,2,"The Danforth West, Riverdale",43.679557,-79.352188,Mezes,43.677962,-79.350196,Greek Restaurant


##### analyse each cluster label

In [63]:
Torronto_venues_Cluster.loc[Torronto_venues_Cluster['Cluster Labels'] == 0, Torronto_venues_Cluster.columns[[1] + list(range(5, Torronto_venues_Cluster.shape[1]))]]

Unnamed: 0,Neighborhood,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676821,-79.293942,Trail
1,The Beaches,43.678879,-79.297734,Health Food Store
2,The Beaches,43.6763,-79.294784,Other Great Outdoors
3,The Beaches,43.679181,-79.297215,Pub
4,The Beaches,43.680563,-79.292869,Neighborhood
7,"The Danforth West, Riverdale",43.67782,-79.351265,Cosmetics Shop
8,"The Danforth West, Riverdale",43.677743,-79.350115,Italian Restaurant
12,"The Danforth West, Riverdale",43.677622,-79.352116,Yoga Studio
13,"The Danforth West, Riverdale",43.677663,-79.351313,Brewery
15,"The Danforth West, Riverdale",43.677062,-79.353934,Italian Restaurant


In [62]:
print(list(range(5, Torronto_venues_Cluster.shape[1])))
print(Torronto_venues_Cluster.shape[1])
print(Torronto_venues_Cluster.columns[1])

[5, 6, 7]
8
Neighborhood
