## Capstone Assignment 03

1. Combine the Toronto boroughs with coordinate data
2. Choose East Toronto to analyze the venues distribution and clustering with K-Means 

In [1]:
# Request Data from FourSquare
import requests
import json
from bs4 import BeautifulSoup


# Data Analysis Module
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

# Plot Module 
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
import folium

# Scikit Learn Module
from sklearn.cluster import KMeans

In [2]:
# Read the coordinates from the csv file
Toronto_data = pd.read_csv("CapstoneData_Toronto.csv",index_col=0)
coords_data = pd.read_csv("Geospatial_Coordinates.csv",index_col=None)

In [3]:
Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [36]:
# Merge with Toronto data with each coordinate
Toronto_coords = pd.merge(Toronto_data,coords_data,on='Postal Code')
Toronto_coords.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [5]:
# Another method to require the coordinates
import geocoder

In [6]:
def getlat_lng(postal_code):
    lat_lng_coords = None
    while (lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [11]:
# Get the coordinate of "Toronto"
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent='ny_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {},{}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963,-79.387207.


In [15]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(Toronto_coords['Latitude'],Toronto_coords['Longitude'],Toronto_coords['Borough'],Toronto_coords['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
        
map_toronto

In [14]:
# Select East Toronto to process below analysis
east_toronto_data = Toronto_coords[Toronto_coords['Borough']=='East Toronto'].reset_index(drop=True)

Address1 = 'East Toronto'

geolocator = Nominatim(user_agent='ny_explorer')
location1 = geolocator.geocode(Address1)
latitude1 = location1.latitude
longitude1 = location1.longitude
print('The geograpical coordinate of East Toronto are {},{}.'.format(latitude1, longitude1))

The geograpical coordinate of East Toronto are 43.653963,-79.387207.


In [34]:
east_toronto_data['Neighborhood']

0                                          The Beaches
1                          The Danforth West,Riverdale
2                        The Beaches West,India Bazaar
3                                      Studio District
4    Business Reply Mail Processing Centre 969 Eastern
Name: Neighborhood, dtype: object

In [33]:
map_east_toronto = folium.Map(location=[latitude1, longitude1], zoom_start=13)

#Add Marker to map
for lat, lng, label in zip(east_toronto_data['Latitude'],east_toronto_data['Longitude'],east_toronto_data['Neighborhood']):    
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat,lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color = '#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_east_toronto)
map_east_toronto

In [18]:
neighborhood_latitude = east_toronto_data.loc[0,'Latitude']
neighborhood_longitude = east_toronto_data.loc[0,'Longitude']
neighborhood_name=east_toronto_data.loc[0,'Neighborhood']
print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, neighborhood_latitude,neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [22]:
clientid = "5PM542SKNZ2I5CILMMSP4OIUNFFWHYRYUM3VAV45HEJCMX1V"
clientsecret = "POQVL03MINSUUFQNO4MA40YXB4HZ5U2YW5SYOEK1S1KZOZON"
version = '20190520'
radius = 500
limit = 100
url = "https://api.foursquare.com/v2/venues/explore?&client_id={}&\
client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(clientid,clientsecret,version,latitude1,longitude1,radius,limit)

In [23]:
response = requests.get(url).json()

In [24]:
venues = response['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

In [25]:
filter_columns=['venue.name','venue.categories','venue.location.lat','venue.location.lng']
nearby_venues=nearby_venues.loc[:,filter_columns]

In [26]:
def get_category(row):
    try:
        category_list = row['categories']
    except:
        category_list = row['venue.categories']
    
    if len(category_list) == 0:
        return None
    else:
        return category_list[0]['name']

In [27]:
nearby_venues['venue.categories'] = nearby_venues.apply(get_category, axis=1)

In [28]:
nearby_venues.columns = [col.split('.')[-1] for col in nearby_venues.columns]

In [29]:
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Textile Museum of Canada,Art Museum,43.654396,-79.3865
2,Sansotei Ramen 三草亭,Ramen Restaurant,43.655157,-79.386501
3,Cafe Plenty,Café,43.654571,-79.38945
4,Tsujiri,Tea Room,43.655374,-79.385354


In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=500,limit=50):
    venues_list=[]
    for name, lat, lng in zip(names,latitudes,longitudes):
        print(name)
        url = "https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(clientid,clientsecret,version,lat,lng,radius,limit)
        response = requests.get(url).json()['response']['groups'][0]['items']
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in response])        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    
    nearby_venues.columns = ['Neighborhood',
                            'Neighborhood Latitude',
                            'Neighborhood Longitude',
                            'Venue',
                            'Venue Latitude',
                            'Venue Longitude',
                            'Venue Category']
    return(nearby_venues)

In [31]:
east_toronto_data['Neighborhood']

0                                          The Beaches
1                          The Danforth West,Riverdale
2                        The Beaches West,India Bazaar
3                                      Studio District
4    Business Reply Mail Processing Centre 969 Eastern
Name: Neighborhood, dtype: object

In [37]:
east_toronto_venues = getNearbyVenues(names= east_toronto_data['Neighborhood'],
                                      latitudes=east_toronto_data['Latitude'],
                                      longitudes=east_toronto_data['Longitude'])

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Business Reply Mail Processing Centre 969 Eastern


In [38]:
east_toronto_onehot = pd.get_dummies(east_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [39]:
east_toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [40]:
map_venues = folium.Map(location=[latitude, longitude], zoom_start=11)

markers_colors = []
for lat, lng, name, category in zip(east_toronto_venues['Venue Latitude'],east_toronto_venues['Venue Longitude'],east_toronto_venues['Venue'],east_toronto_venues['Venue Category']):
    label = folium.Popup(str(category) + ' : '+ str(name), parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius = 5,
        popup = label,
        color = 'green',
        fill = True,
        fill_color = 'red',
        fill_opacity=0.7).add_to(map_venues)
    
map_venues

In [41]:
east_toronto_venues.groupby(['Venue Category']).count()['Venue'].head()

Venue Category
American Restaurant    3
Auto Workshop          1
Bakery                 3
Bank                   1
Bar                    1
Name: Venue, dtype: int64

In [42]:
east_toronto_onehot['Neighborhood'] = east_toronto_venues['Neighborhood']

In [43]:
fixed_columnorder = [east_toronto_onehot.columns[-1]] + list(east_toronto_onehot.columns[:-1])

In [47]:
# grouped data based on the categories of venues
insertcolumn = east_toronto_onehot['Neighborhood']
east_toronto_onehot = east_toronto_onehot.drop('Neighborhood',axis=1)
east_toronto_onehot.insert(0,'Neighborhood',insertcolumn)
east_toronto_venues_grouped = east_toronto_onehot.groupby('Neighborhood').mean().reset_index()
east_toronto_venues_grouped.head()

Unnamed: 0,Neighborhood,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Brewery,Bubble Tea Shop,...,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Spa,Stationery Store,Steakhouse,Sushi Restaurant,Trail,Yoga Studio
0,Business Reply Mail Processing Centre 969 Eastern,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.066667,0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,0.066667
1,Studio District,0.051282,0.0,0.051282,0.025641,0.025641,0.0,0.025641,0.025641,0.0,...,0.0,0.025641,0.025641,0.025641,0.0,0.025641,0.0,0.0,0.0,0.025641
2,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.05,0.0,...,0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0
4,"The Danforth West,Riverdale",0.02381,0.0,0.02381,0.0,0.0,0.0,0.02381,0.02381,0.02381,...,0.02381,0.0,0.0,0.0,0.02381,0.0,0.0,0.0,0.02381,0.02381


In [48]:
# Function for finding most common venues
def find_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [49]:
# Top 3 venues 
num_top_venues =3
indicators = ['st','nd','rd']
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}{} Most Common Venue'.format(ind+1))

        
neighborhood_venue_sort = pd.DataFrame(columns = columns)
neighborhood_venue_sort['Neighborhood'] = east_toronto_venues_grouped['Neighborhood']

In [50]:
for ind in np.arange(east_toronto_venues_grouped.shape[0]):
    neighborhood_venue_sort.iloc[ind,1:] = find_most_common_venues(east_toronto_venues_grouped.iloc[ind,:],num_top_venues)

In [51]:
neighborhood_venue_sort.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Pizza Place,Auto Workshop
1,Studio District,Café,Coffee Shop,American Restaurant
2,The Beaches,Health Food Store,Park,Pub
3,"The Beaches West,India Bazaar",Park,Pizza Place,Ice Cream Shop
4,"The Danforth West,Riverdale",Greek Restaurant,Coffee Shop,Italian Restaurant


## Use 5 clusters to do K-Means analysis

In [58]:
kclusters = 5
east_toronto_cluster = east_toronto_venues_grouped.drop('Neighborhood',1)

In [59]:
kmeans = KMeans(n_clusters=kclusters, random_state= 0).fit(east_toronto_cluster)
kmeans.labels_[:]

array([3, 4, 1, 2, 0])

In [60]:
east_cluster = east_toronto_data
east_cluster['Cluster Labels'] = kmeans.labels_

In [61]:
east_cluster = east_cluster.join(neighborhood_venue_sort.set_index('Neighborhood'), on='Neighborhood')

In [62]:
east_cluster

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Health Food Store,Park,Pub
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,4,Greek Restaurant,Coffee Shop,Italian Restaurant
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,1,Park,Pizza Place,Ice Cream Shop
3,M4M,East Toronto,Studio District,43.659526,-79.340923,2,Café,Coffee Shop,American Restaurant
4,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0,Yoga Studio,Pizza Place,Auto Workshop


## Visualize the clusters

In [63]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys=[i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lng, poi, cluster in zip(east_cluster['Latitude'],east_cluster['Longitude'],east_cluster['Neighborhood'],east_cluster['Cluster Labels']):
    label = folium.Popup(str(poi) + 'Cluster'+ str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius = 5,
        popup = label,
        color = rainbow[cluster-1],
        fill = True,
        fill_color = rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters