# Scope of this Notebook

Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

Just make sure:
1. to add enough Markdown cells to explain what you decided to do and to report any observations you make.
2. to generate maps to visualize your neighborhoods and how they cluster together.

Once you are happy with your analysis, submit a link to the new Notebook on your Github repository. __(3 marks)__

In [68]:
# @hidden_cell
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim #convert an address into latitude and longitude values
!conda install -c conda-forge folium=0.5.0 --yes
import folium #map rendering library
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from pandas.io.json import json_normalize
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.17.0                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge


# Map displaying Toronto's Neighborhoods based on Postal Code

In [69]:
# @hidden_cell
# Build df_toronto (one row per postal code with borough, neighborhoods and
# coordinates) and geocode the city centre used to position the overview map.

# --- 1. Scrape the postal-code table from Wikipedia -------------------------
url_wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url_wiki, timeout=30)
page.raise_for_status()  # fail here rather than parsing an error page
bs = BeautifulSoup(page.content, "lxml")
table_body = bs.find('tbody')  # first <tbody> in the page
if table_body is None:
    raise ValueError('No <tbody> element found at {}'.format(url_wiki))
rows = table_body.find_all('tr')

col_names = ["PostalCode", "Borough", "Neighborhood"]
col_values = []
for row in rows:
    # Header rows contain <th> cells only, so they produce no <td> values.
    col = [cell.text.strip() for cell in row.find_all('td')][:len(col_names)]
    if len(col) < len(col_names):
        continue
    if col[1] == 'Not assigned':
        continue  # postal codes without a borough are dropped
    if col[2] == 'Not assigned':
        col[2] = col[1]  # unnamed neighborhood inherits the borough name
    col_values.append(col)

# --- 2. One row per postal code, neighborhoods joined with ", " -------------
df_toronto = pd.DataFrame(col_values, columns=col_names)
df_toronto = (
    df_toronto
    .groupby(["PostalCode", "Borough"])["Neighborhood"]
    .apply(', '.join)
    .reset_index()
    .sort_values(by="PostalCode")
)

# --- 3. Attach the coordinate columns from the geospatial CSV ---------------
url_geodata = 'http://cocl.us/Geospatial_data'
df_geodata = pd.read_csv(url_geodata).set_index("Postal Code")
df_toronto = df_toronto.join(df_geodata, on="PostalCode")

# --- 4. Geocode the city for the map centre ---------------------------------
address = 'Toronto, CA'
# Nominatim's usage policy requires an identifying user agent; recent geopy
# releases refuse to run with the default one.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
#print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))




In [70]:
# Overview map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle per df_toronto row; the popup names the neighborhood(s) and borough.
marker_fields = zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Borough'],
    df_toronto['Neighborhood'],
)
for point_lat, point_lng, borough_name, hood_name in marker_fields:
    popup_text = 'Neighborhood - {}, Borough - {}'.format(hood_name, borough_name)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto

In [71]:
# @hidden_cell
# Foursquare credentials are read from the environment so that they never end
# up in the notebook file or its history. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and regenerated in the Foursquare developer console.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn instead of raising so the cells that do not call Foursquare still run.
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

### Let's explore the Ryerson, Garden District neighborhood in the Downtown Toronto borough

In [72]:
# @hidden_cell
# Look up the neighborhood under study and ask the Foursquare "explore"
# endpoint for venues around its coordinates.
downtown_toronto_data = df_toronto[df_toronto.Neighborhood == 'Ryerson, Garden District'].reset_index(drop=True)
if downtown_toronto_data.empty:
    # Without this guard the .loc[0, ...] lookups below fail with a bare KeyError.
    raise ValueError("Neighborhood 'Ryerson, Garden District' was not found in df_toronto")

neighborhood_latitude = downtown_toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = downtown_toronto_data.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = downtown_toronto_data.loc[0, 'Neighborhood'] # neighborhood name
print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name,
                                                              neighborhood_latitude,
                                                              neighborhood_longitude))

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 5000 # search radius sent to Foursquare (presumably metres -- confirm against the API docs)

# Passing the query as `params` lets requests do the URL encoding and keeps the
# credentials out of any URL string that might get printed.
url = 'https://api.foursquare.com/v2/venues/explore'
explore_params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'll': '{},{}'.format(neighborhood_latitude, neighborhood_longitude),
    'radius': radius,
    'limit': LIMIT,
}
response = requests.get(url, params=explore_params, timeout=30)
response.raise_for_status()  # surface auth/quota errors here, not as a KeyError later
results = response.json()

Latitude and longitude values of Ryerson, Garden District are 43.6571618, -79.37893709999999.


In [73]:
# @hidden_cell
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide the categories under the key 'categories' or, failing
        that, 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the categories value
    is empty or is not a list/tuple (for example None or NaN).

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [74]:
# @hidden_cell
# Turn the Foursquare response into a four-column venue table:
# "Venue Name", "Venue Category", "Latitude", "Longitude".
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
venue_column_names = ["Venue Name", "Venue Category", "Latitude", "Longitude"]

if venues and set(filtered_columns).issubset(nearby_venues.columns):
    # .copy() so the assignment below writes to an independent frame.
    nearby_venues = nearby_venues.loc[:, filtered_columns].copy()
    # Reduce each venue's category list to the name of its first category.
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
    nearby_venues.columns = venue_column_names
else:
    # No venues, or an unexpected payload: keep the expected schema with zero
    # rows. (Renaming the columns of a column-less DataFrame raises ValueError.)
    nearby_venues = pd.DataFrame(columns=venue_column_names)

print('{} Venues were returned for Neighborhood - {} , Borough - {} by Foursquare.'.format(nearby_venues.shape[0], downtown_toronto_data.Neighborhood[0], downtown_toronto_data.Borough[0]))

100 Venues were returned for Neighborhood - Ryerson, Garden District , Borough - Downtown Toronto by Foursquare.


### Printing the Top 10 Venue Categories in this Neighborhood

In [75]:
# @hidden_cells
# Count venues per category and keep the ten most frequent categories.
# After the coordinate columns are dropped, "Venue Name" carries the count.
top_categories = (
    nearby_venues
    .groupby("Venue Category")
    .count()
    .drop(columns=["Latitude", "Longitude"])
    .sort_values(by="Venue Name", ascending=False)
    .head(10)
)
top_categories
 

Unnamed: 0_level_0,Venue Name
Venue Category,Unnamed: 1_level_1
Coffee Shop,10
Hotel,5
Restaurant,3
Diner,3
Gastropub,3
Park,3
Concert Hall,3
Café,3
Japanese Restaurant,3
Theater,3


### Printing the Details about the Venues related to Top 10 Categories in this Neighborhood

In [76]:
# @hidden_cell
# Show every venue whose category is one of the top-10 categories.
# top_categories is indexed by "Venue Category", so its index holds the names.
in_top_category = nearby_venues["Venue Category"].isin(top_categories.index)
nearby_venues.loc[in_top_category].reset_index(drop=True)

Unnamed: 0,Venue Name,Venue Category,Latitude,Longitude
0,Page One Cafe,Café,43.657772,-79.376073
1,Ed Mirvish Theatre,Theater,43.655102,-79.379768
2,Kinka Izakaya Original,Japanese Restaurant,43.660596,-79.378891
3,Elgin And Winter Garden Theatres,Theater,43.653394,-79.378507
4,The Grand Hotel & Suites Toronto,Hotel,43.656449,-79.37411
5,The Queen and Beaver Public House,Gastropub,43.657472,-79.383524
6,GEORGE Restaurant,Restaurant,43.653346,-79.374445
7,Jimmy's Coffee,Coffee Shop,43.658421,-79.385613
8,Fahrenheit Coffee,Coffee Shop,43.652384,-79.372719
9,Dineen Coffee,Café,43.650497,-79.378765


### Map displaying these 100 venues

In [77]:
# @hidden_cell
# Geocode Downtown Toronto; latitude/longitude are reused as the centre of the
# venue maps that follow.
address = 'Downtown, Toronto, CA'
# Nominatim's usage policy requires an identifying user agent; recent geopy
# releases refuse to run with the default one.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown, Toronto are {}, {}.'.format(latitude, longitude))




The geograpical coordinate of Downtown, Toronto are 43.655115, -79.380219.


In [78]:
# Street-level map centred on the geocoded Downtown Toronto coordinates.
map_downtown_toronto = folium.Map(location=[latitude, longitude], zoom_start=14)

# One circle per venue; the popup reads "<category>, <venue name>".
venue_fields = zip(
    nearby_venues['Latitude'],
    nearby_venues['Longitude'],
    nearby_venues['Venue Name'],
    nearby_venues['Venue Category'],
)
for venue_lat, venue_lng, venue_name, venue_category in venue_fields:
    popup_text = '{}, {}'.format(venue_category, venue_name)
    marker = folium.CircleMarker(
        [venue_lat, venue_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_downtown_toronto)

# Last expression of the cell: renders the map inline.
map_downtown_toronto

### Let's cluster these 100 venues into 4 groups and display them on a map in different colors

In [79]:
# @hidden_cell
# Group the venues into kclusters clusters using their coordinates only.
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 4

# Select the feature columns by name. Dropping the non-feature columns instead
# would pull the "Cluster" column written below into the features whenever this
# cell is run a second time.
venue_coordinates = nearby_venues[["Latitude", "Longitude"]]

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venue_coordinates)

# store the cluster label of each venue alongside its data
nearby_venues["Cluster"] = kmeans.labels_



In [80]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=14)

# One colour per cluster label; position i is the colour of cluster i.
rainbow = ['red','blue','yellow','green']

# add markers to the map
for lat, lon, vname, cluster in zip(nearby_venues['Latitude'], nearby_venues['Longitude'], nearby_venues['Venue Name'], nearby_venues['Cluster']):
    label = folium.Popup(str(vname) + ' Cluster - ' + str(cluster), parse_html=True)
    # Labels start at 0, so index the palette directly. The modulo keeps the
    # lookup valid if kclusters is later raised above the palette size
    # (colours then repeat).
    marker_color = rainbow[int(cluster) % len(rainbow)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

### Hope you enjoyed this analysis