# Introduction/Business Problem
### To identify the best location to set up a new Park in Toronto
### This could help the government bodies choose the right location for parks
### This helps ensure that parks are evenly distributed and that a larger number of people visit them, thereby improving the overall health of the population




# Data
### 1. Data on Toronto Neighbourhoods taken from Wikipedia
### 2. Geospatial data to get the latitude and longitude of each neighbourhood 
### 3. Using the latitude and longitude, we make API calls to Foursquare to get data on the venues in each neighbourhood, such as parks and restaurants.
### 4. Using the geospatial data, we can also plot them on a map to give us a better understanding of the data

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis


import json # library to handle JSON files

# NOTE(review): this install runs unconditionally every time the cell executes, regardless of the trailing shell comment.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): json_normalize is not referenced anywhere in the visible notebook; recent pandas releases expose it as pd.json_normalize — confirm before upgrading pandas.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# NOTE(review): this install also runs unconditionally and pins folium to 0.5.0.
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         868 KB

The following NEW packages will be INSTALLED:

    altair:  3.2.0-py36_0 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forg

In [2]:
# Install/upgrade the HTML-scraping dependencies through the shell.
# NOTE(review): the scraping cell parses with 'html.parser', so lxml appears unused in the visible notebook — confirm before removing.
!pip install --upgrade BeautifulSoup4
!pip install lxml

Requirement already up-to-date: BeautifulSoup4 in /opt/conda/envs/Python36/lib/python3.6/site-packages (4.8.1)


## Getting data using Beautiful Soup

In [3]:
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the postal codes starting with "M".
CAsource = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
CAsource.raise_for_status()
CAsoup = BeautifulSoup(CAsource.text, 'html.parser')

# The data lives in the first element carrying the "wikitable" class.
table = CAsoup.find(class_='wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' was found in the downloaded page")

# Turn every table row into the list of its cell texts (header cells and data cells alike).
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

# The first row of the table is the header; everything after it is data.
columns = rows[0] if rows else []
data = rows[1:]

# Convert the collected rows into a pandas DataFrame.
CA_df = pd.DataFrame(data=data, columns=columns)
CA_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Removing Neighbourhoods where the Borough is not assigned

In [4]:
# Keep only the rows whose borough is assigned. The explicit copy makes the result an
# independent frame, so later column assignments do not write into a slice of the raw table.
has_borough = CA_df['Borough'] != 'Not assigned'
CA_df = CA_df.loc[has_borough].copy()
CA_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Merging Neighbourhoods with the same postal code

In [5]:
# For every postal code, build one comma-separated string of all its neighbourhood names.
joined_names = CA_df.groupby("Postcode")["Neighbourhood"].transform(lambda neigh: ', '.join(neigh))

# assign() returns a new frame (no write into a filtered slice); after that every row of a
# postal code carries the same joined string, so keeping the first row per code is enough.
CA_df = CA_df.assign(Neighbourhood=joined_names).drop_duplicates('Postcode')

CA_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Not assigned


### Replacing Neighbourhood with Borough names, in case not assigned

In [6]:
# Where the neighbourhood is "Not assigned", fall back to the borough name of the same row.
# Built as a new column via mask()/assign() instead of an in-place replace on a column
# selection, which does not reliably update the parent frame.
is_unassigned = CA_df['Neighbourhood'] == 'Not assigned'
CA_df = CA_df.assign(Neighbourhood=CA_df['Neighbourhood'].mask(is_unassigned, CA_df['Borough']))
CA_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Queen's Park


In [7]:
# Display the (rows, columns) dimensions of CA_df at this point in the notebook.
CA_df.shape

(103, 3)

### Getting Latitude and Longitude for the Neighbourhoods

In [8]:
# Work on an independent copy so changes made through CA2_df cannot leak back into CA_df.
CA2_df = CA_df.copy()

In [9]:
# Load the CSV that provides a latitude and longitude for each postal code.
a = pd.read_csv('https://cocl.us/Geospatial_data')

# Rename the key column so it matches the 'Postcode' column used by the neighbourhood table.
gdf = a.rename(columns={'Postal Code': 'Postcode'})
gdf.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach the coordinates to each postal-code row (inner join on 'Postcode').
# validate='one_to_one' raises if either side contains a repeated postal code, which would
# otherwise silently multiply rows in the merged table.
CA_df_merge_col = pd.merge(CA2_df, gdf, on='Postcode', validate='one_to_one')
CAlatlong = CA_df_merge_col

In [11]:
# Preview the first rows of the merged neighbourhood + coordinate table.
CAlatlong.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [12]:
#!pip install folium
import folium
from geopy.geocoders import Nominatim

# Look up the coordinates of Toronto through the Nominatim geocoder.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() yields None when nothing matches; stop with a clear message instead of an
# AttributeError on the attribute accesses below.
if location is None:
    raise RuntimeError('Geocoding returned no result for {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [39]:
# Base map centred on a fixed Toronto coordinate pair.
toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# Draw one circle marker per row of CAlatlong, labelled with the neighbourhood name(s).
for lat, lng, label in zip(CAlatlong['Latitude'], CAlatlong['Longitude'], CAlatlong['Neighbourhood']):
    # parse_html=True lets names containing quotes or commas render safely in the popup.
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, so it is no longer passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto)
toronto

## Using FOURSQUARE API to get Data of Venues nearby the Neighbourhoods

In [16]:
import os
import warnings

# Credentials are read from the environment so they never live in the notebook itself.
# The key pair that used to be committed here should be treated as leaked and revoked.
FOURSQUARE_CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
FOURSQUARE_CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set in the environment; '
        'Foursquare API requests will be rejected.'
    )

# Request settings. LIMIT and VERSION are used by getNearbyVenues; RADIUS and NO_OF_VENUES
# are not referenced by any cell in the visible notebook.
RADIUS = 4000
LIMIT = 100
NO_OF_VENUES = 100
VERSION = '20191007'

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed together with zip().
    radius : int, default 500
        Value sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty (but still has these
        columns) when there are no locations or no venues.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting it by hand.
        params = {
            'client_id': FOURSQUARE_CLIENT_ID,
            'client_secret': FOURSQUARE_CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        # Surface HTTP errors (bad credentials, quota, ...) instead of a KeyError further down.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category is kept, with a missing category value.
                categories[0]['name'] if categories else None))

    # Passing the column names here keeps the schema intact even when no rows were collected.
    nearby_venues = pd.DataFrame(rows, columns=columns)
    return(nearby_venues)
# Fetch the venues around every coordinate pair in CAlatlong; no radius is passed, so the
# function's default applies.
toronto_venues = getNearbyVenues(
    names=CAlatlong['Neighbourhood'],
    latitudes=CAlatlong['Latitude'],
    longitudes=CAlatlong['Longitude'],
)

Parkwoods
Victoria Village
Harbourfront, Regent Park
Lawrence Heights, Lawrence Manor
Queen's Park
Islington Avenue
Rouge, Malvern
Don Mills North
Woodbine Gardens, Parkview Hill
Ryerson, Garden District
Glencairn
Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
Highland Creek, Rouge Hill, Port Union
Flemingdon Park, Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights
Thorncliffe Park
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
East Birchmount Park, Ionview, Kennedy Park
Bayview Village
CFB Toronto, Downsview East
The D

In [40]:
# Preview the first venue rows returned by getNearbyVenues.
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


### Using One Hot Encoding

In [24]:
# one hot encoding
# Expand 'Venue Category' into one indicator column per category (no column-name prefix).
to_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the neighbourhood of each venue row as a new trailing column ...
to_onehot['Neighborhoods'] = toronto_venues['Neighborhood']

# ... then rotate that trailing column to the front, leaving the rest in their order.
column_order = to_onehot.columns.tolist()
column_order = column_order[-1:] + column_order[:-1]
to_onehot = to_onehot[column_order]

print(to_onehot.shape)
to_onehot.head()

(2235, 277)


Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Average the indicator columns per neighbourhood: each value becomes the share of that
# neighbourhood's venue rows falling in the category. as_index=False keeps 'Neighborhoods'
# as an ordinary first column.
groups = to_onehot.groupby(["Neighborhoods"], as_index=False).mean()

print(groups.shape)
groups.head(2)

(99, 277)


Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Narrow the per-neighbourhood table down to the neighbourhood name and its 'Park' share.
park_columns = ["Neighborhoods", "Park"]
park = groups[park_columns]
park.head()

Unnamed: 0,Neighborhoods,Park
0,"Adelaide, King, Richmond",0.0
1,Agincourt,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.333333
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0
4,"Alderwood, Long Branch",0.0


## Applying K-means Clustering

In [30]:
# Number of clusters requested from k-means.
toclusters = 5

# Cluster on the 'Park' share alone. The keyword form of drop() is used because the
# positional axis argument is not accepted by current pandas.
to_clustering = park.drop(columns=["Neighborhoods"])

# Run k-means clustering. n_init is stated explicitly so the number of restarts does not
# depend on the installed scikit-learn version; random_state keeps the labels reproducible.
kmeans = KMeans(n_clusters=toclusters, n_init=10, random_state=0).fit(to_clustering)

to_clustering.head()

Unnamed: 0,Park
0,0.0
1,0.0
2,0.333333
3,0.0
4,0.0


In [32]:
# New frame = park plus the k-means label of each row (labels are in the same row order as
# the frame that was clustered).
to_merged = park.assign(**{"Cluster Labels": kmeans.labels_})

In [33]:
# Rebuild the labelled table from `park` and the fitted model before joining, so that this
# cell can be re-run safely (joining the already-joined frame again fails on overlapping
# columns). The join adds every venue row of the matching neighbourhood.
to_merged = (
    park.assign(**{"Cluster Labels": kmeans.labels_})
        .join(toronto_venues.set_index("Neighborhood"), on="Neighborhoods")
)

print(to_merged.shape)
to_merged.head()

(2235, 9)


Unnamed: 0,Neighborhoods,Park,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Adelaide, King, Richmond",0.0,0,43.650571,-79.384568,Four Seasons Centre for the Performing Arts,43.650592,-79.385806,Concert Hall
0,"Adelaide, King, Richmond",0.0,0,43.650571,-79.384568,The Keg Steakhouse & Bar,43.649937,-79.384196,Steakhouse
0,"Adelaide, King, Richmond",0.0,0,43.650571,-79.384568,Nathan Phillips Square,43.65227,-79.383516,Plaza
0,"Adelaide, King, Richmond",0.0,0,43.650571,-79.384568,Rosalinda,43.650252,-79.385156,Vegetarian / Vegan Restaurant
0,"Adelaide, King, Richmond",0.0,0,43.650571,-79.384568,Shangri-La Toronto,43.649129,-79.386557,Hotel


In [34]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [38]:
# Create the map, centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# to_merged holds one row per venue, so reduce it to one row per neighbourhood. This gives
# every neighbourhood exactly one marker and the same map on every run, instead of a random
# sample of venue rows.
neighbourhood_clusters = to_merged.drop_duplicates('Neighborhoods')

# One outline colour per cluster label; labels outside the table fall back to gray so that
# no cluster is silently left off the map.
cluster_colors = {0: 'red', 1: 'darkblue', 2: 'orange', 3: 'green', 4: 'purple'}
fallback_color = 'gray'

for lat, lon, poi, cluster in zip(neighbourhood_clusters['Neighborhood Latitude'],
                                  neighbourhood_clusters['Neighborhood Longitude'],
                                  neighbourhood_clusters['Neighborhoods'],
                                  neighbourhood_clusters['Cluster Labels']):
    label = folium.Popup((str(poi) + ' - cluster ' + str(cluster)), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colors.get(int(cluster), fallback_color),
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_clusters)

map_clusters