# Coursera Project

In [87]:
from IPython.display import HTML
from IPython.display import display

# Taken from https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
tag = HTML('''<script>
code_show=true; 
function code_toggle() {
    if (code_show){
        $('div.cell.code_cell.rendered.selected div.input').hide();
    } else {
        $('div.cell.code_cell.rendered.selected div.input').show();
    }
    code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
Sensitive info <a href="javascript:code_toggle()">here</a>.''')
display(tag)

#### This a notebook that will be used for the "Battle of the Neighborhoods" Capstone project

In [2]:
# Importing numpy and pandas libraries 
import pandas as pd
import numpy as np
import json as js

In [3]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

#### This part of the notebook will be used to do the assignment: "Segmenting and Clustering Neighborhoods in Toronto"

In [4]:
import requests as rq
from bs4 import BeautifulSoup 

In [5]:
# Fetching request for Library

fh = rq.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [6]:
# Parsing and Creatinbg the table

html = BeautifulSoup(fh.text, "html.parser")

TFSA = {
    "Postal Code": [],
    "Borough": [],
    "Neighborhood": [],
}

Data = []

for item in html.find("table",class_ = "wikitable sortable").findChildren('td'):
    Data.append(item.text.strip("\n"))
    
    
TFSA["Postal Code"] = Data[::3]
TFSA["Borough"] = Data[1::3]
TFSA["Neighborhood"] = Data[2::3]

DF = pd.DataFrame(TFSA)

DF

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [7]:
# Sorting and Preparing the Data
DF = DF[DF.Borough != "Not assigned"]
DF = DF.reset_index(drop = True)
DF.sort_values(["Postal Code"])

Unnamed: 0,Postal Code,Borough,Neighborhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae
...,...,...,...
64,M9N,York,Weston
70,M9P,Etobicoke,Westmount
77,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
89,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


This is the final dataframe after scraping and formating the information from the wikipedia page. Next I will use the CSV document with the Lat, Long positions and append them to this data frame.

#### Finding Geo Coordinates of Neighborhoods

In [8]:
# Reading the CSV file
fh = pd.read_csv("C:/Users/howto/Downloads/CourseraProject/Geospatial_Coordinates.csv")
fh = pd.DataFrame(fh)
fh = round(fh, 2)
fh.sort_values(["Postal Code"])


# Join on in Pandas
final = DF.join(fh.set_index('Postal Code'), on='Postal Code')
final

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75,-79.33
1,M4A,North York,Victoria Village,43.73,-79.32
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65,-79.36
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72,-79.46
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66,-79.39
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65,-79.51
99,M4Y,Downtown Toronto,Church and Wellesley,43.67,-79.38
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.66,-79.32
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.64,-79.50


In this section I appended the lat, long coordinates of the Neighborhoods from the CSV file provided by the course.

#### Plotting the Neighborhoods in Toronto

In this section I will plot the various neighborhoods found in toronto from my organized data frame.

In [9]:
# Finding address of toronto
from geopy.geocoders import Nominatim as nm
import folium as fl

address = 'Toronto, CA'

gl= nm(user_agent = "coursera project")
loc = gl.geocode(address)
latitude = loc.latitude
longitude = loc.longitude
print('Coords Toronto = Lat: {}, Long: {}.'.format(latitude, longitude))

Coords Toronto = Lat: 43.6534817, Long: -79.3839347.


In [10]:
# Creating map of toronto
map_t = fl.Map(location = [latitude, longitude], zoom_start = 10)


# Plotting data on the map
for lat, lng, borough, neighborhood in zip(final['Latitude'], final['Longitude'], final['Borough'], final['Neighborhood']):
    # Labeling Marker
    label = '{}, {}'.format(neighborhood, borough)
    label = fl.Popup(label, parse_html=True)
    # Creating Marker
    fl.CircleMarker(
        [lat, lng],
        radius=1,
        popup=label,
        color='red',
        parse_html=False).add_to(map_t)  
    
map_t

This map shows a visualisation of the locations of the different neighborhoods found within Toronto

#### Clustering for Downtown Toronto 

For my Clustering Project I decided to focus on the borough "Downtown Toronoto" and explore mexican restaurants found there. First of all I need to create a new dataframe with just the specified borough.

In [11]:
dnew = final[final['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dnew.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65,-79.36
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66,-79.39
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.66,-79.38
3,M5C,Downtown Toronto,St. James Town,43.65,-79.38
4,M5E,Downtown Toronto,Berczy Park,43.64,-79.37


#### Function to get Neary by Venues

Here I define a function which allows me to convert my dataframe into a dataframe containing the mexican restaurants and their locations using the folium api.

In [85]:
display(tag)
cid = 'JTUTIJJCWTO3TF4CWTGUBGAIG4P4CQMX1QNCVVERXCX0VBBH'
cs = 'DHP1QE34WDFGYCCVG2BTGXCWLSCP52YNLI5IMEV0P2KMV2XU'

In [86]:
def NearbyVenues(venue, radius, names, latitudes, longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # set an api call
        
        par = {'client_id': cid,
        'client_secret': cs,
        'v': '20190425', 
        'limit': 5,
        'll':  str(lat)+","+str(lng),
        'query': venue,
        'radius': radius}

        # Parse through json and extract nessassery information
    
        file = rq.get("https://api.foursquare.com/v2/venues/search", params = par).json()["response"]["venues"]
        for v in file:
            venues_list.append([
            name, 
            lat, 
            lng, 
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            ])
        
        # Create Data Frame for the extracted information
        
        nvenues = pd.DataFrame([item for venue_list in venues_list for item in venues_list])
        nvenues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude']
    return(nvenues)

#### Finding Mexican Restaurant in Toronto 

Using the previously defined function I create a new dataset containing the different mexican restaurant locations

In [83]:
nvenues = NearbyVenues("Mexican Restaurant", 500, dnew['Neighborhood'], dnew['Latitude'], dnew['Longitude'])


In [68]:
nvenues.groupby('Neighborhood').count()[["Venue"]]

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Berczy Park,77
Central Bay Street,385
Christie,231
Church and Wellesley,385
"Commerce Court, Victoria Hotel",385
"First Canadian Place, Underground city",385
"Garden District, Ryerson",385
"Harbourfront East, Union Station, Toronto Islands",385
"Kensington Market, Chinatown, Grange Park",385
"Queen's Park, Ontario Provincial Government",385


Check the number of venues returned in a 500 mile radiues from the original geolocation

In [69]:
nvenues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
0,"Regent Park, Harbourfront",43.65,-79.36,Archeo,43.650667,-79.359431
1,"Regent Park, Harbourfront",43.65,-79.36,Site Of Great Canary Restaurant,43.653323,-79.357883
2,"Regent Park, Harbourfront",43.65,-79.36,Morning Glory Cafe,43.653947,-79.361149
3,"Queen's Park, Ontario Provincial Government",43.66,-79.39,Subway,43.659875,-79.388606
4,"Queen's Park, Ontario Provincial Government",43.66,-79.39,Some Time BBQ Grill Restaurant 碳烤屋,43.655874,-79.393826
...,...,...,...,...,...,...
5924,Church and Wellesley,43.67,-79.38,Matisse Restaurant And Bar,43.670574,-79.384856
5925,Church and Wellesley,43.67,-79.38,Subway,43.671232,-79.380731
5926,Church and Wellesley,43.67,-79.38,Saxony Restaurant,43.670932,-79.376372
5927,Church and Wellesley,43.67,-79.38,Mahal Kita Restaurant. Take-out & Catering,43.671215,-79.375069


#### Finding the cluster

Using the sk.learn library I use kmeans clustering to create 5 clusters based on these locations

In [64]:
from sklearn.cluster import KMeans as km

kclusters = 5
clst = nvenues[["Venue Latitude", "Venue Longitude"]]
kmeans = km(n_clusters = kclusters, random_state = 0).fit(clst)

I append the returned array with the cluster to which they belong to

In [65]:
cluster = pd.DataFrame(kmeans.labels_, columns = ["cluster"])
fincluster = pd.concat([nvenues,cluster], axis = 1)
fincluster

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,cluster
0,"Regent Park, Harbourfront",43.65,-79.36,Archeo,43.650667,-79.359431,4
1,"Regent Park, Harbourfront",43.65,-79.36,Site Of Great Canary Restaurant,43.653323,-79.357883,4
2,"Regent Park, Harbourfront",43.65,-79.36,Morning Glory Cafe,43.653947,-79.361149,4
3,"Queen's Park, Ontario Provincial Government",43.66,-79.39,Subway,43.659875,-79.388606,0
4,"Queen's Park, Ontario Provincial Government",43.66,-79.39,Some Time BBQ Grill Restaurant 碳烤屋,43.655874,-79.393826,0
...,...,...,...,...,...,...,...
5924,Church and Wellesley,43.67,-79.38,Matisse Restaurant And Bar,43.670574,-79.384856,2
5925,Church and Wellesley,43.67,-79.38,Subway,43.671232,-79.380731,2
5926,Church and Wellesley,43.67,-79.38,Saxony Restaurant,43.670932,-79.376372,2
5927,Church and Wellesley,43.67,-79.38,Mahal Kita Restaurant. Take-out & Catering,43.671215,-79.375069,2


Next I plot the previous dataframe using foliumn and matplotlib colors to show the distinct clusters

In [66]:
# add colors
import matplotlib.cm as cm
import matplotlib.colors as colors


# create map
clustermp = fl.Map(location=[latitude , longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(fincluster['Venue Latitude'], fincluster['Venue Longitude'], fincluster['Venue'], fincluster['cluster']):
    label = fl.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    fl.CircleMarker(
        [lat, lon],
        radius=2,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(clustermp)
       
clustermp

Based on this map it is possible to see that the clusters formed are around distinct areas. Based on these clusters the best locations to get mexican food would be: "Toronto Union Station", "King", "Chinatown", "Parliment Street", "Wellesley" and  "Christie". However from the map the most dense cluster is located near "King" which is likely the best location to get Mexican Food. However this all depends where you start your journey from so each of these locations would be valid but "King" would be more prefered.