## Applied Data Science Capstone
This notebook will be used to demonstrate some of the skills that are necessary for a data science project.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Smoke test: confirm the kernel executes code and prints output.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Week 3 Assignment: Segmenting and Clustering Neighborhoods in Toronto, Canada

In [3]:
# Wikipedia page holding the table of postal codes that start with "M".
wiki_path = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# pd.read_html returns a *list* of DataFrames (one per HTML table found on the page),
# so `canada_df` is a list, not a single DataFrame.
canada_df = pd.read_html(wiki_path)
# Display the first table parsed from the page.
canada_df[0]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


- The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood. Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
- More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.
- If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.
- Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
- In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [4]:
# Keep only the rows of the first parsed table whose Borough is not 'Not assigned'.
postcode_data = canada_df[0].loc[lambda table: table['Borough'] != 'Not assigned']

In [5]:
# Inspect the filtered table (rows with an unassigned borough have been removed).
postcode_data

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


In [6]:
# Collapse rows that share a postal code into a single record.
#
# `cleaned_dict` holds three parallel lists (one entry per distinct postal code, in
# order of first appearance). When a postal code is seen again, its neighbourhood
# is appended to the existing entry, separated by ", ". The borough recorded for a
# postal code is the one from its first row.
cleaned_dict = {'PostalCode' : [], 'Borough': [], 'Neighborhood' : []}

# postal code -> position of that code in the parallel lists above; gives O(1)
# lookups instead of scanning the 'PostalCode' list for every row.
position_of_postcode = {}

for post, borough, neighb in zip(postcode_data['Postcode'], postcode_data['Borough'],
        postcode_data['Neighbourhood']):
    # A row that has a borough but a 'Not assigned' neighbourhood takes the
    # borough name as its neighbourhood.
    if neighb == 'Not assigned':
        neighb = borough

    if post in position_of_postcode:
        index = position_of_postcode[post]
        cleaned_dict['Neighborhood'][index] = cleaned_dict['Neighborhood'][index] + ", " + neighb
    else:
        position_of_postcode[post] = len(cleaned_dict['PostalCode'])
        cleaned_dict['PostalCode'].append(post)
        cleaned_dict['Borough'].append(borough)
        cleaned_dict['Neighborhood'].append(neighb)

In [7]:
# Turn the dict of parallel lists into a table: each key becomes a column.
canada_clean_df = pd.DataFrame(cleaned_dict)
canada_clean_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [8]:
# Spot check: the combined neighbourhood string stored for postal code M5V.
canada_clean_df.loc[canada_clean_df['PostalCode'] == 'M5V', 'Neighborhood'].values

array(['CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara'],
      dtype=object)

In [9]:
# (rows, columns) of the cleaned table: one row per distinct postal code.
canada_clean_df.shape

(103, 3)

In [10]:
# Local CSV file, resolved relative to the notebook's working directory.
geo_coord_path = 'Geospatial_Coordinates.csv'
geo_coord_df = pd.read_csv(geo_coord_path)
# Preview the first rows of the coordinates table.
geo_coord_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# (rows, columns) of the coordinates table, for comparison with canada_clean_df.
geo_coord_df.shape

(103, 3)

In [12]:
# Attach coordinates to each postal code. The key column is named differently in
# the two tables, so both names are given; an inner join keeps only codes present in both.
canada_clean_df = canada_clean_df.merge(
    geo_coord_df,
    how='inner',
    left_on='PostalCode',
    right_on='Postal Code',
)
canada_clean_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,M7A,43.662301,-79.389494


In [13]:
# (rows, columns) after the merge, to check the row count against the pre-merge table.
canada_clean_df.shape

(103, 6)

In [14]:
# Remove the duplicate key column ('Postal Code') left over from the merge.
# A new frame is assigned instead of mutating in place, and errors='ignore' makes
# the cell safe to re-run: once the column is gone, a second run is a no-op
# rather than a KeyError.
canada_clean_df = canada_clean_df.drop(columns=['Postal Code'], errors='ignore')
print(canada_clean_df.shape)
canada_clean_df.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


In [15]:
# List the distinct Borough values (duplicates removed).
canada_clean_df.Borough.unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [21]:
# Build a new dataframe restricted to the four boroughs whose name contains "Toronto",
# renumbering the rows from 0.
toronto_data = (
    canada_clean_df
    .loc[lambda table: table['Borough'].isin([
        'Downtown Toronto',
        'East Toronto',
        'West Toronto',
        'Central Toronto',
    ])]
    .reset_index(drop=True)
)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [22]:
from geopy.geocoders import Nominatim # convert address into latitude and longitude
import folium # map rendering library

In [23]:
# Look up the coordinates used to center the map.
address = 'Downtown Toronto, ON'

geolocator = Nominatim(user_agent='canada_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message here instead of an AttributeError on `.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Downtown Toronto are {}, {}'.format(latitude, longitude))

The geographical coordinate of Downtown Toronto are 43.6563221, -79.3809161


### Visualization
**Create a map of Toronto, Canada, with neighborhoods superimposed on top.**

In [24]:
# Base map centered on the geocoded coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of toronto_data; clicking it shows the neighborhood text.
for lat, lng, label in zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Neighborhood'],
):
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: render the map.
map_toronto

## Use FourSquare data

In [28]:
# Read the Foursquare credentials from a local file so they are not hardcoded in
# the notebook. The file is read as: line 1 = client id, line 2 = client secret.
cred_file_path = './foursquare.cre'
CREDENTIALS = {}
with open(cred_file_path, 'r') as file_object:
    # readline() keeps the trailing newline; strip it so it does not end up
    # inside the request URL.
    CLIENT_ID = file_object.readline().strip()
    CLIENT_SECRET = file_object.readline().strip()

# Fail early on a missing or short file instead of sending empty credentials.
if not CLIENT_ID or not CLIENT_SECRET:
    raise ValueError(
        'Expected a client id on line 1 and a client secret on line 2 of {!r}'.format(cred_file_path)
    )

CREDENTIALS['CLIENT_ID'] = CLIENT_ID
CREDENTIALS['CLIENT_SECRET'] = CLIENT_SECRET

### Explore the first neighborhood in the dataframe

In [29]:
# Neighborhood value stored in the row labelled 0 of toronto_data.
toronto_data.loc[0, 'Neighborhood']

'Harbourfront'

In [30]:
# Pull the name and coordinates of the row labelled 0 into standalone variables.
neighborhood_name = toronto_data.loc[0, 'Neighborhood']
neighborhood_latitude, neighborhood_longitude = (
    toronto_data.loc[0, 'Latitude'],
    toronto_data.loc[0, 'Longitude'],
)

print(
    'Latitude and longitude values of {} are {}, {}.'.format(
        neighborhood_name, neighborhood_latitude, neighborhood_longitude
    )
)

Latitude and longitude values of Harbourfront are 43.6542599, -79.3606359.


In [33]:
# Build the request URL for up to LIMIT venues within `radius` meters of the
# selected neighborhood's coordinates.
LIMIT = 100  # sent as the `limit` query parameter

radius = 500  # sent as the `radius` query parameter

VERSION = '20180604'  # sent as the `v` query parameter

# Credentials are stripped defensively: values read with readline() carry a
# trailing newline, which must not be embedded in the URL.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CREDENTIALS['CLIENT_ID'].strip(),
    CREDENTIALS['CLIENT_SECRET'].strip(),
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

In [35]:
import requests
import json
# json_normalize is exposed at the pandas top level in newer releases; the
# pandas.io.json location is the older, deprecated one. Try the new path first.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP errors (e.g. rejected credentials) here rather than as a
# confusing KeyError when the payload is unpacked later.
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5e605015edbcad001b18b504'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 49,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

In [38]:
# create a function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue record.

    Parameters
    ----------
    row : dict or pandas.Series
        Record indexable by key. The category list is read from 'categories',
        falling back to 'venue.categories' when that key is absent.

    Returns
    -------
    str or None
        The 'name' of the first category, or None when the category value is
        empty or is not a list (e.g. NaN for a record without categories).

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present in `row`.
    """
    try:
        category_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors are not swallowed.
        category_list = row['venue.categories']

    # A missing value arrives as NaN (a float), which has no len(); treat it as "no category".
    if not isinstance(category_list, (list, tuple)) or len(category_list) == 0:
        return None
    return category_list[0]['name']

In [40]:
# Columns to keep from the flattened response: name, categories and coordinates.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']

# The venue records are the items of the first group in the response.
venues = results['response']['groups'][0]['items']

# Flatten the nested records into a dataframe and keep only the wanted columns.
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# Replace each row's category list with the name of its first category.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Shorten the column names to the part after the last dot.
nearby_venues.columns = [name.rsplit(".", 1)[-1] for name in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149


In [41]:
# Report how many venue rows ended up in the nearby_venues table.
print('{} venues were returned by Foursquare'.format(nearby_venues.shape[0]))

49 venues were returned by Foursquare
