# Importing some libraries

In [442]:
import pandas as pd
import numpy as np
# import k-means from clustering stage
from sklearn.cluster import KMeans
import requests # library to handle requests
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

## Loading and scrapping the Wikipedia page

In [443]:
# URL of the data to be read from wiki
url_data = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [444]:
# Using pandas to read html
df_wiki = pd.read_html(url_data)

# Capture only the table data
df = df_wiki[0]


In [445]:
# Remove rows with "Notassigned"
df = df[df.Neighbourhood != 'Not assigned']

Editing the records to allow one or more neighborhood in one postal code area

In [446]:
# Unique postcode
postcodes = df.Postcode.unique()

In [447]:
new_df = []
for i in range(len(postcodes)):
    idx = df.loc[df['Postcode'] == postcodes[i]]
    str_found = pd.DataFrame(idx.Neighbourhood)
    [r, c] = str_found.shape
    neighborhood_name = []
    for j in range(r):
        neighborhood_name.append(str_found.iat[j,0])
    tmp = [postcodes[i], df.iloc[i]['Borough'], neighborhood_name]
    new_df.append(tmp)


In [448]:
#new_df

In [449]:
new_df = pd.DataFrame(new_df, columns = ['Postcode','Borough' , 'Neighbourhood'])

**new_df** is the new dataframe after allowing one post code can contain more than one neighborhoods

In [450]:
new_df.shape

(102, 3)

Getting the **Latitude** and **Longitudet** and put them to our **new_df** 

In [451]:
#!pip install geocoder
postcodes = new_df.Postcode.unique()

In [452]:
la = pd.DataFrame(columns=['Latitude'])
long = pd.DataFrame(columns=['Longtitude'])

In [453]:
# Reading from CSV file
df_la_long = pd.read_csv('Geospatial_Coordinates.csv')

Adding two columns to store **Latitude** and **Longitude**

In [454]:
new_df = new_df.reindex(columns=[*new_df.columns.tolist(), 'Latitude', 'Longitude'], fill_value=0.00000000000)

**Looking** for latitudet and longitude of the Post code and fill up to data frame

In [455]:
for i in range(new_df.shape[0]):
    idx = df_la_long.loc[df_la_long['PostalCode'] == new_df.iat[i,0]]
    #print(new_df.iat[i,0])
    #print(idx)
    new_df.iat[i,3] = idx.iat[0,1]
    new_df.iat[i,4] = idx.iat[0,2]

**Below codes** aim at removing the "[" and "]" from the dataframe. 

In [456]:
for n in range(len(new_df['Neighbourhood'])):
    a = new_df.iat[n,2]
    if len(a) > 1:
        for i in range(len(a)):
            if i == 0:
                b = a[0]
            if i > 0:
                b = b+", " + a[i]
    else:
        b = a[0]
    new_df.iat[n,2] = b

In [457]:
new_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,Downtown Toronto,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M9A,North York,Islington Avenue,43.667856,-79.532242
5,M1B,North York,"Rouge, Malvern",43.806686,-79.194353
6,M3B,Etobicoke,Don Mills North,43.745906,-79.352188
7,M4B,Scarborough,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
8,M5B,Scarborough,"Ryerson, Garden District",43.657162,-79.378937
9,M6B,North York,Glencairn,43.709577,-79.445073


In [458]:
print('The dataframe has {} Toronto and {} neighborhoods.'.format(
        len(new_df['Borough'].unique()),
        new_df.shape[0]
    )
)

The dataframe has 8 Toronto and 102 neighborhoods.


In [459]:
import geocoder # import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


**Creating a map of Toronto with neighborhoods superimposed on top**

In [460]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(new_df['Latitude'], new_df['Longitude'], new_df['Borough'], new_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [461]:
#toronto_data = new_df[new_df['Borough'] == 'Toronto'].reset_index(drop=True)
toronto_data = new_df[new_df['Borough'].str.contains('Toronto')]
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,Downtown Toronto,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
12,M3C,Downtown Toronto,"Flemingdon Park, Don Mills South",43.7259,-79.340923
13,M4C,Downtown Toronto,Woodbine Heights,43.695344,-79.318389
26,M2H,Downtown Toronto,Hillcrest Village,43.803762,-79.363452


In [462]:
# create map of Toronto with neighbourhood that containing Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

***Define Foursquare Credentials and Version***

In [463]:
#  Foursquare credentials are defined in  cell bellow

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TA3O2UUNPDOYBTRP3DMA1Y0ZHDCCCGEHZKO0UEN0LLYYWV2R
CLIENT_SECRET:KMHPB0NX2OWQPVXEETUGZXTFQGNMD2SE0RIZ5WDBAP4LH5YB


In [464]:
# Get the neighbourhood name
toronto_data.iat[0, 1]

'Downtown Toronto'

**Function to repeat the same process to all the neighborhoods in Toronto**

In [465]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=30):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [466]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Harbourfront, Regent Park
Lawrence Heights, Lawrence Manor
Flemingdon Park, Don Mills South
Woodbine Heights
Hillcrest Village
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
The Danforth West, Riverdale
Design Exchange, Toronto Dominion Centre
Downsview, North Park, Upwood Park
Humber Summit
Cliffcrest, Cliffside, Scarborough Village West
Newtonbrook, Willowdale
Downsview Central
Lawrence Park
Roselawn
The Junction North, Runnymede
Weston
Dorset Park, Scarborough Town Centre, Wexford Heights
Willowdale West
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Canada Post Gateway Processing Centre
Kingsview Village, Martin Grove Gardens, Richview Gardens, St. Phillips
Agincourt
Agincourt North, L'Amoreaux East, Milliken, Steeles East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Humber Bay Shores, Mimico South

In [467]:
print(toronto_venues.shape)
toronto_venues.head()

(387, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Harbourfront, Regent Park",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [468]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 149 uniques categories.


**Analyze Each Neighborhood**

In [469]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [470]:
toronto_onehot.shape

(387, 149)

**Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category**

In [471]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Canada Post Gateway Processing Centre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.033333,0.0,0.033333,0.0,0.0,0.0,0.033333,0.0,0.0


In [472]:
toronto_grouped.shape

(31, 149)

Let's print each neighborhood along with the top 5 most common venues

In [473]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                venue  freq
0  Chinese Restaurant   0.2
1              Lounge   0.2
2        Skating Rink   0.2
3      Sandwich Place   0.2
4      Breakfast Spot   0.2


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                       venue  freq
0                       Park   0.5
1                 Playground   0.5
2  Middle Eastern Restaurant   0.0
3         Mac & Cheese Joint   0.0
4           Malay Restaurant   0.0


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0   Airport Service  0.19
1    Airport Lounge  0.12
2  Airport Terminal  0.12
3     Boat or Ferry  0.06
4               Bar  0.06


----Canada Post Gateway Processing Centre----
                       venue  freq
0                Coffee Shop  0.18
1                      Hotel  0.18
2        American Restaurant  0.09
3   Mediterranean Restaurant  0.09
4  Middle Eastern Restaurant  0.09


----

**Let's put that into a pandas dataframe**  
First, let's write a function to sort the venues in descending order.

In [474]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [475]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Chinese Restaurant,Breakfast Spot,Skating Rink,Lounge,Sandwich Place,Coffee Shop,Concert Hall,Cocktail Bar,Construction & Landscaping,Convenience Store
1,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Women's Store,Cosmetics Shop,Dessert Shop,Deli / Bodega,Dance Studio,Curling Ice,Cuban Restaurant,Creperie
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Boutique,Harbor / Marina,Coffee Shop
3,Canada Post Gateway Processing Centre,Hotel,Coffee Shop,Fried Chicken Joint,Mediterranean Restaurant,Middle Eastern Restaurant,American Restaurant,Sandwich Place,Gym / Fitness Center,Burrito Place,Concert Hall
4,Church and Wellesley,Gay Bar,Bubble Tea Shop,Ice Cream Shop,Hobby Shop,Salon / Barbershop,Dance Studio,Men's Store,Restaurant,Bookstore,Breakfast Spot


**Cluster neighbourhood**

Run k-means to cluster the neighborhood into 5 clusters.

In [476]:
# set number of clusters
kclusters = 7

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:50] 

array([0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 6, 0, 2,
       0, 0, 4, 0, 0, 0, 1, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [477]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

In [478]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

#toronto_merged # check the last columns!

In [479]:
# Drop NaN records
toronto_merged = toronto_merged.dropna()
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,0,Coffee Shop,Park,Bakery,Breakfast Spot,Gym / Fitness Center,Café,Mexican Restaurant,French Restaurant,Theater,Dessert Shop
3,M6A,Downtown Toronto,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,0,Clothing Store,Furniture / Home Store,Arts & Crafts Store,Coffee Shop,Miscellaneous Shop,Boutique,Event Space,Athletics & Sports,Sporting Goods Shop,Women's Store
12,M3C,Downtown Toronto,"Flemingdon Park, Don Mills South",43.7259,-79.340923,0,Coffee Shop,Beer Store,Gym,Asian Restaurant,General Entertainment,Sporting Goods Shop,Clothing Store,Bike Shop,Dim Sum Restaurant,Restaurant
13,M4C,Downtown Toronto,Woodbine Heights,43.695344,-79.318389,0,Park,Curling Ice,Skating Rink,Spa,Beer Store,Athletics & Sports,Cosmetics Shop,Pharmacy,Concert Hall,Construction & Landscaping
26,M2H,Downtown Toronto,Hillcrest Village,43.803762,-79.363452,0,Golf Course,Fast Food Restaurant,Dog Run,Mediterranean Restaurant,Pool,Deli / Bodega,Dance Studio,Curling Ice,Cuban Restaurant,Creperie


In [480]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [481]:
# add markers to the map
markers_colors = []
toronto_merged[list(['Cluster Labels'])] = toronto_merged[list(['Cluster Labels'])].astype(int)
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters