In [16]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

### Load the wiki page and parse out the table with ```wikitable``` CSS class

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'html.parser')
table = soup.find_all("table", class_="wikitable")[0]

### Parse the table rows and columns and convert them into an array

In [3]:
records = []
for row in table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 3:
            records.append((cols[0].text.strip(), cols[1].text.strip(), cols[2].text.strip()))
data_array = np.asarray(records)

### Build the data frame using the array with the specified column titles

In [4]:
df = pd.DataFrame(data_array, columns = ['PostalCode', 'Borough', 'Neighborhood'])

### Drop the items with no ```Borough``` name

In [5]:
df = df[df.Borough != 'Not assigned']

### Replace _Not assigned_ values in  ```Neighborhood``` column with the value from  ```Borough```  column 

In [6]:
df['Neighborhood'] = np.where(df['Neighborhood'] == 'Not assigned', df['Borough'], df['Neighborhood'])

### Finally group the items by  ```PostalCode``` value and join  ```Neighborhood``` values

In [7]:
df = df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(', '.join).reset_index()

### Print the shape of data frame

In [8]:
df.shape

(103, 3)

### Read the latitude and longitude information for postal codes into a data frame

In [9]:
df_geo = pd.read_csv('https://cocl.us/Geospatial_data')

### Join the columns from both data frames on  ```PostalCode```  so that each postal code in our data frme will have a longitude and lattitude columns

In [10]:
df = df.join(df_geo.set_index('Postal Code'), on='PostalCode')

### Check the shape of the new data frame to make sure we have 2 additional columns.

In [11]:
df.shape

(103, 5)

In [15]:
df_toronto = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [19]:
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [20]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [26]:
VERSION = '20180605' # Foursquare API version
LIMIT = 100
radius = 500

In [27]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [29]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'], latitudes=df_toronto['Latitude'], longitudes=df_toronto['Longitude'])
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [30]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Thai Restaurant,Gym,American Restaurant,Hotel,Bar,Breakfast Spot,Restaurant
1,Berczy Park,Cocktail Bar,Restaurant,Coffee Shop,Seafood Restaurant,Cheese Shop,Steakhouse,Bakery,Farmers Market,Café,Pub
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Breakfast Spot,Grocery Store,Climbing Gym,Stadium,Caribbean Restaurant,Bar,Italian Restaurant,Furniture / Home Store
3,Business reply mail Processing Centre969 Eastern,Yoga Studio,Pizza Place,Auto Workshop,Burrito Place,Skate Park,Fast Food Restaurant,Farmers Market,Brewery,Spa,Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Terminal,Airport Service,Airport Lounge,Boat or Ferry,Airport Gate,Boutique,Plane,Harbor / Marina,Airport Food Court,Airport
5,"Cabbagetown, St. James Town",Restaurant,Coffee Shop,Café,Pet Store,Pizza Place,Park,Italian Restaurant,Pub,Bakery,Indian Restaurant
6,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Chinese Restaurant,Burger Joint,Bar,Ice Cream Shop,Salad Place,Sandwich Place,Sushi Restaurant
7,"Chinatown, Grange Park, Kensington Market",Bar,Café,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Bakery,Coffee Shop,Chinese Restaurant,Mexican Restaurant,Dumpling Restaurant,Caribbean Restaurant
8,Christie,Grocery Store,Café,Park,Italian Restaurant,Coffee Shop,Convenience Store,Restaurant,Baby Store,Nightclub,Athletics & Sports
9,Church and Wellesley,Japanese Restaurant,Coffee Shop,Gay Bar,Burger Joint,Sushi Restaurant,Restaurant,Mediterranean Restaurant,Café,Men's Store,Yoga Studio


In [31]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [32]:
df_merged = df_toronto

# add clustering labels
df_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_merged = df_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

df_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Coffee Shop,Pub,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Restaurant,Italian Restaurant,Fruit & Vegetable Store,Pub,Pizza Place,Liquor Store,Juice Bar
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Pizza Place,Movie Theater,Ice Cream Shop,Fast Food Restaurant,Fish & Chips Shop,Sushi Restaurant,Burrito Place,Pub,Burger Joint,Italian Restaurant
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,American Restaurant,Gastropub,Bakery,Italian Restaurant,Yoga Studio,Stationery Store,Bank,Bar
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Park,Dim Sum Restaurant,Swim School,Bus Line,Women's Store,Department Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [33]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Cluster 1

In [34]:
df_merged.loc[df_merged['Cluster Labels'] == 0, df_merged.columns[[1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Coffee Shop,Pub,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
1,East Toronto,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Restaurant,Italian Restaurant,Fruit & Vegetable Store,Pub,Pizza Place,Liquor Store,Juice Bar
2,East Toronto,0,Pizza Place,Movie Theater,Ice Cream Shop,Fast Food Restaurant,Fish & Chips Shop,Sushi Restaurant,Burrito Place,Pub,Burger Joint,Italian Restaurant
3,East Toronto,0,Café,Coffee Shop,American Restaurant,Gastropub,Bakery,Italian Restaurant,Yoga Studio,Stationery Store,Bank,Bar
4,Central Toronto,0,Park,Dim Sum Restaurant,Swim School,Bus Line,Women's Store,Department Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
5,Central Toronto,0,Park,Clothing Store,Breakfast Spot,Sandwich Place,Food & Drink Shop,Hotel,Burger Joint,Design Studio,Dumpling Restaurant,Donut Shop
6,Central Toronto,0,Clothing Store,Coffee Shop,Sporting Goods Shop,Grocery Store,Health & Beauty Service,Sandwich Place,Salon / Barbershop,Rental Car Location,Park,Mexican Restaurant
7,Central Toronto,0,Sandwich Place,Dessert Shop,Sushi Restaurant,Pharmacy,Restaurant,Italian Restaurant,Café,Pizza Place,Seafood Restaurant,Coffee Shop
8,Central Toronto,0,Gym,Playground,Trail,Tennis Court,Women's Store,Discount Store,Design Studio,Dessert Shop,Dim Sum Restaurant,Diner
9,Central Toronto,0,Coffee Shop,Pub,American Restaurant,Bagel Shop,Supermarket,Sushi Restaurant,Light Rail Station,Convenience Store,Pizza Place,Vietnamese Restaurant


#### Cluster 2

In [35]:
df_merged.loc[df_merged['Cluster Labels'] == 1, df_merged.columns[[1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
28,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Cocktail Bar,Pub,Seafood Restaurant,Italian Restaurant,Hotel,Bakery,Art Gallery


#### Cluster 3

In [36]:
df_merged.loc[df_merged['Cluster Labels'] == 2, df_merged.columns[[1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Downtown Toronto,2,Japanese Restaurant,Coffee Shop,Gay Bar,Burger Joint,Sushi Restaurant,Restaurant,Mediterranean Restaurant,Café,Men's Store,Yoga Studio


#### Cluster 4

In [37]:
df_merged.loc[df_merged['Cluster Labels'] == 3, df_merged.columns[[1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Downtown Toronto,3,Coffee Shop,Café,Italian Restaurant,Chinese Restaurant,Burger Joint,Bar,Ice Cream Shop,Salad Place,Sandwich Place,Sushi Restaurant
22,Central Toronto,3,Garden,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
27,Downtown Toronto,3,Airport Terminal,Airport Service,Airport Lounge,Boat or Ferry,Airport Gate,Boutique,Plane,Harbor / Marina,Airport Food Court,Airport


#### Cluster 5

In [38]:
df_merged.loc[df_merged['Cluster Labels'] == 4, df_merged.columns[[1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
24,Central Toronto,4,Café,Coffee Shop,Sandwich Place,Pizza Place,Liquor Store,Jewish Restaurant,BBQ Joint,Pub,Flower Shop,Burger Joint
