# PART 1

In [3]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests # Library for web scraping

In [4]:
df = pd.read_html('https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=942655599')[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


### Removing the not assigned borough

In [5]:
df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)

In [6]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [7]:
df.shape

(210, 3)

### More than one neighborhood can exist in one postal code area, M5A is listed twice and has two neighborhoods Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma using groupby

In [8]:
df1 = df.reset_index()
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 4 columns):
index            210 non-null int64
Postcode         210 non-null object
Borough          210 non-null object
Neighbourhood    210 non-null object
dtypes: int64(1), object(3)
memory usage: 6.6+ KB


In [9]:
df1.shape

(210, 4)

In [10]:
df2= df1.groupby('Postcode').agg(lambda x: ','.join(x))
df2.info()
df2.shape

<class 'pandas.core.frame.DataFrame'>
Index: 103 entries, M1B to M9W
Data columns (total 2 columns):
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(2)
memory usage: 2.4+ KB


(103, 2)

### There are also cells that have an assigned neighbouhoods,like M7A, lets assign their boroughs as their neighbourhood and removing the duplicate borough

In [11]:
df2.loc[df2['Neighbourhood']=="Not assigned",'Neighbourhood']=df2.loc[df2['Neighbourhood']=="Not assigned",'Borough']
df3 = df2.reset_index()
df3['Borough']= df3['Borough'].str.replace('nan|[{}\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(",{2,}",",")
df3

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [12]:
df3.shape

(103, 3)

# PART 2

#### Getting the latitude and the longitude coordinates of each neighborhood

In [13]:
df4=pd.read_csv('http://cocl.us/Geospatial_data')
df4

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [14]:
df4 = df4.rename(columns={'Postal Code': 'Postcode'}, index={'ONE': 'one'})
df4

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


#### Merging two Dataframes to include the latitudes and longitudes

In [15]:
df5 = pd.merge(df3, df4, on='Postcode')
df5

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Part 3: Build a test set with boroughs in Toronto

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    openssl-1.1.1e             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

In [16]:
import pandas as pd

import json

df5.to_json(path_or_buf='geo_toronto.json', orient='table')
with open('geo_toronto.json') as json_data:
    Toronto_data = json.load(json_data)
neighborhoods_data = Toronto_data['data']
neighborhoods_data[0]

{'index': 0,
 'Postcode': 'M1B',
 'Borough': 'Scarborough',
 'Neighbourhood': 'Rouge,Malvern',
 'Latitude': 43.8066863,
 'Longitude': -79.1943534}

In [18]:
df5.info()
df5.shape
df5.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
Latitude         103 non-null float64
Longitude        103 non-null float64
dtypes: float64(2), object(3)
memory usage: 9.8+ KB


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [19]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df5['Borough'].unique()),
        df5.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [20]:
import pandas as pd
import folium

print('imported pandas & folium')

import pandas as pd
import folium

#grab a random sample from df
subset_of_df = df5.sample(n=11)
map_test = folium.Map(location=[subset_of_df['Latitude'].mean(), 
                                subset_of_df['Longitude'].mean()], 
                      zoom_start=10)
#creating a Marker for each point in df_sample. Each point will get a popup with their zip
for row in subset_of_df.itertuples():
    map_test.add_child(folium.Marker(location=[row.Latitude ,row.Longitude],
           popup=row.Borough))

    
#map_test

#open map_test.html in browser
map_test.save("map_test.html")

imported pandas & folium


In [21]:
from folium.plugins import MarkerCluster
map_borough = folium.Map(location=[subset_of_df['Latitude'].mean(), 
 subset_of_df['Longitude'].mean()], 
 zoom_start=10)
mc = MarkerCluster()
#creating a Marker for each point in df_sample. Each point will get a popup with their zip
for row in subset_of_df.itertuples():
    mc.add_child(folium.Marker(location=[row.Latitude,  row.Longitude],
                 popup=row.Borough))
    map_borough.add_child(mc)


#map_borough

#open in map_borough.html browser 
map_borough.save("map_borough.html")

In [22]:
import pandas as pd
import folium



#grab a random sample from df
toronto_n = df5.sample(n=20)
map_toronto = folium.Map(location=[toronto_n['Latitude'].mean(), 
                                toronto_n['Longitude'].mean()], 
                      zoom_start=10)
#creating a Marker for each point in df_sample. Each point will get a popup with their zip
for row in toronto_n.itertuples():
    map_toronto.add_child(folium.Marker(location=[row.Latitude ,row.Longitude],
           popup=row.Neighbourhood))

    
map_toronto 

#open map_toronto.html in browser

map_toronto.save("map_toronto20.html")

In [23]:
df5.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [24]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
Latitude         103 non-null float64
Longitude        103 non-null float64
dtypes: float64(2), object(3)
memory usage: 9.8+ KB


In [25]:
address = 'Toronto, CA'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [26]:
map_toronto_neighbourhoods = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df5['Latitude'],df5['Longitude'], df5['Borough'], df5['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_neighbourhoods)  
    
map_toronto_neighbourhoods

map_toronto_neighbourhoods.save("map_toronto_neighbourhoods.html")

In [27]:
address = 'York, Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of York, Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinates of York, Toronto are 43.67910515, -79.49118414007154.


In [28]:
york_data = df5[df5['Borough'] == 'York'].reset_index(drop=True)
york_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
1,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
2,M6M,York,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",43.691116,-79.476013
3,M6N,York,"The Junction North,Runnymede",43.673185,-79.487262
4,M9N,York,Weston,43.706876,-79.518188


In [29]:
york_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
Postcode         5 non-null object
Borough          5 non-null object
Neighbourhood    5 non-null object
Latitude         5 non-null float64
Longitude        5 non-null float64
dtypes: float64(2), object(3)
memory usage: 280.0+ bytes


In [30]:
map_york_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(york_data['Latitude'], york_data['Longitude'], york_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_york_toronto)  
    
map_york_toronto

map_york_toronto.save("map_york_toronto.html")

In [31]:
CLIENT_ID = 'YV5M5XEMIX53QNMYDHNSTNZ2NSD35I1B1HSFAUSUBTBX1TMS' # your Foursquare ID
CLIENT_SECRET = 'NKJLVTI355ZE2QRBLUFTWAPCCXULVUNK0JUC4ULPPP35ACZY' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: YV5M5XEMIX53QNMYDHNSTNZ2NSD35I1B1HSFAUSUBTBX1TMS
CLIENT_SECRET:NKJLVTI355ZE2QRBLUFTWAPCCXULVUNK0JUC4ULPPP35ACZY


In [32]:
york_data.info()

neighbourhood_latitude = york_data.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = york_data.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = york_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
Postcode         5 non-null object
Borough          5 non-null object
Neighbourhood    5 non-null object
Latitude         5 non-null float64
Longitude        5 non-null float64
dtypes: float64(2), object(3)
memory usage: 280.0+ bytes
Latitude and longitude values of Humewood-Cedarvale are 43.6937813, -79.42819140000002.


In [33]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,VERSION,neighbourhood_latitude,neighbourhood_longitude,radius,LIMIT)
url
york_results = requests.get(url).json()

### Get the most common venue categories in each neighborhood

In [34]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
york_venues = york_results['response']['groups'][0]['items']
    
york_nearby_venues = json_normalize(york_venues) # flatten JSON

# filter columns
york_filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
york_nearby_venues = york_nearby_venues.loc[:, york_filtered_columns]

# filter the category for each row
york_nearby_venues['venue.categories'] = york_nearby_venues.apply(get_category_type, axis=1)

# clean columns
york_nearby_venues.columns = [col.split(".")[-1] for col in york_nearby_venues.columns]

york_nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Cedarvale Park,Field,43.692535,-79.428705
1,Cedarvale Ravine,Trail,43.690188,-79.426106
2,Glen Cedar Park,Playground,43.695399,-79.429253
3,Phil White Arena,Hockey Arena,43.691303,-79.431761


In [35]:
york_nearby_venues.info()
print('{} venues were returned by Foursquare.'.format(york_nearby_venues.shape[0]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
name          4 non-null object
categories    4 non-null object
lat           4 non-null float64
lng           4 non-null float64
dtypes: float64(2), object(2)
memory usage: 208.0+ bytes
4 venues were returned by Foursquare.


In [36]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [37]:
york_venues = getNearbyVenues(names=york_data['Neighbourhood'],
                                   latitudes=york_data['Latitude'],
                                   longitudes=york_data['Longitude']
                                  )

york_venues.head()

Humewood-Cedarvale
Caledonia-Fairbanks
Del Ray,Keelesdale,Mount Dennis,Silverthorn
The Junction North,Runnymede
Weston


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Park,43.692535,-79.428705,Field
1,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Ravine,43.690188,-79.426106,Trail
2,Humewood-Cedarvale,43.693781,-79.428191,Glen Cedar Park,43.695399,-79.429253,Playground
3,Humewood-Cedarvale,43.693781,-79.428191,Phil White Arena,43.691303,-79.431761,Hockey Arena
4,Caledonia-Fairbanks,43.689026,-79.453512,Nairn Park,43.690654,-79.4563,Park


In [38]:
york_venues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 7 columns):
Neighbourhood              17 non-null object
Neighbourhood Latitude     17 non-null float64
Neighbourhood Longitude    17 non-null float64
Venue                      17 non-null object
Venue Latitude             17 non-null float64
Venue Longitude            17 non-null float64
Venue Category             17 non-null object
dtypes: float64(4), object(3)
memory usage: 1.0+ KB


#### Let's see how many venues were returned for each neighborhood:

In [40]:
york_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Caledonia-Fairbanks,4,4,4,4,4,4
"Del Ray,Keelesdale,Mount Dennis,Silverthorn",4,4,4,4,4,4
Humewood-Cedarvale,4,4,4,4,4,4
"The Junction North,Runnymede",4,4,4,4,4,4
Weston,1,1,1,1,1,1


In [41]:
print('There are {} uniques categories.'.format(len(york_venues['Venue Category'].unique())))

There are 15 uniques categories.


In [42]:
york_onehot = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york_onehot['Neighbourhood'] = york_venues['Neighbourhood'] 

# move neighborhood column to the first column
york_fixed_columns = [york_onehot.columns[-1]] + list(york_onehot.columns[:-1])
york_onehot = york_onehot[york_fixed_columns]

york_onehot.head()

Unnamed: 0,Neighbourhood,Brewery,Bus Line,Fast Food Restaurant,Field,Fried Chicken Joint,Grocery Store,Hockey Arena,Market,Park,Pizza Place,Playground,Restaurant,Sandwich Place,Trail,Women's Store
0,Humewood-Cedarvale,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,Humewood-Cedarvale,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,Caledonia-Fairbanks,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [43]:
york_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 16 columns):
Neighbourhood           17 non-null object
Brewery                 17 non-null uint8
Bus Line                17 non-null uint8
Fast Food Restaurant    17 non-null uint8
Field                   17 non-null uint8
Fried Chicken Joint     17 non-null uint8
Grocery Store           17 non-null uint8
Hockey Arena            17 non-null uint8
Market                  17 non-null uint8
Park                    17 non-null uint8
Pizza Place             17 non-null uint8
Playground              17 non-null uint8
Restaurant              17 non-null uint8
Sandwich Place          17 non-null uint8
Trail                   17 non-null uint8
Women's Store           17 non-null uint8
dtypes: object(1), uint8(15)
memory usage: 471.0+ bytes


#### Let's group by neighbourhoods:

In [44]:
york_grouped = york_onehot.groupby('Neighbourhood').mean().reset_index()
york_grouped.head()

Unnamed: 0,Neighbourhood,Brewery,Bus Line,Fast Food Restaurant,Field,Fried Chicken Joint,Grocery Store,Hockey Arena,Market,Park,Pizza Place,Playground,Restaurant,Sandwich Place,Trail,Women's Store
0,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.0,0.0,0.0,0.25
1,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0
2,Humewood-Cedarvale,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0
3,"The Junction North,Runnymede",0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
4,Weston,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
york_grouped.info()
york_grouped.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 16 columns):
Neighbourhood           5 non-null object
Brewery                 5 non-null float64
Bus Line                5 non-null float64
Fast Food Restaurant    5 non-null float64
Field                   5 non-null float64
Fried Chicken Joint     5 non-null float64
Grocery Store           5 non-null float64
Hockey Arena            5 non-null float64
Market                  5 non-null float64
Park                    5 non-null float64
Pizza Place             5 non-null float64
Playground              5 non-null float64
Restaurant              5 non-null float64
Sandwich Place          5 non-null float64
Trail                   5 non-null float64
Women's Store           5 non-null float64
dtypes: float64(15), object(1)
memory usage: 720.0+ bytes


Unnamed: 0,Neighbourhood,Brewery,Bus Line,Fast Food Restaurant,Field,Fried Chicken Joint,Grocery Store,Hockey Arena,Market,Park,Pizza Place,Playground,Restaurant,Sandwich Place,Trail,Women's Store
0,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.0,0.0,0.0,0.25
1,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0
2,Humewood-Cedarvale,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0
3,"The Junction North,Runnymede",0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
4,Weston,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's find the top venues:

In [46]:
num_top_venues = 3

for hood in york_grouped['Neighbourhood']:
    print("----"+hood+"----")
    york_temp = york_grouped[york_grouped['Neighbourhood'] == hood].T.reset_index()
    york_temp.columns = ['venue','freq']
    york_temp = york_temp.iloc[1:]
    york_temp['freq'] = york_temp['freq'].astype(float)
    york_temp = york_temp.round({'freq': 2})
    print(york_temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Caledonia-Fairbanks----
           venue  freq
0           Park  0.50
1         Market  0.25
2  Women's Store  0.25


----Del Ray,Keelesdale,Mount Dennis,Silverthorn----
                  venue  freq
0  Fast Food Restaurant  0.25
1   Fried Chicken Joint  0.25
2            Restaurant  0.25


----Humewood-Cedarvale----
          venue  freq
0         Field  0.25
1  Hockey Arena  0.25
2    Playground  0.25


----The Junction North,Runnymede----
           venue  freq
0        Brewery  0.25
1       Bus Line  0.25
2  Grocery Store  0.25


----Weston----
      venue  freq
0      Park   1.0
1   Brewery   0.0
2  Bus Line   0.0




In [47]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 17

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
york_neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)

york_neighbourhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue


In [48]:
york_neighbourhoods_venues_sorted['Neighbourhood'] = york_grouped['Neighbourhood']

#york_neighbourhoods_venues_sorted.drop(columns=['17th Most Common Venue', '16th Most Common Venue','15th Most Common Venue'],axis=1)
york_neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue
0,Caledonia-Fairbanks,,,,,,,,,,,,,,,,,
1,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",,,,,,,,,,,,,,,,,
2,Humewood-Cedarvale,,,,,,,,,,,,,,,,,
3,"The Junction North,Runnymede",,,,,,,,,,,,,,,,,
4,Weston,,,,,,,,,,,,,,,,,


In [49]:
for ind in np.arange(york_grouped.shape[0]):
    york_neighbourhoods_venues_sorted.iloc[ind, 1:16] = return_most_common_venues(york_grouped.iloc[ind, :], num_top_venues)

york_neighbourhoods_venues_sorted.head(2)
# set number of clusters
kclusters = 2

york_grouped_clustering = york_grouped.drop('Neighbourhood', 1)

# run k-means clustering
york_kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
york_kmeans.labels_[0:5]

array([0, 1, 1, 1, 0], dtype=int32)

In [50]:
york_neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', york_kmeans.labels_)

york_merged = york_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(york_neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

york_merged

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue
0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191,1,Trail,Playground,Hockey Arena,Field,Women's Store,Sandwich Place,Restaurant,Pizza Place,Park,Market,Grocery Store,Fried Chicken Joint,Fast Food Restaurant,Bus Line,Brewery,,
1,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,0,Park,Women's Store,Market,Trail,Sandwich Place,Restaurant,Playground,Pizza Place,Hockey Arena,Grocery Store,Fried Chicken Joint,Field,Fast Food Restaurant,Bus Line,Brewery,,
2,M6M,York,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",43.691116,-79.476013,1,Sandwich Place,Restaurant,Fried Chicken Joint,Fast Food Restaurant,Women's Store,Trail,Playground,Pizza Place,Park,Market,Hockey Arena,Grocery Store,Field,Bus Line,Brewery,,
3,M6N,York,"The Junction North,Runnymede",43.673185,-79.487262,1,Pizza Place,Grocery Store,Bus Line,Brewery,Women's Store,Trail,Sandwich Place,Restaurant,Playground,Park,Market,Hockey Arena,Fried Chicken Joint,Field,Fast Food Restaurant,,
4,M9N,York,Weston,43.706876,-79.518188,0,Park,Women's Store,Trail,Sandwich Place,Restaurant,Playground,Pizza Place,Market,Hockey Arena,Grocery Store,Fried Chicken Joint,Field,Fast Food Restaurant,Bus Line,Brewery,,


In [51]:
york_merged.drop(columns=['16th Most Common Venue','17th Most Common Venue'])
york_merged

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue
0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191,1,Trail,Playground,Hockey Arena,Field,Women's Store,Sandwich Place,Restaurant,Pizza Place,Park,Market,Grocery Store,Fried Chicken Joint,Fast Food Restaurant,Bus Line,Brewery,,
1,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,0,Park,Women's Store,Market,Trail,Sandwich Place,Restaurant,Playground,Pizza Place,Hockey Arena,Grocery Store,Fried Chicken Joint,Field,Fast Food Restaurant,Bus Line,Brewery,,
2,M6M,York,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",43.691116,-79.476013,1,Sandwich Place,Restaurant,Fried Chicken Joint,Fast Food Restaurant,Women's Store,Trail,Playground,Pizza Place,Park,Market,Hockey Arena,Grocery Store,Field,Bus Line,Brewery,,
3,M6N,York,"The Junction North,Runnymede",43.673185,-79.487262,1,Pizza Place,Grocery Store,Bus Line,Brewery,Women's Store,Trail,Sandwich Place,Restaurant,Playground,Park,Market,Hockey Arena,Fried Chicken Joint,Field,Fast Food Restaurant,,
4,M9N,York,Weston,43.706876,-79.518188,0,Park,Women's Store,Trail,Sandwich Place,Restaurant,Playground,Pizza Place,Market,Hockey Arena,Grocery Store,Fried Chicken Joint,Field,Fast Food Restaurant,Bus Line,Brewery,,


#### Generate maps to visualize clustering

In [53]:
# create map
york_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighbourhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(york_map_clusters)
       
york_map_clusters

york_map_clusters.save("york_map_clusters.html")

In [None]:
york_map_clusters

### Examine Clusters

In [None]:
york_merged.loc[york_merged['Cluster Labels'] == 0, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

In [None]:
york_merged.loc[york_merged['Cluster Labels'] == 1, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]