Import the required libraries. Using BeautifulSoup for scrappoing Wikipedia's Toronto Neighbourhood Data

In [None]:
import pandas as pd
import numpy as np
import requests as requests
from bs4 import BeautifulSoup

In [2]:
URL="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

Load the URL and extract the data using BeautifulSoup

In [3]:
res = requests.get(URL).text
soup = BeautifulSoup(res,'html.parser')

Create dataframe with the columns

In [4]:
columns=['Postalcode','Borough','Neighbourhood']
toronto_df = pd.DataFrame(columns=columns)

In [5]:
toronto_df

Unnamed: 0,Postalcode,Borough,Neighbourhood


Loop through the data and fill the dataframe with each row at a time

In [6]:
for items in soup.find('table', class_='wikitable').find_all('tr'):
    data = items.find_all(['td','/td'])
    try:
        postcode = data[0].text
        borough = data[0].find_next_sibling().text
        neighbourhood = data[0].find_next_sibling().find_next_sibling().text
        toronto_df = toronto_df.append({'Postalcode': postcode,
                                                      'Borough': borough,
                                                      'Neighbourhood': neighbourhood}, ignore_index=True)
    except IndexError:pass

Let's examine the dataframe

In [7]:
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


Remove the '\n' from Neighbourhood column

In [8]:
toronto_df['Neighbourhood'] = toronto_df['Neighbourhood'].replace('\n','', regex=True)
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Dropping rows with Borough = 'Not assigned'

In [10]:
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned']

Restting the index

In [11]:
toronto_df.index = np.arange(0, len(toronto_df))

Assigning the value of Borough if the Neighbourhood value = 'Not assigned'

In [13]:
toronto_df['Neighbourhood'].loc[toronto_df['Neighbourhood'] == 'Not assigned'] = toronto_df['Borough']

In [14]:
toronto_df.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


Merging the rows with same postal code

In [15]:
toronto_df = toronto_df.groupby(['Postalcode', 'Borough'])['Neighbourhood'].apply(list)
toronto_df = toronto_df.sample(frac=1).reset_index()
toronto_df['Neighbourhood'] = toronto_df['Neighbourhood'].str.join(',')

In [16]:
toronto_df.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M6A,North York,"Lawrence Heights,Lawrence Manor"
1,M4Y,Downtown Toronto,Church and Wellesley
2,M4R,Central Toronto,North Toronto West
3,M5R,Central Toronto,"The Annex,North Midtown,Yorkville"
4,M1X,Scarborough,Upper Rouge
5,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
6,M2P,North York,York Mills West
7,M6R,West Toronto,"Parkdale,Roncesvalles"
8,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf..."
9,M1P,Scarborough,"Dorset Park,Scarborough Town Centre,Wexford He..."


In [17]:
toronto_df.shape

(103, 3)

Fetching the longitude and Latitude from the csv file

In [19]:
lat_long_coor_df = pd.read_csv("http://cocl.us/Geospatial_data")

In [20]:
lat_long_coor_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging Neigubourhood data and geocode data

In [21]:
toronto_df = pd.merge(toronto_df, lat_long_coor_df, left_on='Postalcode', right_on='Postal Code')

In [22]:
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M6A,North York,"Lawrence Heights,Lawrence Manor",M6A,43.718518,-79.464763
1,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316
2,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
3,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",M5R,43.67271,-79.405678
4,M1X,Scarborough,Upper Rouge,M1X,43.836125,-79.205636


In [23]:
bor_toronto_df = toronto_df[toronto_df['Borough'].str.contains('Toronto', na=False)]

In [24]:
bor_toronto_df.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
1,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316
2,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
3,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",M5R,43.67271,-79.405678
7,M6R,West Toronto,"Parkdale,Roncesvalles",M6R,43.64896,-79.456325
8,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",M5V,43.628947,-79.39442
13,M6H,West Toronto,"Dovercourt Village,Dufferin",M6H,43.669005,-79.442259
15,M6S,West Toronto,"Runnymede,Swansea",M6S,43.651571,-79.48445
16,M4P,Central Toronto,Davisville North,M4P,43.712751,-79.390197
21,M5S,Downtown Toronto,"Harbord,University of Toronto",M5S,43.662696,-79.400049
24,M4K,East Toronto,"The Danforth West,Riverdale",M4K,43.679557,-79.352188


In [25]:
import geocoder

In [26]:
address = 'Toronto, Ontario'
g = geocoder.google(address)
latitude = g.latitude
longitude = g.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are None, None.


In [27]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



Fetch the latitude and longitude for Toronoto

In [28]:
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [29]:
!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [30]:
import folium

create map of Toronto using latitude and longitude values

In [31]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(bor_toronto_df['Latitude'], bor_toronto_df['Longitude'], bor_toronto_df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

In [32]:
CLIENT_ID = 'FDTRVYJUIMW5NIUX1P05QXRCVBYF44E2GGER5C4JR32JO21U'
CLIENT_SECRET = '1GBIIWCMLOMCHNWN2LBH5WVGZDY1LL2MHCD2MFF4FFGLKLJV'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FDTRVYJUIMW5NIUX1P05QXRCVBYF44E2GGER5C4JR32JO21U
CLIENT_SECRET:1GBIIWCMLOMCHNWN2LBH5WVGZDY1LL2MHCD2MFF4FFGLKLJV


#### Let's explore the first neighborhood in our dataframe.

Get the neighborhood's name.

In [36]:
bor_toronto_df.head()

Unnamed: 0,index,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,1,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316
1,2,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
2,3,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",M5R,43.67271,-79.405678
3,7,M6R,West Toronto,"Parkdale,Roncesvalles",M6R,43.64896,-79.456325
4,8,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",M5V,43.628947,-79.39442


In [35]:
bor_toronto_df = bor_toronto_df.reset_index()

In [37]:
bor_toronto_df.drop('index', axis=1)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316
1,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
2,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",M5R,43.67271,-79.405678
3,M6R,West Toronto,"Parkdale,Roncesvalles",M6R,43.64896,-79.456325
4,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",M5V,43.628947,-79.39442
5,M6H,West Toronto,"Dovercourt Village,Dufferin",M6H,43.669005,-79.442259
6,M6S,West Toronto,"Runnymede,Swansea",M6S,43.651571,-79.48445
7,M4P,Central Toronto,Davisville North,M4P,43.712751,-79.390197
8,M5S,Downtown Toronto,"Harbord,University of Toronto",M5S,43.662696,-79.400049
9,M4K,East Toronto,"The Danforth West,Riverdale",M4K,43.679557,-79.352188


## Explore Neighborhoods in Toronoto

#### Let's create a function to repeat the all the neighborhoods in Toronoto

In [38]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            30)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [42]:
toronto_venues = getNearbyVenues(names=bor_toronto_df['Neighbourhood'],
                                   latitudes=bor_toronto_df['Latitude'],
                                   longitudes=bor_toronto_df['Longitude']
                                  )

Church and Wellesley
North Toronto West
The Annex,North Midtown,Yorkville
Parkdale,Roncesvalles
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Dovercourt Village,Dufferin
Runnymede,Swansea
Davisville North
Harbord,University of Toronto
The Danforth West,Riverdale
Little Portugal,Trinity
Business Reply Mail Processing Centre 969 Eastern
Design Exchange,Toronto Dominion Centre
The Beaches West,India Bazaar
St. James Town
Lawrence Park
Harbourfront East,Toronto Islands,Union Station
High Park,The Junction South
Roselawn
Central Bay Street
Stn A PO Boxes 25 The Esplanade
Commerce Court,Victoria Hotel
Cabbagetown,St. James Town
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Ryerson,Garden District
Christie
Harbourfront
First Canadian Place,Underground city
Chinatown,Grange Park,Kensington Market
Studio District
Davisville
Forest Hill North,Forest Hill West
Rosedale
Brockton,Exhibition Place,Parkdale Village
Moore Park,Summ

Let's check the size of the resulting dataframe

In [43]:
print(toronto_venues.shape)
toronto_venues.head()

(828, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Church and Wellesley,43.66586,-79.38316,Storm Crow Manor,43.66684,-79.381593,Theme Restaurant
1,Church and Wellesley,43.66586,-79.38316,DanceLifeX Centre,43.666956,-79.385297,Dance Studio
2,Church and Wellesley,43.66586,-79.38316,Smith,43.666927,-79.381421,Breakfast Spot
3,Church and Wellesley,43.66586,-79.38316,Como En Casa,43.66516,-79.384796,Mexican Restaurant
4,Church and Wellesley,43.66586,-79.38316,The Alley,43.665922,-79.385567,Bubble Tea Shop


Let's check how many venues were returned for each neighborhood

In [44]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",30,30,30,30,30,30
Berczy Park,30,30,30,30,30,30
"Brockton,Exhibition Place,Parkdale Village",22,22,22,22,22,22
Business Reply Mail Processing Centre 969 Eastern,18,18,18,18,18,18
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",14,14,14,14,14,14
"Cabbagetown,St. James Town",30,30,30,30,30,30
Central Bay Street,30,30,30,30,30,30
"Chinatown,Grange Park,Kensington Market",30,30,30,30,30,30
Christie,16,16,16,16,16,16
Church and Wellesley,30,30,30,30,30,30


Let's find out how many unique categories can be curated from all the returned venues

In [46]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 182 uniques categories.


## Analyze Each Neighborhood


In [53]:
# one hot encoding
tor_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's examine the new dataframe size.

In [57]:
tor_onehot.shape

(828, 182)

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [58]:
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()
tor_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.071429,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.066667,0.033333,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333


In [59]:
tor_grouped.shape

(38, 182)

#### Let's print each neighborhood along with the top 5 most common venues

In [60]:
num_top_venues = 5

for hood in tor_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = tor_grouped[tor_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
              venue  freq
0        Steakhouse  0.10
1              Café  0.10
2             Hotel  0.07
3  Asian Restaurant  0.07
4  Sushi Restaurant  0.07


----Berczy Park----
                venue  freq
0  Seafood Restaurant  0.07
1        Cocktail Bar  0.07
2      Farmers Market  0.07
3         Coffee Shop  0.07
4            Beer Bar  0.07


----Brockton,Exhibition Place,Parkdale Village----
            venue  freq
0  Breakfast Spot  0.09
1     Coffee Shop  0.09
2            Café  0.09
3         Stadium  0.05
4     Music Venue  0.05


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.06
2             Brewery  0.06
3                 Spa  0.06
4          Smoke Shop  0.06


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
              venue  freq
0    Airport Lounge  0.14
1   Airport Service  0.14
2       C

#### Let's put that into a *pandas* dataframe

Write a function to sort the venues in descending order.

In [61]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [62]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Steakhouse,Café,Hotel,Asian Restaurant,Sushi Restaurant,Bar,Speakeasy,Smoke Shop,Seafood Restaurant,Gym / Fitness Center
1,Berczy Park,Farmers Market,Coffee Shop,Cocktail Bar,Seafood Restaurant,Beer Bar,Concert Hall,Steakhouse,Liquor Store,Breakfast Spot,Jazz Club
2,"Brockton,Exhibition Place,Parkdale Village",Breakfast Spot,Café,Coffee Shop,Pet Store,Bakery,Nightclub,Restaurant,Stadium,Italian Restaurant,Bar
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Gym / Fitness Center,Spa,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Boat or Ferry,Harbor / Marina,Bar,Plane,Coffee Shop,Sculpture Garden,Airport Gate,Airport Food Court


## Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [64]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

tor_grouped_clustering = tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [78]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = bor_toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,index,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316,2,Gay Bar,Wine Shop,Creperie,Salon / Barbershop,Bubble Tea Shop,Burger Joint,Restaurant,Ramen Restaurant,Pub,Pizza Place
1,2,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678,2,Sporting Goods Shop,Clothing Store,Coffee Shop,Gym / Fitness Center,Burger Joint,Chinese Restaurant,Dessert Shop,Diner,Gift Shop,Ice Cream Shop
2,3,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",M5R,43.67271,-79.405678,2,Café,Sandwich Place,Coffee Shop,Cosmetics Shop,American Restaurant,Indian Restaurant,BBQ Joint,Pizza Place,History Museum,Pub
3,7,M6R,West Toronto,"Parkdale,Roncesvalles",M6R,43.64896,-79.456325,2,Coffee Shop,Gift Shop,Dessert Shop,Eastern European Restaurant,Bar,Bank,Dog Run,Italian Restaurant,Bookstore,Restaurant
4,8,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",M5V,43.628947,-79.39442,2,Airport Lounge,Airport Service,Boat or Ferry,Harbor / Marina,Bar,Plane,Coffee Shop,Sculpture Garden,Airport Gate,Airport Food Court


In [79]:
toronto_merged.drop('index', axis=1)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316,2,Gay Bar,Wine Shop,Creperie,Salon / Barbershop,Bubble Tea Shop,Burger Joint,Restaurant,Ramen Restaurant,Pub,Pizza Place
1,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678,2,Sporting Goods Shop,Clothing Store,Coffee Shop,Gym / Fitness Center,Burger Joint,Chinese Restaurant,Dessert Shop,Diner,Gift Shop,Ice Cream Shop
2,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",M5R,43.67271,-79.405678,2,Café,Sandwich Place,Coffee Shop,Cosmetics Shop,American Restaurant,Indian Restaurant,BBQ Joint,Pizza Place,History Museum,Pub
3,M6R,West Toronto,"Parkdale,Roncesvalles",M6R,43.64896,-79.456325,2,Coffee Shop,Gift Shop,Dessert Shop,Eastern European Restaurant,Bar,Bank,Dog Run,Italian Restaurant,Bookstore,Restaurant
4,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",M5V,43.628947,-79.39442,2,Airport Lounge,Airport Service,Boat or Ferry,Harbor / Marina,Bar,Plane,Coffee Shop,Sculpture Garden,Airport Gate,Airport Food Court
5,M6H,West Toronto,"Dovercourt Village,Dufferin",M6H,43.669005,-79.442259,2,Pharmacy,Bakery,Supermarket,Middle Eastern Restaurant,Music Venue,Park,Café,Brewery,Bar,Bank
6,M6S,West Toronto,"Runnymede,Swansea",M6S,43.651571,-79.48445,2,Café,Sushi Restaurant,Pizza Place,Coffee Shop,Italian Restaurant,Smoothie Shop,Dessert Shop,Bar,Fish & Chips Shop,Pub
7,M4P,Central Toronto,Davisville North,M4P,43.712751,-79.390197,2,Gym,Hotel,Clothing Store,Sandwich Place,Breakfast Spot,Food & Drink Shop,Park,Garden Center,Garden,Dumpling Restaurant
8,M5S,Downtown Toronto,"Harbord,University of Toronto",M5S,43.662696,-79.400049,2,Café,Sandwich Place,Restaurant,Bookstore,Japanese Restaurant,Italian Restaurant,Bar,Bakery,French Restaurant,Beer Bar
9,M4K,East Toronto,"The Danforth West,Riverdale",M4K,43.679557,-79.352188,2,Greek Restaurant,Italian Restaurant,Ice Cream Shop,Yoga Studio,Pizza Place,Brewery,Bookstore,Spa,Restaurant,Juice Bar


Let's visualize the resulting clusters

In [81]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

#### Cluster 1

In [82]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postalcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,M4V,43.686412,-79.400049,0,Pub,Coffee Shop,American Restaurant,Supermarket,Bagel Shop,Sports Bar,Fried Chicken Joint,Pizza Place,Sushi Restaurant,Restaurant
36,M4E,43.676357,-79.293031,0,Health Food Store,Trail,Pub,Wine Shop,Dance Studio,Ethiopian Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store


#### Cluster 2

In [83]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postalcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
31,M5P,43.696948,-79.411307,1,Park,Trail,Jewelry Store,Sushi Restaurant,Wine Shop,Deli / Bodega,Ethiopian Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run
32,M4W,43.679563,-79.377529,1,Park,Playground,Trail,Wine Shop,Dance Studio,Ethiopian Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store


#### Cluster 3

In [84]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postalcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4Y,43.66586,-79.38316,2,Gay Bar,Wine Shop,Creperie,Salon / Barbershop,Bubble Tea Shop,Burger Joint,Restaurant,Ramen Restaurant,Pub,Pizza Place
1,M4R,43.715383,-79.405678,2,Sporting Goods Shop,Clothing Store,Coffee Shop,Gym / Fitness Center,Burger Joint,Chinese Restaurant,Dessert Shop,Diner,Gift Shop,Ice Cream Shop
2,M5R,43.67271,-79.405678,2,Café,Sandwich Place,Coffee Shop,Cosmetics Shop,American Restaurant,Indian Restaurant,BBQ Joint,Pizza Place,History Museum,Pub
3,M6R,43.64896,-79.456325,2,Coffee Shop,Gift Shop,Dessert Shop,Eastern European Restaurant,Bar,Bank,Dog Run,Italian Restaurant,Bookstore,Restaurant
4,M5V,43.628947,-79.39442,2,Airport Lounge,Airport Service,Boat or Ferry,Harbor / Marina,Bar,Plane,Coffee Shop,Sculpture Garden,Airport Gate,Airport Food Court
5,M6H,43.669005,-79.442259,2,Pharmacy,Bakery,Supermarket,Middle Eastern Restaurant,Music Venue,Park,Café,Brewery,Bar,Bank
6,M6S,43.651571,-79.48445,2,Café,Sushi Restaurant,Pizza Place,Coffee Shop,Italian Restaurant,Smoothie Shop,Dessert Shop,Bar,Fish & Chips Shop,Pub
7,M4P,43.712751,-79.390197,2,Gym,Hotel,Clothing Store,Sandwich Place,Breakfast Spot,Food & Drink Shop,Park,Garden Center,Garden,Dumpling Restaurant
8,M5S,43.662696,-79.400049,2,Café,Sandwich Place,Restaurant,Bookstore,Japanese Restaurant,Italian Restaurant,Bar,Bakery,French Restaurant,Beer Bar
9,M4K,43.679557,-79.352188,2,Greek Restaurant,Italian Restaurant,Ice Cream Shop,Yoga Studio,Pizza Place,Brewery,Bookstore,Spa,Restaurant,Juice Bar


#### Cluster 4

In [85]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postalcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,M5N,43.711695,-79.416936,3,Home Service,Garden,Wine Shop,Deli / Bodega,Falafel Restaurant,Ethiopian Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store


#### Cluster 5

In [86]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postalcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
34,M4T,43.689574,-79.38316,4,Gym,Trail,Restaurant,Comfort Food Restaurant,Deli / Bodega,Falafel Restaurant,Ethiopian Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run
