First of all we import needed libraries and open our saved data as new dataframe.

In [2]:
import pandas as pd
from geopy.geocoders import Nominatim #convert an address into latitude and longitude values
import folium
from sklearn.cluster import KMeans
import requests
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors



In [3]:
postal_df = pd.read_csv('final_postal_df.csv')
postal_df.shape

(103, 5)

Now we define coordinates of Toronto city.

In [4]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="skirka")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

Create a map of Toronto with neighborhoods superimposed on top.

In [5]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(postal_df['Latitude'], postal_df['Longitude'], postal_df['Borough'], postal_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Due to there are no so many postal code zones in our dataframe, I've decided to analyze and cluster all neighborhoods. Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them. First of all create some usefull variables.

In [6]:
CLIENT_ID = '5F4Z503LHA0TFNKY3IXOIXFHVYYMQP5L0DKJONG0NTNVZ0RW'
CLIENT_SECRET = '3W33JWTCF0L5GDTET3PZ1FP05JLE0T4HZPCPWZJEK022SVJZ'
VERSION = '20181221'
RADIUS = 500
LIMIT = 100

Define function which creates dataframe of all neighborhoods and venues.

In [7]:
def getNearbyVenues(names, latitudes, longitudes):
    venues_list=[]

    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Run the function above to receive the needed dataframe.

In [8]:
toronto_venues = getNearbyVenues(names=postal_df['Neighborhood'],
                                   latitudes=postal_df['Latitude'],
                                   longitudes=postal_df['Longitude']
                                  )

In [9]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge,Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
2,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Guildwood,Morningside,West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Guildwood,Morningside,West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


Next step is to restructure the data for the analysis. The desired result is the following: neighborhoods should be in rows, different types of venues in columns, specific gravity of each type of venues as values.
We do it in 2 steps. First is to create dummies, next - group the rows as we need.

In [10]:
#one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

#move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
len(toronto_onehot['Neighborhood'].unique())

#group the rows by neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [11]:
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood,Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
toronto_grouped.shape

(99, 274)

Please pay attention, that initially we have 103 neighborhoods, while now only 99. It's because some coordinates (neighborhoods) have no any venues according to Forsquare API requests. Bit 99 is still more than enough for further analysis.

Now we have data for cluster analysis. To understand the reasons of clustering later, let's create one more dataframe, which shows top 10 venues for each neighborhood. First, let's write a function to sort the venues in descending order. Then create the new dataframe and display the top 10 venues for each neighborhood.

In [13]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [14]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Thai Restaurant,American Restaurant,Steakhouse,Asian Restaurant,Bakery,Hotel,Clothing Store,Restaurant
1,Agincourt,Breakfast Spot,Skating Rink,Clothing Store,Lounge,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Playground,Park,Women's Store,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Pizza Place,Grocery Store,Fried Chicken Joint,Coffee Shop,Sandwich Place,Pharmacy,Fast Food Restaurant,Beer Store,Gluten-free Restaurant,Dance Studio
4,"Alderwood,Long Branch",Pizza Place,Athletics & Sports,Coffee Shop,Pharmacy,Pool,Pub,Sandwich Place,Skating Rink,Gym,Airport Service


Now let's cluster our neighborhoods.

In [15]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 1, 0, 0, 2, 2, 2, 2, 2], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [16]:
toronto_merged = neighborhoods_venues_sorted

# add clustering labels
toronto_merged['Cluster'] = kmeans.labels_

# merge toronto_merged with postal_df to add latitude/longitude and other information for each neighborhood
toronto_merged = toronto_merged.join(postal_df.set_index('Neighborhood'), on='Neighborhood')

# change the order of columns for better visualisation.
cols = toronto_merged.columns.tolist()

cols.insert(0, 'PostalCode')
cols.insert(1, 'Borough')
cols.insert(3, 'Latitude')
cols.insert(4, 'Longitude')
cols.insert(5, 'Cluster')
cols = cols[:-6]

toronto_merged = toronto_merged[cols]

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue
0,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568,2,Coffee Shop,Café,Thai Restaurant,American Restaurant,Steakhouse,Asian Restaurant,Bakery,Hotel,Clothing Store
1,M1S,Scarborough,Agincourt,43.7942,-79.262029,2,Breakfast Spot,Skating Rink,Clothing Store,Lounge,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant
2,M1V,Scarborough,"Agincourt North,L'Amoreaux East,Milliken,Steel...",43.815252,-79.284577,1,Playground,Park,Women's Store,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
3,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437,0,Pizza Place,Grocery Store,Fried Chicken Joint,Coffee Shop,Sandwich Place,Pharmacy,Fast Food Restaurant,Beer Store,Gluten-free Restaurant
4,M8W,Etobicoke,"Alderwood,Long Branch",43.602414,-79.543484,0,Pizza Place,Athletics & Sports,Coffee Shop,Pharmacy,Pool,Pub,Sandwich Place,Skating Rink,Gym


Let's show the results on the map.

In [17]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Now let's analyse each cluster.

In [18]:
#The first cluster - mainly periphery of Toronto. The most popular places are pizza, store and coffee shop.
toronto_merged.loc[toronto_merged['Cluster'] == 0, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Pizza Place,Grocery Store,Fried Chicken Joint,Coffee Shop,Sandwich Place,Pharmacy,Fast Food Restaurant,Beer Store,Gluten-free Restaurant
4,"Alderwood,Long Branch",Pizza Place,Athletics & Sports,Coffee Shop,Pharmacy,Pool,Pub,Sandwich Place,Skating Rink,Gym
24,"Clarks Corners,Sullivan,Tam O'Shanter",Pizza Place,Chinese Restaurant,Rental Car Location,Thai Restaurant,Fried Chicken Joint,Noodle House,Fast Food Restaurant,Italian Restaurant,Drugstore
56,Humber Summit,Pizza Place,Shopping Mall,Empanada Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
88,"The Junction North,Runnymede",Pizza Place,Women's Store,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
91,Victoria Village,Pizza Place,Coffee Shop,Hockey Arena,Portuguese Restaurant,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
92,Westmount,Pizza Place,Middle Eastern Restaurant,Sandwich Place,Chinese Restaurant,Coffee Shop,Intersection,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
94,Willowdale West,Pizza Place,Grocery Store,Coffee Shop,Butcher,Pharmacy,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
95,Woburn,Coffee Shop,Korean Restaurant,Women's Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop


In [19]:
#The 2'nd cluster - relax zone with park.
toronto_merged.loc[toronto_merged['Cluster'] == 1, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Playground,Park,Women's Store,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
13,"CFB Toronto,Downsview East",Snack Place,Airport,Park,Bus Stop,Empanada Restaurant,Ethiopian Restaurant,Event Space,Electronics Store,Eastern European Restaurant
16,Caledonia-Fairbanks,Park,Women's Store,Pharmacy,Fast Food Restaurant,Market,Drugstore,Diner,Discount Store,Dog Run
40,East Toronto,Park,Convenience Store,Women's Store,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
45,"Forest Hill North,Forest Hill West",Trail,Sushi Restaurant,Park,Jewelry Store,Women's Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run
58,"Kingsview Village,Martin Grove Gardens,Richvie...",Pizza Place,Park,Mobile Phone Shop,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
62,Lawrence Park,Dim Sum Restaurant,Swim School,Park,Bus Line,Women's Store,Donut Shop,Diner,Discount Store,Dog Run
65,"Maple Leaf Park,North Park,Upwood Park",Bakery,Basketball Court,Park,Construction & Landscaping,Women's Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
72,Parkwoods,Fast Food Restaurant,Park,Food & Drink Shop,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Event Space,Eastern European Restaurant,Dessert Shop
74,Rosedale,Park,Playground,Trail,Women's Store,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store


In [20]:
#The 3'd cluster - mainly cnter of Toronto with very diversed types of venues.
toronto_merged.loc[toronto_merged['Cluster'] == 2, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Thai Restaurant,American Restaurant,Steakhouse,Asian Restaurant,Bakery,Hotel,Clothing Store
1,Agincourt,Breakfast Spot,Skating Rink,Clothing Store,Lounge,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant
5,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Sushi Restaurant,Deli / Bodega,Fried Chicken Joint,Frozen Yogurt Shop,Bank,Bridal Shop,Sandwich Place,Diner
6,Bayview Village,Bank,Chinese Restaurant,Café,Japanese Restaurant,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
7,"Bedford Park,Lawrence Manor East",Coffee Shop,Sushi Restaurant,Italian Restaurant,Thai Restaurant,Fast Food Restaurant,Greek Restaurant,Sandwich Place,Juice Bar,Liquor Store
8,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Cheese Shop,Café,Pub,Seafood Restaurant,Farmers Market,Steakhouse
9,"Birch Cliff,Cliffside West",College Stadium,Skating Rink,Café,General Entertainment,Women's Store,Donut Shop,Diner,Discount Store,Dog Run
10,"Bloordale Gardens,Eringate,Markland Wood,Old B...",Pizza Place,Liquor Store,Café,Shopping Plaza,Pharmacy,Beer Store,Golf Course,Gluten-free Restaurant,Electronics Store
11,"Brockton,Exhibition Place,Parkdale Village",Breakfast Spot,Coffee Shop,Café,Pet Store,Caribbean Restaurant,Falafel Restaurant,Performing Arts Venue,Stadium,Bar
12,Business reply mail Processing Centre969 Eastern,Light Rail Station,Yoga Studio,Garden,Smoke Shop,Park,Farmers Market,Spa,Fast Food Restaurant,Brewery


In [21]:
#The 4'th cluster - baseball zone.
toronto_merged.loc[toronto_merged['Cluster'] == 3, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue
36,Downsview Central,Baseball Field,Food Truck,Women's Store,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
41,"Emery,Humberlea",Baseball Field,Women's Store,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
55,"Humber Bay,King's Mill Park,Kingsway Park Sout...",Baseball Field,Construction & Landscaping,Women's Store,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore


In [22]:
#The 5'th cluster - neighborhood which differs from other.
toronto_merged.loc[toronto_merged['Cluster'] == 4, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue
80,"Silver Hills,York Mills",Cafeteria,Women's Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
