# IBM Capstone

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


In [2]:
import requests
from bs4 import BeautifulSoup

## Get data from wikipedia

In [3]:
url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text


soup = BeautifulSoup(url,"lxml")
#print(soup.prettify())




table = soup.find("table",{"class":"wikitable sortable"})


parsed_table_data = []



rows = table.findAll('tr')
for row in rows:
                chicken = row.findChildren(recursive=False)
                row_text = []
            
                for chick in chicken:
                    clean_text = chick.text.strip()
                    row_text.append(clean_text)
  
            
                parsed_table_data.append(row_text)
                
                


df=pd.DataFrame(parsed_table_data,columns=["Postcode","Borough","Neighbour"])


df.head()

Unnamed: 0,Postcode,Borough,Neighbour
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


## Data cleaning

In [17]:
df=df[df.Borough!="Not assigned"]
df=df[df.Postcode!="Postcode"]
df.shape
df.loc[df.Neighbour=="Not assigned","Neighbour"] = df.Borough
df.head()

AttributeError: 'DataFrame' object has no attribute 'Postcode'

## Group by postcode

In [None]:
df1 = (df.groupby(['Postcode','Borough'])['Neighbour']
       .apply(lambda x: ','.join(set(x.dropna())))
       .reset_index())
df1.columns=["PostCode","Borough","Neighbour"]
df1.head()

In [18]:
df=df1
df.shape

(103, 3)

## GEO Data

In [19]:
geo_df = pd.read_csv("http://cocl.us/Geospatial_data")
geo_df.columns=["PostCode","Latitude","Longitude"]
geo_df.head()

Unnamed: 0,PostCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
df=pd.merge(df,geo_df,on="PostCode")
df.head()

Unnamed: 0,PostCode,Borough,Neighbour,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,Guildwood,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Filtering rows with the value Toronto

In [21]:
df=df.loc[df['Borough'].str.contains("Toronto")]
df.head()

Unnamed: 0,PostCode,Borough,Neighbour,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Nearby venues

In [23]:
df.shape

(38, 5)

## Position of Toronto

In [24]:
address = 'Toronto,CA'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [25]:
df.head()

Unnamed: 0,PostCode,Borough,Neighbour,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [44]:
neighborhoods=df
neighborhoods.head()


Unnamed: 0,PostCode,Borough,Neighbour,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### Let's check the size of the resulting dataframe

## A map of Toronto with neighborhoods superimposed

In [48]:
map_t = folium.Map(location=[latitude, longitude], zoom_start=10)


# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbour']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_t)  
    
map_t

### Let's simplify the above map and segment and cluster only the neighborhoods in East Toronto. So let's slice the original dataframe and create a new dataframe of the Toronto data.
 

In [None]:
df

In [56]:
East_Toronto = df[df['Borough'] == 'East Toronto'].reset_index(drop=True)
East_Toronto.head()

Unnamed: 0,PostCode,Borough,Neighbour,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558


In [57]:
address = 'East Toronto, Toronto,Canada'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of East Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of East Toronto are 43.6247901, -79.3934918.


In [60]:
# create map of East Toronto using latitude and longitude values
map_East_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(East_Toronto['Latitude'], East_Toronto['Longitude'], East_Toronto['Neighbour']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_East_Toronto)  
    
map_East_Toronto

### After retrieving all necessary data for East_Toronto. Lets explore using foursquare API Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

###  Foursquare Credentials and Version


In [52]:
CLIENT_ID = 'VOSXZ2DAW24EIZHS42UWJTZKFM43KPBUIIFHPGBBSGJYB5IK' # your Foursquare ID
CLIENT_SECRET = 'APLG3CIBEXPJRBFYVFBGMODGO43NL4AJR25XLELCT1D5YG10' # your Foursquare Secret
VERSION = '20181003' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: VOSXZ2DAW24EIZHS42UWJTZKFM43KPBUIIFHPGBBSGJYB5IK
CLIENT_SECRET:APLG3CIBEXPJRBFYVFBGMODGO43NL4AJR25XLELCT1D5YG10


In [61]:
East_Toronto.loc[0, 'Neighbour']

'The Beaches'

Get the neighbor's latitude and longitude values.

In [63]:
neighbor_latitude = East_Toronto.loc[0, 'Latitude'] # neighborhood latitude value
neighbor_longitude = East_Toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighbor_name = East_Toronto.loc[0, 'Neighbour'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbor_name, 
                                                               neighbor_latitude, 
                                                               neighbor_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [65]:
# Now, let's get the top 500 venues that are in The Beaches within a radius of 1000 meters.
# Also create GET request url 

radius = 1000
LIMIT = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbor_latitude, 
    neighbor_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=VOSXZ2DAW24EIZHS42UWJTZKFM43KPBUIIFHPGBBSGJYB5IK&client_secret=APLG3CIBEXPJRBFYVFBGMODGO43NL4AJR25XLELCT1D5YG10&v=20181003&ll=43.67635739999999,-79.2930312&radius=1000&limit=500'

In [66]:
import requests
import json
from pandas.io.json import json_normalize
results = requests.get(url).json()

In [67]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [68]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()
len(nearby_venues)
print('{} venues by Foursquare.'.format(nearby_venues.shape[0]))

77 venues by Foursquare.


## Now lets Explore Neighborhoods in East Toronto


In [69]:
#function to repeat the same process to all the neighborhoods in Scarborough
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Call the above function on each neighborhood and create a new dataframe called East_Toronto_venues

In [71]:
East_Toronto_venues = getNearbyVenues(names=East_Toronto['Neighbour'],
                                   latitudes=East_Toronto['Latitude'],
                                   longitudes=East_Toronto['Longitude'])

The Beaches
Riverdale,The Danforth West
India Bazaar,The Beaches West
Studio District
Business reply mail Processing Centre969 Eastern


In [72]:
East_Toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
3,"Riverdale,The Danforth West",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
4,"Riverdale,The Danforth West",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


In [73]:
print('There are {} uniques categories.'.format(len(East_Toronto_venues['Venue Category'].unique())))
East_Toronto_venues.groupby('Neighborhood').count()

There are 68 uniques categories.


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business reply mail Processing Centre969 Eastern,17,17,17,17,17,17
"India Bazaar,The Beaches West",20,20,20,20,20,20
"Riverdale,The Danforth West",42,42,42,42,42,42
Studio District,40,40,40,40,40,40
The Beaches,3,3,3,3,3,3


## Analyze Each Neighborhood


In [77]:
# one hot encoding
East_Toronto_venues_onehot = pd.get_dummies(East_Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
East_Toronto_venues_onehot['Neighborhood'] = East_Toronto_venues['Neighborhood'] 

# move neighbor column to the first column
fixed_columns = [East_Toronto_venues_onehot.columns[-1]] + list(East_Toronto_venues_onehot.columns[:-1])
East_Toronto_venues_onehot = East_Toronto_venues_onehot[fixed_columns]

East_Toronto_venues_onehot.head()

Unnamed: 0,Yoga Studio,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Food & Drink Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden,Garden Center,Gastropub,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Health Food Store,Hotel,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Juice Bar,Latin American Restaurant,Light Rail Station,Liquor Store,Lounge,Middle Eastern Restaurant,Movie Theater,Neighborhood,New American Restaurant,Park,Pet Store,Pizza Place,Pub,Recording Studio,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Spa,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Riverdale,The Danforth West",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,"Riverdale,The Danforth West",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [78]:
East_Toronto_venues_onehot.shape

(122, 68)

### lets group rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [79]:
East_Toronto_venues_grouped = East_Toronto_venues_onehot.groupby('Neighborhood').mean().reset_index()
East_Toronto_venues_grouped

Unnamed: 0,Neighborhood,Yoga Studio,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Food & Drink Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden,Garden Center,Gastropub,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Health Food Store,Hotel,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Juice Bar,Latin American Restaurant,Light Rail Station,Liquor Store,Lounge,Middle Eastern Restaurant,Movie Theater,New American Restaurant,Park,Pet Store,Pizza Place,Pub,Recording Studio,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Spa,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,Business reply mail Processing Centre969 Eastern,0.058824,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.058824,0.0,0.058824,0.058824,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0
1,"India Bazaar,The Beaches West",0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.05,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.1,0.05,0.0,0.05,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.05,0.0
2,"Riverdale,The Danforth West",0.02381,0.02381,0.0,0.02381,0.0,0.0,0.0,0.047619,0.0,0.02381,0.02381,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.02381,0.0,0.02381,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.238095,0.02381,0.0,0.0,0.02381,0.0,0.071429,0.02381,0.047619,0.02381,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.0,0.0,0.02381
3,Studio District,0.025,0.05,0.0,0.05,0.025,0.025,0.0,0.025,0.025,0.025,0.0,0.0,0.0,0.1,0.0,0.025,0.025,0.025,0.075,0.025,0.0,0.025,0.0,0.025,0.0,0.025,0.0,0.0,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.025,0.0,0.0,0.025,0.0,0.05,0.025,0.025,0.0,0.0,0.0,0.025,0.0,0.025,0.025,0.0,0.0,0.0,0.0,0.0,0.025,0.025,0.0,0.0,0.025,0.0,0.0,0.0
4,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Lets print each neighborhood along with the top 5 most common venues


In [81]:
num_top_venues = 5

for hood in East_Toronto_venues_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = East_Toronto_venues_grouped[East_Toronto_venues_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Business reply mail Processing Centre969 Eastern----
                venue  freq
0  Light Rail Station  0.12
1         Yoga Studio  0.06
2          Comic Shop  0.06
3                 Spa  0.06
4          Skate Park  0.06


----India Bazaar,The Beaches West----
               venue  freq
0               Park  0.10
1      Burrito Place  0.05
2              Hotel  0.05
3     Ice Cream Shop  0.05
4  Food & Drink Shop  0.05


----Riverdale,The Danforth West----
                venue  freq
0    Greek Restaurant  0.24
1         Coffee Shop  0.07
2      Ice Cream Shop  0.07
3           Bookstore  0.05
4  Italian Restaurant  0.05


----Studio District----
                venue  freq
0                Café  0.10
1         Coffee Shop  0.08
2              Bakery  0.05
3  Italian Restaurant  0.05
4           Gastropub  0.05


----The Beaches----
               venue  freq
0                Pub  0.33
1        Coffee Shop  0.33
2        Yoga Studio  0.00
3  Indian Restaurant  0.00
4       Liquor S

Function to sort the venues in descending order.


In [82]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### New dataframe and display the top 5 venues for each neighborhood.


In [87]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = East_Toronto_venues_grouped['Neighborhood']

for ind in np.arange(East_Toronto_venues_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(East_Toronto_venues_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()
len(neighborhoods_venues_sorted)

5

In [88]:
East_Toronto_venues_grouped

Unnamed: 0,Neighborhood,Yoga Studio,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Food & Drink Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden,Garden Center,Gastropub,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Health Food Store,Hotel,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Juice Bar,Latin American Restaurant,Light Rail Station,Liquor Store,Lounge,Middle Eastern Restaurant,Movie Theater,New American Restaurant,Park,Pet Store,Pizza Place,Pub,Recording Studio,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Spa,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,Business reply mail Processing Centre969 Eastern,0.058824,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.058824,0.0,0.058824,0.058824,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0
1,"India Bazaar,The Beaches West",0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.05,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.1,0.05,0.0,0.05,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.05,0.0
2,"Riverdale,The Danforth West",0.02381,0.02381,0.0,0.02381,0.0,0.0,0.0,0.047619,0.0,0.02381,0.02381,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.02381,0.0,0.02381,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.238095,0.02381,0.0,0.0,0.02381,0.0,0.071429,0.02381,0.047619,0.02381,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.0,0.0,0.02381
3,Studio District,0.025,0.05,0.0,0.05,0.025,0.025,0.0,0.025,0.025,0.025,0.0,0.0,0.0,0.1,0.0,0.025,0.025,0.025,0.075,0.025,0.0,0.025,0.0,0.025,0.0,0.025,0.0,0.0,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.025,0.0,0.0,0.025,0.0,0.05,0.025,0.025,0.0,0.0,0.0,0.025,0.0,0.025,0.025,0.0,0.0,0.0,0.0,0.0,0.025,0.025,0.0,0.0,0.025,0.0,0.0,0.0
4,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Cluster Neighborhoods


In [91]:
# set number of clusters
kclusters = 5

East_Toronto_grouped_clustering = East_Toronto_venues_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(East_Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 4, 2, 1, 0], dtype=int32)

In [95]:
East_Toronto_merged = East_Toronto

# add clustering labels
East_Toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
East_Toronto_merged = East_Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbour')

East_Toronto_merged.head() # check the last columns!

Unnamed: 0,PostCode,Borough,Neighbour,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Coffee Shop,Pub,Trail,Coworking Space,Comfort Food Restaurant
1,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188,4,Greek Restaurant,Ice Cream Shop,Coffee Shop,Italian Restaurant,Bookstore
2,M4L,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572,2,Park,Pet Store,Hotel,Ice Cream Shop,Italian Restaurant
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,Gastropub,Bakery,Italian Restaurant
4,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558,0,Light Rail Station,Garden,Pizza Place,Auto Workshop,Brewery


## Visualize Clusters

In [96]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(East_Toronto_merged['Latitude'], East_Toronto_merged['Longitude'], East_Toronto_merged['Neighbour'], East_Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters