In [176]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
import requests
import json

## Scraping the web link and inserting data into dataframe using Beautifulsoup package

In [16]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')

l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=["Postcode", "Borough", "Neighborhood"])

In [21]:
dfc = df.drop(0, axis=0).copy(deep=True) ## deleting first row as it was blank

In [22]:
dfc.head() ## final data frame

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n


In [31]:
## there is a newline character at the end of every neighborhood, removing that below

dfc['Neighborhood'] = dfc['Neighborhood'].apply(lambda x : x.rstrip())

In [64]:
dfc.head()

Unnamed: 0,Postcode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


In [39]:
## removing rows where Borough is not assigned
dfc = dfc[dfc['Borough']!='Not assigned']

## More than one neighborhood can exist in one postal code area. Making a list of Postal Areas with more than one neighborhood

In [63]:
index_list = dfc['Postcode'].value_counts().index.to_list()
postal_codes = []
for index in index_list:
    if df['Postcode'].value_counts()[index] > 1:
        postal_codes.append(index)
        
print(postal_codes)

['M8Y', 'M9V', 'M5V', 'M4V', 'M8Z', 'M9B', 'M9R', 'M1V', 'M6M', 'M9C', 'M2J', 'M1C', 'M8X', 'M1L', 'M1M', 'M5J', 'M3H', 'M8V', 'M5H', 'M1E', 'M1K', 'M5R', 'M6L', 'M5T', 'M6K', 'M1T', 'M1P', 'M3K', 'M4B', 'M5K', 'M5B', 'M6J', 'M6S', 'M4T', 'M9M', 'M4X', 'M6P', 'M2L', 'M5P', 'M1R', 'M5L', 'M6A', 'M6H', 'M6N', 'M3J', 'M1N', 'M6R', 'M3C', 'M8W', 'M2M', 'M5X', 'M5S', 'M1B', 'M5M', 'M4K', 'M4L']


In [72]:
dfc[dfc['Postcode']=='M8Y']['Neighborhood'].index[0]

266

## combining neighborhoods in one row.

In [82]:
neighborhoods_delete = []
for pc in postal_codes:
    for neighborhood in dfc[dfc['Postcode']==pc]['Neighborhood'].to_list():
        neighborhoods_delete.extend(dfc[dfc['Postcode']==pc]['Neighborhood'].to_list()[1:])
        dfc.loc[dfc[dfc['Postcode']==pc]['Neighborhood'].index[0], 'Neighborhood'] =dfc.loc[dfc[dfc['Postcode']==pc]['Neighborhood'].index[0], 'Neighborhood']+','+neighborhood 

In [83]:
neighborhoods_delete

["King's Mill Park",
 'Kingsway Park South East',
 'Mimico NE',
 'Old Mill South',
 'The Queensway East',
 'Royal York South East',
 'Sunnylea',
 "King's Mill Park",
 'Kingsway Park South East',
 'Mimico NE',
 'Old Mill South',
 'The Queensway East',
 'Royal York South East',
 'Sunnylea',
 "King's Mill Park",
 'Kingsway Park South East',
 'Mimico NE',
 'Old Mill South',
 'The Queensway East',
 'Royal York South East',
 'Sunnylea',
 "King's Mill Park",
 'Kingsway Park South East',
 'Mimico NE',
 'Old Mill South',
 'The Queensway East',
 'Royal York South East',
 'Sunnylea',
 "King's Mill Park",
 'Kingsway Park South East',
 'Mimico NE',
 'Old Mill South',
 'The Queensway East',
 'Royal York South East',
 'Sunnylea',
 "King's Mill Park",
 'Kingsway Park South East',
 'Mimico NE',
 'Old Mill South',
 'The Queensway East',
 'Royal York South East',
 'Sunnylea',
 "King's Mill Park",
 'Kingsway Park South East',
 'Mimico NE',
 'Old Mill South',
 'The Queensway East',
 'Royal York South East'

In [85]:
dfc[dfc['Postcode']=='M9V']

Unnamed: 0,Postcode,Borough,Neighborhood
228,M9V,Etobicoke,"Albion Gardens,Albion Gardens,Beaumond Heights..."
229,M9V,Etobicoke,Beaumond Heights
230,M9V,Etobicoke,Humbergate
231,M9V,Etobicoke,Jamestown
232,M9V,Etobicoke,Mount Olive
233,M9V,Etobicoke,Silverstone
234,M9V,Etobicoke,South Steeles
235,M9V,Etobicoke,Thistletown


In [95]:
dfcc = dfc.copy(deep=True)

## deleting rows with neighborhoods which are already combined into single row.

In [96]:
indexes = []
for n in neighborhoods_delete:
    dfc.drop(dfc[dfc['Neighborhood']==n].index, inplace=True)

In [105]:
dfc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 0 to 101
Data columns (total 3 columns):
Postcode        102 non-null object
Borough         102 non-null object
Neighborhood    102 non-null object
dtypes: object(3)
memory usage: 3.2+ KB


In [103]:
dfc.index = np.arange(0, len(dfc)) ## rearranging index to start from 0

In [106]:
dfc.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [110]:
dfc[dfc['Neighborhood']=='not assigned'] ## checking if any neighborhood is not assigned.

Unnamed: 0,Postcode,Borough,Neighborhood


## reading the coordinates csv and merging with the previous dataframe

In [111]:
df_coord = pd.read_csv('D://Machine Learning and AI//Courses//IBM Data Science Professional//Geospatial_Coordinates.csv')

In [112]:
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [113]:
df_coord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    103 non-null object
Latitude       103 non-null float64
Longitude      103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [114]:
dfc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 0 to 101
Data columns (total 3 columns):
Postcode        102 non-null object
Borough         102 non-null object
Neighborhood    102 non-null object
dtypes: object(3)
memory usage: 3.2+ KB


In [115]:
set1 = set(df_coord['Postal Code'].to_list())

In [117]:
set2 = set(dfc['Postcode'].to_list())

In [119]:
set1.difference(set2) ## there is row count difference between the two dataframes, so trying to find out which postal code is not present in the first dataframe

{'M5C'}

In [121]:
dfc[dfc['Postcode']=='M5C']  ## M5C postal code is not present in neighborhood df   

Unnamed: 0,Postcode,Borough,Neighborhood


In [124]:
df_coord.drop(df_coord[df_coord['Postal Code']=='M5C'].index, inplace=True) ## dropping row where postal code is M5C

In [125]:
df_coord.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    102 non-null object
Latitude       102 non-null float64
Longitude      102 non-null float64
dtypes: float64(2), object(1)
memory usage: 3.2+ KB


In [126]:
df_coord.sort_values(by='Postal Code', inplace=True)

In [128]:
dfc.sort_values(by='Postcode', inplace=True)

In [129]:
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [130]:
dfc.head()

Unnamed: 0,Postcode,Borough,Neighborhood
6,M1B,Scarborough,"Rouge,Rouge,Malvern"
12,M1C,Scarborough,"Highland Creek,Highland Creek,Rouge Hill,Port ..."
17,M1E,Scarborough,"Guildwood,Guildwood,Morningside,West Hill"
21,M1G,Scarborough,Woburn
25,M1H,Scarborough,Cedarbrae


In [133]:
df_coord.rename(columns = {'Postal Code':'Postcode'}, inplace = True) ## renaming col so that the two data frames can be merged on it

In [134]:
df_coord.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## merging coordinates with neighborhoods dataframe

In [137]:
dfc = pd.merge(dfc, df_coord, on='Postcode')

In [138]:
dfc.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Highland Creek,Rouge Hill,Port ...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Use geopy library to get the latitude and longitude values of New York City

In [142]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Create a map of New York with neighborhoods superimposed on top.

In [144]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(dfc['Latitude'], dfc['Longitude'], dfc['Borough'], dfc['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

However, for illustration purposes, let's simplify the above map and segment and cluster only the neighborhoods in Scarborough. So let's slice the original dataframe and create a new dataframe of the Scarborough data.

In [145]:
scarborough_data = dfc[dfc['Borough'] == 'Scarborough'].reset_index(drop=True)
scarborough_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Highland Creek,Rouge Hill,Port ...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Let's get the geographical coordinates of Scarborough.

In [150]:
address = 'Scarborough'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Scarborough are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough are 54.2820009, -0.4011868.


let's visualize Manhattan the Scarborough in it.

In [152]:
map_scarborough  = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(scarborough_data['Latitude'], scarborough_data['Longitude'], scarborough_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_scarborough)  
    
map_scarborough

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

#### Define Foursquare Credentials and Version

In [160]:
CLIENT_ID = '***' # your Foursquare ID
CLIENT_SECRET = '***' # your Foursquare Secret
VERSION = '20200313' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1VGZ4VCLGXD0NSQ2UMXS11CCUJE5RRMMCWCJOEH2QV3F5Z5I
CLIENT_SECRET:E3CBEJJHPTA5XEGGEL4TNYB4D2UJCKKQUAEOHVY3SPK1DTE5


#### Let's explore one of the neighborhoods in our dataframe.

In [161]:
scarborough_data.loc[3, 'Neighborhood']

'Woburn'

In [162]:
neighborhood_latitude = scarborough_data.loc[3, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = scarborough_data.loc[3, 'Longitude'] # neighborhood longitude value

neighborhood_name = scarborough_data.loc[3, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Woburn are 43.7709921, -79.21691740000001.


#### Now, let's get the top 50 venues that are in Marble Hill within a radius of 100 meters.

In [173]:
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    500, 
    50)

In [174]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e6a8dada2e538001bc1c43a'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7754921045, 'lng': -79.21069729639068},
   'sw': {'lat': 43.7664920955, 'lng': -79.22313750360935}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4cc1d28c06c254815ac18547',
       'name': 'Starbucks',
       'location': {'address': '300 Borough Dr',
        'crossStreet': 'Scarborough Town Centre',
        'lat': 43.770037201625215,
        'lng': -79.22115586641958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.770037201625215,
          'lng': -79.22115586641958}],
        'distance': 356,
        'cc': 'CA',
        '

In [175]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [180]:
venues = results['response']['groups'][0]['items'] ## info is in the items key
    
nearby_venues = json_normalize(venues) # flatten JSON

# # filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Starbucks,Coffee Shop,43.770037,-79.221156
1,Tim Hortons,Coffee Shop,43.770827,-79.223078
2,Korean Grill House,Korean Restaurant,43.770812,-79.214502


## . Explore Neighborhoods in Scarborough

In [190]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            500, 
            10)
            
        # make the GET request
        resultsc = requests.get(url).json()
        results = resultsc["response"]["groups"][0]["items"]
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


scarborough_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Highland Creek,Rouge Hill,Port ...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [198]:

# scarborough_venues = getNearbyVenues(names=scarborough_data.loc[3:4,['Neighborhood']],
#                                    latitudes=scarborough_data.loc[3:4,['Latitude']],
#                                    longitudes=scarborough_data.loc[3:4,['Longitude']]
#                                   )

scarborough_venues = getNearbyVenues(names=['Woburn', 'Cedarbrae'],
                                   latitudes=[43.770992,43.773136 ],
                                   longitudes=[-79.216917, -79.239476]
                                  )

#  scarborough_data.loc[3:5,['Longitude']]

Woburn
Cedarbrae


In [201]:
scarborough_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Woburn,43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop
1,Woburn,43.770992,-79.216917,Tim Hortons,43.770827,-79.223078,Coffee Shop
2,Woburn,43.770992,-79.216917,Korean Grill House,43.770812,-79.214502,Korean Restaurant
3,Cedarbrae,43.773136,-79.239476,Federick Restaurant,43.774697,-79.241142,Hakka Restaurant
4,Cedarbrae,43.773136,-79.239476,Drupati's Roti & Doubles,43.775222,-79.241678,Caribbean Restaurant
5,Cedarbrae,43.773136,-79.239476,Thai One On,43.774468,-79.241268,Thai Restaurant
6,Cedarbrae,43.773136,-79.239476,Centennial Recreation Centre,43.774593,-79.2365,Athletics & Sports
7,Cedarbrae,43.773136,-79.239476,TD Canada Trust,43.77483,-79.241251,Bank
8,Cedarbrae,43.773136,-79.239476,Petro-Canada,43.774106,-79.243097,Gas Station
9,Cedarbrae,43.773136,-79.239476,B&A Bakery,43.774391,-79.243877,Bakery


In [202]:
scarborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cedarbrae,9,9,9,9,9,9
Woburn,3,3,3,3,3,3


#### Let's find out how many unique categories can be curated from all the returned venues

In [203]:
print('There are {} uniques categories.'.format(len(scarborough_venues['Venue Category'].unique())))

There are 11 uniques categories.


##  Analyze Each Neighborhood

In [204]:
# one hot encoding
scarborough_onehot = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
scarborough_onehot['Neighborhood'] = scarborough_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [scarborough_onehot.columns[-1]] + list(scarborough_onehot.columns[:-1])
scarborough_onehot = scarborough_onehot[fixed_columns]

scarborough_onehot.head()

Unnamed: 0,Neighborhood,Athletics & Sports,Bakery,Bank,Caribbean Restaurant,Coffee Shop,Fried Chicken Joint,Gas Station,Hakka Restaurant,Korean Restaurant,Lounge,Thai Restaurant
0,Woburn,0,0,0,0,1,0,0,0,0,0,0
1,Woburn,0,0,0,0,1,0,0,0,0,0,0
2,Woburn,0,0,0,0,0,0,0,0,1,0,0
3,Cedarbrae,0,0,0,0,0,0,0,1,0,0,0
4,Cedarbrae,0,0,0,1,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [205]:
scarborough_grouped = scarborough_onehot.groupby('Neighborhood').mean().reset_index()
scarborough_grouped

Unnamed: 0,Neighborhood,Athletics & Sports,Bakery,Bank,Caribbean Restaurant,Coffee Shop,Fried Chicken Joint,Gas Station,Hakka Restaurant,Korean Restaurant,Lounge,Thai Restaurant
0,Cedarbrae,0.111111,0.111111,0.111111,0.111111,0.0,0.111111,0.111111,0.111111,0.0,0.111111,0.111111
1,Woburn,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.333333,0.0,0.0


#### Let's print each neighborhood along with the top 5 most common venues

In [206]:
num_top_venues = 5

for hood in scarborough_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = scarborough_grouped[scarborough_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Cedarbrae----
                  venue  freq
0    Athletics & Sports  0.11
1                Bakery  0.11
2                  Bank  0.11
3  Caribbean Restaurant  0.11
4   Fried Chicken Joint  0.11


----Woburn----
                venue  freq
0         Coffee Shop  0.67
1   Korean Restaurant  0.33
2  Athletics & Sports  0.00
3              Bakery  0.00
4                Bank  0.00




First, let's write a function to sort the venues in descending order.

In [207]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [208]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarborough_grouped['Neighborhood']

for ind in np.arange(scarborough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarborough_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Cedarbrae,Thai Restaurant,Lounge,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery,Athletics & Sports,Korean Restaurant
1,Woburn,Coffee Shop,Korean Restaurant,Thai Restaurant,Lounge,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery


## Cluster Neighborhoods

In [211]:
# set number of clusters
kclusters = 2

scarborough_grouped_clustering = scarborough_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarborough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [212]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

scarborough_merged = scarborough_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
scarborough_merged = scarborough_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

scarborough_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge,Rouge,Malvern",43.806686,-79.194353,,,,,,,,,,,
1,M1C,Scarborough,"Highland Creek,Highland Creek,Rouge Hill,Port ...",43.784535,-79.160497,,,,,,,,,,,
2,M1E,Scarborough,"Guildwood,Guildwood,Morningside,West Hill",43.763573,-79.188711,,,,,,,,,,,
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1.0,Coffee Shop,Korean Restaurant,Thai Restaurant,Lounge,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Thai Restaurant,Lounge,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery,Athletics & Sports,Korean Restaurant


Finally, let's visualize the resulting clusters

In [217]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scarborough_merged['Latitude'], scarborough_merged['Longitude'], scarborough_merged['Neighborhood'], scarborough_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[1],
        fill=True,
        fill_color=rainbow[1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

##  Examine Clusters

In [219]:
## cluster 1
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == 0, scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Scarborough,0.0,Thai Restaurant,Lounge,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery,Athletics & Sports,Korean Restaurant


In [220]:
## cluster 2
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == 1, scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Scarborough,1.0,Coffee Shop,Korean Restaurant,Thai Restaurant,Lounge,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery


In [221]:
## cluster 3
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == 2, scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
