# Segmenting and Clustering Neighborhoods in Toronto Part 1

In [1]:
import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup

# Getting the text data from the Wiki url

In [2]:

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data = requests.get(url).text

In [3]:
soup = BeautifulSoup(data,"html5lib")

# Extracting Required Data from the table and cleaning it

In [4]:
tables=soup.find('table')
len(tables)

2

In [5]:
table_contents=[]
for row in tables.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern'
                                              :'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto'
                                              :'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [6]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [7]:
tables.findAll('td')

[<td style="width:11%; vertical-align:top; color:#ccc;">
 <p><b>M1A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
 </p>
 </td>,
 <td style="width:11%; vertical-align:top; color:#ccc;">
 <p><b>M2A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
 </p>
 </td>,
 <td style="width:11%; vertical-align:top;">
 <p><b>M3A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>
 </p>
 </td>,
 <td style="width:11%; vertical-align:top;">
 <p><b>M4A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>
 </p>
 </td>,
 <td style="width:11%; vertical-align:top;">
 <p><b>M5A</b><br/><span style="font-size:85%;"><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a><br/>(<a href="/wiki/Regent_Park" titl

In [8]:
df.shape

(103, 3)

# WEEK 3 PART 2

In [9]:
import folium
# conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

In [10]:
df["Latitude"] = ""
df["Longitude"] = ""
df.shape

(103, 5)

In [11]:
# Cleaning the neighborhood with multiple values ( Selecting the first one)
df["Neighborhood"] = df["Neighborhood"].str.split(",", n = 1, expand = True) 
df["Neighborhood"] = df["Neighborhood"].str.split("-", n = 1, expand = True) 
df["Neighborhood"].head(5)

0                        Parkwoods
1                 Victoria Village
2                      Regent Park
3                   Lawrence Manor
4    Ontario Provincial Government
Name: Neighborhood, dtype: object

In [12]:
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,Regent Park,,
3,M6A,North York,Lawrence Manor,,
4,M7A,Queen's Park,Ontario Provincial Government,,
5,M9A,Etobicoke,Islington Avenue,,
6,M1B,Scarborough,Malvern,,
7,M3B,North York,Don Mills North,,
8,M4B,East York,Parkview Hill,,
9,M5B,Downtown Toronto,Garden District,,


In [13]:
df1 = df.loc[0:25]
df2 = df.loc[26:50]
df3 = df.loc[51:75]
df4 = df.loc[76:102]

# To get coordinates and populate the df

In [14]:
# Need to drop those Neighborhood that the geocode does not find
to_drop_unknown = []
geolocator = Nominatim(user_agent="ny_explorer")
for index, row in df1.iterrows():
    address = row['Neighborhood'] + ', Toronto'
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
        df1.loc[index, 'Latitude'] = latitude
        df1.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot do: {}, will drop index: {}'.format(address, index))
        to_drop_unknown.append(index)

The geograpical coordinate of Parkwoods, Toronto are 43.7611243, -79.3240594.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


The geograpical coordinate of Victoria Village, Toronto are 43.732658, -79.3111892.
The geograpical coordinate of Regent Park, Toronto are 43.6607056, -79.3604569.
The geograpical coordinate of Lawrence Manor, Toronto are 43.7220788, -79.4375067.
Cannot do: Ontario Provincial Government, Toronto, will drop index: 4
The geograpical coordinate of Islington Avenue, Toronto are 43.7161378, -79.5555531.
The geograpical coordinate of Malvern, Toronto are 43.8091955, -79.2217008.
The geograpical coordinate of Don Mills North, Toronto are 43.775347, -79.3459439.
The geograpical coordinate of Parkview Hill, Toronto are 43.7062977, -79.3219073.
The geograpical coordinate of Garden District, Toronto are 43.6564995, -79.3771141.
The geograpical coordinate of Glencairn, Toronto are 43.7087117, -79.4406853.
The geograpical coordinate of West Deane Park, Toronto are 43.6631995, -79.5685684.
The geograpical coordinate of Rouge Hill, Toronto are 43.7802711, -79.1304992.
The geograpical coordinate of Do

In [15]:
geolocator = Nominatim(user_agent="ny_explorer2")
for index, row in df2.iterrows():
    address = row['Neighborhood'] + ', Toronto'
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
        df2.loc[index, 'Latitude'] = latitude
        df2.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot do: {}, will drop index: {}'.format(address, index))
        to_drop_unknown.append(index)

The geograpical coordinate of Cedarbrae, Toronto are 43.75646655, -79.22669244258802.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


The geograpical coordinate of Hillcrest Village, Toronto are 43.6816953, -79.4257118.
The geograpical coordinate of Bathurst Manor, Toronto are 43.76389295, -79.45636693710946.
The geograpical coordinate of Thorncliffe Park, Toronto are 43.704553, -79.3454074.
The geograpical coordinate of Richmond, Toronto are 43.6485875, -79.3913729.
The geograpical coordinate of Dufferin, Toronto are 43.6602019, -79.4357191.
The geograpical coordinate of Scarborough Village, Toronto are 43.7437422, -79.2116324.
The geograpical coordinate of Fairview, Toronto are 43.777758500000004, -79.34429375180316.
The geograpical coordinate of Northwood Park, Toronto are 43.7541351, -79.50448.
The geograpical coordinate of The Danforth  East, Toronto are 43.6881713, -79.3018156.
The geograpical coordinate of Harbourfront East, Toronto are 43.6400801, -79.3801495.
The geograpical coordinate of Little Portugal, Toronto are 43.64741325, -79.43111632546047.
The geograpical coordinate of Kennedy Park, Toronto are 43.

In [16]:
geolocator = Nominatim(user_agent="ny_explorer3")
for index, row in df3.iterrows():
    address = row['Neighborhood'] + ', Toronto'
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
        df3.loc[index, 'Latitude'] = latitude
        df3.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot do: {}, will drop index: {}'.format(address, index))
        to_drop_unknown.append(index)

The geograpical coordinate of Cliffside, Toronto are 43.7111699, -79.2481769.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


The geograpical coordinate of Willowdale, Toronto are 43.7615095, -79.4109234.
The geograpical coordinate of Downsview Central, Toronto are 43.7492988, -79.462248.
The geograpical coordinate of Studio District, Toronto are 43.6430587, -79.4026004.
The geograpical coordinate of Bedford Park, Toronto are 43.7373876, -79.4109253.
The geograpical coordinate of Del Ray, Toronto are 43.6890238, -79.493985.
The geograpical coordinate of Humberlea, Toronto are 43.7213166, -79.5331605.
The geograpical coordinate of Birch Cliff, Toronto are 43.6918051, -79.2644935.
The geograpical coordinate of Willowdale South, Toronto are 43.7615095, -79.4109234.
The geograpical coordinate of Downsview Northwest, Toronto are 43.7492988, -79.462248.
The geograpical coordinate of Lawrence Park, Toronto are 43.729199, -79.4032525.
The geograpical coordinate of Roselawn, Toronto are 43.7059981, -79.42123520744894.
The geograpical coordinate of Runnymede, Toronto are 43.6517026, -79.4759978.
The geograpical coordin

In [17]:
geolocator = Nominatim(user_agent="ny_explorer4")
for index, row in df4.iterrows():
    address = row['Neighborhood'] + ', Toronto'
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
        df4.loc[index, 'Latitude'] = latitude
        df4.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot do: {}, will drop index: {}'.format(address, index))
        to_drop_unknown.append(index)

Cannot do: Enclave of L4W, Toronto, will drop index: 76
The geograpical coordinate of Kingsview Village, Toronto are 43.6995391, -79.5563459.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


The geograpical coordinate of Agincourt, Toronto are 43.7853531, -79.2785494.
The geograpical coordinate of Davisville, Toronto are 43.697936, -79.3972908.
The geograpical coordinate of University of Toronto, Toronto are 43.663461999999996, -79.39775965337452.
The geograpical coordinate of Runnymede, Toronto are 43.6517026, -79.4759978.
The geograpical coordinate of Clarks Corners, Toronto are 43.7964095, -79.2977951.
The geograpical coordinate of Moore Park, Toronto are 43.6903876, -79.3832965.
The geograpical coordinate of Kensington Market, Toronto are 43.6552136, -79.4022604.
The geograpical coordinate of Milliken, Toronto are 43.8231743, -79.3017626.
The geograpical coordinate of Summerhill West, Toronto are 43.6816776, -79.3905037.
The geograpical coordinate of CN Tower, Toronto are 43.6425637, -79.38708718320467.
The geograpical coordinate of New Toronto, Toronto are 43.6007625, -79.505264.
The geograpical coordinate of South Steeles, Toronto are 43.8161778, -79.3145378.
The geo

In [30]:
to_drop_unknown

[4, 76, 100]

In [18]:
print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)

(26, 5)
(25, 5)
(25, 5)
(27, 5)


In [19]:
df = df1.append(df2, ignore_index = True)
df = df.append(df3, ignore_index = True)
df = df.append(df4, ignore_index = True)

In [20]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7611,-79.3241
1,M4A,North York,Victoria Village,43.7327,-79.3112
2,M5A,Downtown Toronto,Regent Park,43.6607,-79.3605
3,M6A,North York,Lawrence Manor,43.7221,-79.4375
4,M7A,Queen's Park,Ontario Provincial Government,,


# Cleaning the dataframe

In [22]:
import  numpy as np
clean_df = df.drop(to_drop_unknown)
clean_df['Latitude'].replace('', np.nan, inplace=True)
clean_df.dropna(subset=['Latitude'], inplace=True)
clean_df.shape

(100, 5)

In [23]:
clean_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7611,-79.3241
1,M4A,North York,Victoria Village,43.7327,-79.3112
2,M5A,Downtown Toronto,Regent Park,43.6607,-79.3605
3,M6A,North York,Lawrence Manor,43.7221,-79.4375
5,M9A,Etobicoke,Islington Avenue,43.7161,-79.5556
6,M1B,Scarborough,Malvern,43.8092,-79.2217
7,M3B,North York,Don Mills North,43.7753,-79.3459
8,M4B,East York,Parkview Hill,43.7063,-79.3219
9,M5B,Downtown Toronto,Garden District,43.6565,-79.3771
10,M6B,North York,Glencairn,43.7087,-79.4407


# Mapping the arear of Neighbourhood of Toronto

In [24]:
address = 'Toronto'
try:
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
    df.loc[index, 'Latitude'] = latitude
    df.loc[index, 'Longitude'] = longitude
except AttributeError:
    print('Cannot do: {}, will drop index: {}'.format(address, index))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [25]:
my_map = folium.Map(location=[latitude, longitude], zoom_start= 11)
my_map

In [28]:
n = 91  # Unable to show Marker in Map if Marker > 91
clean_df1 = clean_df.tail(n)
clean_df1.shape

(91, 5)

In [29]:
import folium
my_map = folium.Map(location=[latitude, longitude], zoom_start=11)
# add markers to map
for lat, lng, label in zip(clean_df1['Latitude'], clean_df1['Longitude'], clean_df1['Neighborhood']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(my_map)  
my_map

# PART 3

# Filter the boroughs that contain word Toronto

In [31]:
clean_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7611,-79.3241
1,M4A,North York,Victoria Village,43.7327,-79.3112
2,M5A,Downtown Toronto,Regent Park,43.6607,-79.3605
3,M6A,North York,Lawrence Manor,43.7221,-79.4375
5,M9A,Etobicoke,Islington Avenue,43.7161,-79.5556
...,...,...,...,...,...
97,M5X,Downtown Toronto,First Canadian Place,43.6488,-79.3817
98,M8X,Etobicoke,The Kingsway,43.6474,-79.5113
99,M4Y,Downtown Toronto,Church and Wellesley,43.6709,-79.3728
101,M8Y,Etobicoke,Old Mill South,43.6498,-79.4943


In [37]:
Toronto_df=clean_df[clean_df['Borough'].str.contains("Toronto")].reset_index(drop=True)

In [38]:
Toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park,43.6607,-79.3605
1,M5B,Downtown Toronto,Garden District,43.6565,-79.3771
2,M5C,Downtown Toronto,St. James Town,43.6694,-79.3727
3,M4E,East Toronto,The Beaches,43.671,-79.2967
4,M5E,Downtown Toronto,Berczy Park,43.648,-79.3754
5,M5G,Downtown Toronto,Central Bay Street,43.6561,-79.3847
6,M6G,Downtown Toronto,Christie,43.6641,-79.4184
7,M5H,Downtown Toronto,Richmond,43.6486,-79.3914
8,M6H,West Toronto,Dufferin,43.6602,-79.4357
9,M4J,East York/East Toronto,The Danforth East,43.6882,-79.3018


In [34]:
Toronto_df.drop_duplicates(subset ="PostalCode",keep = "first", inplace = True)
Toronto_df.reset_index(drop=True) 
print(Toronto_df.shape)

(38, 5)


In [39]:
# define Foursquare Credentials and Version
CLIENT_ID = 'XKWBA0LW2HDJDQNZZJ01ME1MQA4IUFKX0WHKQL2D0N3LSDQB' # your Foursquare ID
CLIENT_SECRET = 'W1EXMCPMF14NQR2CI2MSU1MQCSPXH0SMDC42JOLGCPONDWQY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
toronto_df_new = Toronto_df.copy()

Your credentails:
CLIENT_ID: XKWBA0LW2HDJDQNZZJ01ME1MQA4IUFKX0WHKQL2D0N3LSDQB
CLIENT_SECRET:W1EXMCPMF14NQR2CI2MSU1MQCSPXH0SMDC42JOLGCPONDWQY


In [46]:
toronto_df_new

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park,43.6607,-79.3605
1,M5B,Downtown Toronto,Garden District,43.6565,-79.3771
2,M5C,Downtown Toronto,St. James Town,43.6694,-79.3727
3,M4E,East Toronto,The Beaches,43.671,-79.2967
4,M5E,Downtown Toronto,Berczy Park,43.648,-79.3754
5,M5G,Downtown Toronto,Central Bay Street,43.6561,-79.3847
6,M6G,Downtown Toronto,Christie,43.6641,-79.4184
7,M5H,Downtown Toronto,Richmond,43.6486,-79.3914
8,M6H,West Toronto,Dufferin,43.6602,-79.4357
9,M4J,East York/East Toronto,The Danforth East,43.6882,-79.3018


# TOP 100 VENUES THAT ARE IN THE RADIUS OF 500M

In [40]:
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['PostalCode'], toronto_df_new['Borough'], toronto_df_new['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,        CLIENT_SECRET,        VERSION,        lat,        long,        radius,         LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post,             borough,            neighborhood,            lat,             long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [44]:
venues_df=pd.DataFrame(venues)

In [45]:
# define the column names
venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(1810, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,Regent Park Aquatic Centre,43.6606,-79.361392,Pool
1,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,Sumach Espresso,43.658135,-79.359515,Coffee Shop
2,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,Daniels Spectrum,43.660137,-79.361808,Performing Arts Venue
3,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,Thai To Go,43.663418,-79.36071,Thai Restaurant
4,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,Paintbox Bistro,43.66005,-79.362855,Restaurant


# How many values were returned for each postal code

In [53]:
venues_df.groupby("PostalCode").count().head()

Unnamed: 0_level_0,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,48,48,48,48,48,48,48,48
M4J,31,31,31,31,31,31,31,31
M4K,32,32,32,32,32,32,32,32
M4L,38,38,38,38,38,38,38,38
M4M,100,100,100,100,100,100,100,100


In [57]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 243 uniques categories.


In [55]:
venues_df['VenueCategory'].unique()[:10]

array(['Pool', 'Coffee Shop', 'Performing Arts Venue', 'Thai Restaurant',
       'Restaurant', 'Sushi Restaurant', 'Animal Shelter', 'Pub', 'Park',
       'Food Truck'], dtype=object)

# Analyse each Area

In [67]:
# one hot encoding
toronto_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")#prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
toronto_onehot['PostalCode'] = venues_df['PostalCode'] 
toronto_onehot['Borough'] = venues_df['Borough'] 
toronto_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_onehot.columns[-3:]) + list(toronto_onehot.columns[:-3])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(1810, 246)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Accessories Store,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Art Gallery,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M5A,Downtown Toronto,Regent Park,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,Downtown Toronto,Regent Park,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,Downtown Toronto,Regent Park,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,Downtown Toronto,Regent Park,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,Downtown Toronto,Regent Park,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [68]:
toronto_grouped = toronto_onehot.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped.head()

(38, 246)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Accessories Store,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Art Gallery,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4J,East York/East Toronto,The Danforth East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4K,East Toronto,The Danforth West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4L,East Toronto,India Bazaar,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,...,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4M,East Toronto,Studio District,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.02


# TOP 10 venues for each postalcode

In [69]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_grouped['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_grouped['Neighborhoods']

for ind in np.arange(toronto_grouped.shape[0]):
    row_categories = toronto_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

# neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

(38, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,Beach,Japanese Restaurant,Bar,Nail Salon,Pub,Pizza Place,Thai Restaurant,Breakfast Spot,Park,Skating Rink
1,M4J,East York/East Toronto,The Danforth East,Grocery Store,Pharmacy,Skating Rink,Coffee Shop,Bus Line,Middle Eastern Restaurant,Light Rail Station,Fish & Chips Shop,Metro Station,Mexican Restaurant
2,M4K,East Toronto,The Danforth West,Grocery Store,Coffee Shop,Bus Line,Pharmacy,Sandwich Place,Fish & Chips Shop,Doctor's Office,Sushi Restaurant,Fried Chicken Joint,Fast Food Restaurant
3,M4L,East Toronto,India Bazaar,Indian Restaurant,Grocery Store,Restaurant,Café,Halal Restaurant,Shopping Plaza,Burger Joint,Bus Stop,Sandwich Place,Egyptian Restaurant
4,M4M,East Toronto,Studio District,Coffee Shop,Italian Restaurant,Gym / Fitness Center,Furniture / Home Store,Spa,Asian Restaurant,Bakery,Bar,Café,Speakeasy


Unnamed: 0,PostalCode,Borough,Neighborhoods,Accessories Store,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Art Gallery,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4J,East York/East Toronto,The Danforth East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4K,East Toronto,The Danforth West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4L,East Toronto,India Bazaar,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,...,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4M,East Toronto,Studio District,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.02
5,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4P,Central Toronto,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M4R,Central Toronto,North Toronto West,0.0,0.0,0.02,0.0,0.0,0.0,0.02,...,0.0,0.06,0.0,0.0,0.04,0.0,0.02,0.0,0.0,0.02
8,M4S,Central Toronto,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4T,Central Toronto,Moore Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# CLUSTER NEIGHBORHOODS

In [75]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

Toronto_grouped_clustering = toronto_grouped.drop(["PostalCode", "Borough", "Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 4], dtype=int32)

In [76]:
#checking shape of dataframe
print(toronto_grouped.shape)
print(neighborhoods_venues_sorted.shape)
print(toronto_df_new.shape)
print(kmeans.labels_.size)

(38, 246)
(38, 13)
(38, 5)
38


In [77]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
toronto_merged = toronto_df_new.copy()
# add clustering labels
toronto_merged["Cluster Labels"] = kmeans.labels_
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.drop(["Borough", "Neighborhoods"], 1).set_index("PostalCode"), on="PostalCode")
toronto_merged.rename(columns = {'BoroughLatitude':'Latitude', 'BoroughLongitude':'Longitude'}, inplace = True) 
print(toronto_merged.shape)
toronto_merged.head() # check the last columns!

(38, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Regent Park,43.6607,-79.3605,0,Coffee Shop,Restaurant,Thai Restaurant,Park,Auto Dealership,Fast Food Restaurant,Food Truck,Beer Store,Sushi Restaurant,Pub
1,M5B,Downtown Toronto,Garden District,43.6565,-79.3771,0,Clothing Store,Coffee Shop,Hotel,Cosmetics Shop,Lingerie Store,Café,Electronics Store,Japanese Restaurant,Sandwich Place,Restaurant
2,M5C,Downtown Toronto,St. James Town,43.6694,-79.3727,0,Coffee Shop,Pizza Place,Café,Grocery Store,Breakfast Spot,Bakery,Food & Drink Shop,Caribbean Restaurant,Market,Bistro
3,M4E,East Toronto,The Beaches,43.671,-79.2967,0,Beach,Japanese Restaurant,Bar,Nail Salon,Pub,Pizza Place,Thai Restaurant,Breakfast Spot,Park,Skating Rink
4,M5E,Downtown Toronto,Berczy Park,43.648,-79.3754,1,Coffee Shop,Café,Restaurant,Seafood Restaurant,Japanese Restaurant,Hotel,Cocktail Bar,Bakery,Italian Restaurant,Breakfast Spot


In [78]:
# sort the results by Cluster Labels
print(toronto_merged.shape)
toronto_merged.sort_values(["Cluster Labels"], inplace=True)
toronto_merged.head()

(38, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Regent Park,43.6607,-79.3605,0,Coffee Shop,Restaurant,Thai Restaurant,Park,Auto Dealership,Fast Food Restaurant,Food Truck,Beer Store,Sushi Restaurant,Pub
25,M6R,West Toronto,Parkdale,43.6405,-79.4369,0,Tibetan Restaurant,Restaurant,Indian Restaurant,Diner,Pharmacy,Pizza Place,Bakery,French Restaurant,Boutique,Brewery
32,M5V,Downtown Toronto,CN Tower,43.6426,-79.3871,0,Hotel,Coffee Shop,Pizza Place,Gym,Scenic Lookout,Aquarium,Sandwich Place,Baseball Stadium,Bar,Bistro
34,M5W,Downtown Toronto Stn A,Enclave of M5E,43.7453,-79.492,0,Grocery Store,Pharmacy,Cosmetics Shop,Spa,Sandwich Place,Coffee Shop,Fast Food Restaurant,Vietnamese Restaurant,Pizza Place,Ice Cream Shop
29,M4T,Central Toronto,Moore Park,43.6904,-79.3833,0,Summer Camp,Gym,Convenience Store,Playground,Trail,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store


# Plotting the final neighborhood of TORONTO

In [79]:
#n = 33  # Unable to show Marker in Map if Marker > 10
#clean_df1 = toronto_merged.tail(n)
#clean_df1.shape

In [80]:
map_clusters  = folium.Map(location=[latitude, longitude], zoom_start=12)
# set color scheme for the clusters
x = np.arange(kclusters)
rainbow = [    'red',    'blue',    'orange',    'darkgreen',    'darkblue',    'black']
# add markers to map
markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters