Obtain the list of postal codes of Canada and transform to a pandas dataframe

Installing packages:

In [1]:
try:
    print("Installing BeautifulSoup4...\n")
    !conda install -c conda-forge beautifulsoup4 --yes
    print("BeautifulSoup4 has been successfully installed!\n")
except:
    print("ERROR: could not install BeautifulSoup4!\n")
try:
    print("Installing lxml...\n")
    !conda install -c conda-forge lxml --yes
    print("lxml has been successfully installed!\n")
except:
    print("ERROR: could not install lxml!\n")
try:
    print("Installing ProgressBar...\n")
    !conda install -c conda-forge ProgressBar2 --yes
    print("ProgressBar has been successfully installed!\n")
except:
    print("ERROR: could not install ProgressBar!\n")
try:
    print("Installing GeoPy...\n")
    !conda install -c conda-forge geopy --yes
    print("GeoPy has been successfully installed!\n")
except:
    print("ERROR: could not install GeoPy!\n")
try:
    print("Installing Folium...\n")
    !conda install -c conda-forge folium=0.5.0 --yes
    print("Folium has been successfully installed!\n")
except:
    print("ERROR: could not install Folium!\n")
try:
    print("Installing HTML5LIB...\n")
    !conda install -c conda-forge html5lib --yes
    print("HTML5LIB has been successfully installed!\n")
except:
    print("ERROR: could not install HTML5LIB!\n")

Installing BeautifulSoup4...

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    beautifulsoup4-4.8.1       |           py36_0         149 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following packages will be UPDATED:

    beautifulsoup4:  4.7.1-py36_1      --> 4.8.1-py36_0          conda-forge
    ca-certificates: 2019.11.27-0      --> 2019.11.28-hecc5488_0 conda-forge
    certifi:         2019.11.28-py36_0 -->

Importing libraries:

In [2]:
try:
    print("Importing libraries...\n")
    from progressbar import ProgressBar
    from bs4 import BeautifulSoup as bts # library for web scraping
    import numpy as np # library to handle data in a vectorized manner
    import pandas as pd # library for data analysis
    from pandas.io.json import json_normalize
    import matplotlib.cm as cm
    import matplotlib.colors as colors
    import requests # library to handle requests
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
    import matplotlib as mp # library for visualization
    from sklearn.cluster import KMeans # import k-means from clustering stage
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
    import folium # map rendering library
    import lxml
    import lxml.html as lh
    import re
    from time import sleep
    print("All libraries imported successfully!\n")
except:
    print("ERROR: Could not import all libraries!\n")

%matplotlib inline

Importing libraries...

All libraries imported successfully!



Obtaining the data from the wiki site:

In [3]:
try:
    print("Requesting source...")
    source  = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
    print("Source successfully requested!")
except:
    print("ERROR: could not request the source.\n")
try:
    print("Creating beautifulsoup object from source data...")
    soup = bts(source,'lxml')
    print("Successfully created beautifulsoup object from source data!")
except:
    print("ERROR: could not create beautifulsoup object from source data!")
try:
    print("Finding all tables in source code...")
    table = soup.findAll('table',class_='wikitable sortable')
    print("Tables were successfully found in source code!")
    print("Number of tables in site:",len(table))
except:
    print("ERROR: could not find any table in source code!")
table = table[0]
print("Getting the table text list")
table_text = table.tbody.text
table_text = re.sub("\[.*?\]", "",table_text)
table_list = table_text.split('\n')
del table_list[-1]
print("Table text list obtained successfully.")
print("Getting the table column headers")
table_columns = table_list[0:5]
table_columns[0] = 'NotRequired1'
table_columns[4] = 'NotRequired2'
print("Table column headers: ", table_columns[1:4])
print("Number of data in table: ",len(table_list[5:]))
if (len(table_list[5:]) % 5 == 0):
    table_data = np.array(table_list[5:]).reshape(len(table_list[5:]) // 5,5)
else:
    print("Number of table elements is incorrect!")
print("Preparing the required dataframe and table.")
dataframe =  pd.DataFrame(np.nan_to_num(table_data),columns = table_columns)
print("Table shape as per site data: ", dataframe.shape)
print(dataframe.head(5))
dataframe1 = dataframe.drop(['NotRequired1','NotRequired2'],axis=1)
dataframe1 = dataframe1.reset_index(drop=True)
print("Table shape after removing not required blank columns: ",dataframe1.shape)
print(dataframe1.head(5))

Requesting source...
Source successfully requested!
Creating beautifulsoup object from source data...
Successfully created beautifulsoup object from source data!
Finding all tables in source code...
Tables were successfully found in source code!
Number of tables in site: 1
Getting the table text list
Table text list obtained successfully.
Getting the table column headers
Table column headers:  ['Postcode', 'Borough', 'Neighbourhood']
Number of data in table:  1435
Preparing the required dataframe and table.
Table shape as per site data:  (287, 5)
  NotRequired1 Postcode           Borough     Neighbourhood NotRequired2
0                   M1A      Not assigned      Not assigned             
1                   M2A      Not assigned      Not assigned             
2                   M3A        North York         Parkwoods             
3                   M4A        North York  Victoria Village             
4                   M5A  Downtown Toronto      Harbourfront             
Table sha

Removing the Boroughs with 'Not assigned' values:

In [5]:
dataframe2 = dataframe1[dataframe1.Borough!="Not assigned"]
dataframe2 = dataframe2.reset_index(drop=True)
print("Table shape after removing Boroughs with Not assigned values: ",dataframe2.shape)
print(dataframe2.head(20))


Table shape after removing Boroughs with Not assigned values:  (210, 3)
   Postcode           Borough     Neighbourhood
0       M3A        North York         Parkwoods
1       M4A        North York  Victoria Village
2       M5A  Downtown Toronto      Harbourfront
3       M6A        North York  Lawrence Heights
4       M6A        North York    Lawrence Manor
5       M7A      Queen's Park      Not assigned
6       M9A  Downtown Toronto      Queen's Park
7       M1B       Scarborough             Rouge
8       M1B       Scarborough           Malvern
9       M3B        North York   Don Mills North
10      M4B         East York  Woodbine Gardens
11      M4B         East York     Parkview Hill
12      M5B  Downtown Toronto           Ryerson
13      M5B  Downtown Toronto   Garden District
14      M6B        North York         Glencairn
15      M9B         Etobicoke        Cloverdale
16      M9B         Etobicoke         Islington
17      M9B         Etobicoke      Martin Grove
18      M9B     

Updating Neighbourhoods with values as 'Not assigned' to the corresponding Borough value:

In [6]:
#print(dataframe2.loc[dataframe2['Neighbourhood'] == 'Not assigned','Neighbourhood'])
#print(dataframe2.loc[dataframe2['Neighbourhood'] == 'Not assigned','Borough'])
dataframe2.loc[dataframe2['Neighbourhood'] == 'Not assigned','Neighbourhood'] = dataframe2['Borough']
dataframe2.reset_index()
print("Shape of table after updating the neighbourhood values: ",dataframe2.shape)
print(dataframe2.head(20))

Shape of table after updating the neighbourhood values:  (210, 3)
   Postcode           Borough     Neighbourhood
0       M3A        North York         Parkwoods
1       M4A        North York  Victoria Village
2       M5A  Downtown Toronto      Harbourfront
3       M6A        North York  Lawrence Heights
4       M6A        North York    Lawrence Manor
5       M7A      Queen's Park      Queen's Park
6       M9A  Downtown Toronto      Queen's Park
7       M1B       Scarborough             Rouge
8       M1B       Scarborough           Malvern
9       M3B        North York   Don Mills North
10      M4B         East York  Woodbine Gardens
11      M4B         East York     Parkview Hill
12      M5B  Downtown Toronto           Ryerson
13      M5B  Downtown Toronto   Garden District
14      M6B        North York         Glencairn
15      M9B         Etobicoke        Cloverdale
16      M9B         Etobicoke         Islington
17      M9B         Etobicoke      Martin Grove
18      M9B         Et

In [7]:
dataframe3 = dataframe2.groupby(['Postcode', 'Borough']).agg(lambda x: ", ".join(sorted(set(x))))
dataframe3 = dataframe3.reset_index()
print("Table shape after grouping and joining neighborhoods in same postal code: ", dataframe3.shape)
print(dataframe3.head(30))
print("Table prepared successfully!")    


Table shape after grouping and joining neighborhoods in same postal code:  (103, 3)
   Postcode      Borough                                      Neighbourhood
0       M1B  Scarborough                                     Malvern, Rouge
1       M1C  Scarborough             Highland Creek, Port Union, Rouge Hill
2       M1E  Scarborough                  Guildwood, Morningside, West Hill
3       M1G  Scarborough                                             Woburn
4       M1H  Scarborough                                          Cedarbrae
5       M1J  Scarborough                                Scarborough Village
6       M1K  Scarborough        East Birchmount Park, Ionview, Kennedy Park
7       M1L  Scarborough                    Clairlea, Golden Mile, Oakridge
8       M1M  Scarborough    Cliffcrest, Cliffside, Scarborough Village West
9       M1N  Scarborough                        Birch Cliff, Cliffside West
10      M1P  Scarborough  Dorset Park, Scarborough Town Centre, Wexford ...
11  

In [8]:
print("Final number of rows in dataframe: ", dataframe3.shape)

Final number of rows in dataframe:  (103, 3)


Creating columns for the Latitude and Longitude for each Postal Code: 

In [9]:
dataframe3['Latitude'] = np.nan
dataframe3['Longitude'] = np.nan
dataframe3.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",,
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",,
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,,
4,M1H,Scarborough,Cedarbrae,,
5,M1J,Scarborough,Scarborough Village,,
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",,
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",,
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",,
9,M1N,Scarborough,"Birch Cliff, Cliffside West",,


Gathering the Latitude and Longitude coordinates for each postal code using geolocater: 

In [11]:
pbar = ProgressBar()
geolocator = Nominatim()
location = None
while(location is None):
    for index in pbar(range(0,dataframe3['Postcode'].shape[0])):
        address = dataframe3.loc[index,'Postcode'] + ", Toronto, Ontario"
        location = geolocator.geocode(address, timeout = None)
        if (location != None):
            dataframe3.loc[index,'Latitude'] = location.latitude
            dataframe3.loc[index,'Longitude'] = location.longitude
        sleep(1)

print(dataframe3.shape)
dataframe3.head()

  from ipykernel import kernelapp as app
100% (103 of 103) |######################| Elapsed Time: 0:02:38 Time:  0:02:38


(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.653963,-79.387207
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.653963,-79.387207
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,43.644903,-79.381836
4,M1H,Scarborough,Cedarbrae,,


Removing addresses that have no find location details: 
Final table with location details: 

In [12]:
dataframe4 = dataframe3.dropna()
dataframe4 = dataframe4.reset_index(drop=True)
print("Table shape after removing postcodes with no location values: ",dataframe4.shape)
dataframe4.head(25)

Table shape after removing postcodes with no location values:  (24, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.653963,-79.387207
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.653963,-79.387207
2,M1G,Scarborough,Woburn,43.644903,-79.381836
3,M1W,Scarborough,L'Amoreaux West,43.644903,-79.381836
4,M2J,North York,"Fairview, Henry Farm, Oriole",43.644903,-79.381836
5,M2M,North York,"Newtonbrook, Willowdale",43.785962,-79.416031
6,M2N,North York,Willowdale South,43.644903,-79.381836
7,M3A,North York,Parkwoods,43.653963,-79.387207
8,M3C,North York,"Don Mills South, Flemingdon Park",43.732822,-79.346961
9,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights",43.756199,-79.439802


Geographical coordinates for Toronto, Canada:

In [13]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geographical coordinates of Toronto are 43.653963, -79.387207.


Visualizing the Canada boroughs and neighbourhoods in a map centred around Toronto:

In [22]:
canada_map = folium.Map(location=[latitude, longitude], zoom_start=10) # generate map centred around Toronto

# add Toronto as a red circle mark
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    popup='Toronto',
    fill=True,
    color='red',
    fill_color='red',
    fill_opacity=0.6
    ).add_to(canada_map)

# add Canada Boroughs and Neighbourhoods to the map as blue circle markers
for lat, lng, borough, neighborhood in zip(dataframe4.Latitude, dataframe4.Longitude, dataframe4.Borough,dataframe4.Neighbourhood):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        color='blue',
        fill_color='blue',
        fill_opacity=0.6
        ).add_to(canada_map)

# display map
canada_map

Obtain table with boroughs with the word 'Toronto' in them:

In [24]:
dataframe_toronto = dataframe4[dataframe4.Borough.str.contains('Toronto')].reset_index(drop=True)
dataframe_toronto.head(25)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.668027,-79.369282
1,M5E,Downtown Toronto,Berczy Park,43.644903,-79.381836
2,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.649896,-79.382695
3,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.639259,-79.38284
4,M5V,Downtown Toronto,"Bathurst Quay, CN Tower, Harbourfront West, Is...",43.645634,-79.392989
5,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.637097,-79.434958
6,M6P,West Toronto,"High Park, The Junction South",43.662173,-79.463881
7,M6S,West Toronto,"Runnymede, Swansea",43.649248,-79.474631


Visualizing the neighbourhoods in Toronto boroughs in a map centred around Toronto:

In [26]:
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12) # generate map centred around Toronto

# add Toronto as a red circle mark
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    popup='Toronto',
    fill=True,
    color='red',
    fill_color='red',
    fill_opacity=0.6
    ).add_to(toronto_map)

# add Canada Boroughs and Neighbourhoods to the map as blue circle markers
for lat, lng, borough, neighborhood in zip(dataframe_toronto.Latitude, dataframe_toronto.Longitude, dataframe_toronto.Borough,dataframe_toronto.Neighbourhood):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        color='blue',
        fill_color='blue',
        fill_opacity=0.6
        ).add_to(toronto_map)
    
# display map
toronto_map

Foursquare Credentials:

In [34]:
CLIENT_ID = 'GXUX3TJAEFTX5W2EKJMMPW5OR05KPCTPXSPBSHNPD412KRXT' # your Foursquare ID
CLIENT_SECRET = 'CFXD5YOTRTRFEQCIZD2XWG1JOSXDD31OZMRGYV411TDTLULJ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

Exploring the second neighbourhood in the Toronto dataframe:

In [35]:
#Get the neighborhood's name
neighborhood_name = dataframe_toronto.loc[1, 'Neighbourhood']

In [36]:
#Get the neighborhood's latitude and longitude values
neighborhood_latitude = dataframe_toronto.loc[1, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = dataframe_toronto.loc[1, 'Longitude'] # neighborhood longitude value

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Berczy Park are 43.6449033, -79.3818364.


Obtaining the top 100 venues that are in Berczy Park within a radius of 500 meters:

In [37]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius
 # create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=GXUX3TJAEFTX5W2EKJMMPW5OR05KPCTPXSPBSHNPD412KRXT&client_secret=CFXD5YOTRTRFEQCIZD2XWG1JOSXDD31OZMRGYV411TDTLULJ&v=20180605&ll=43.6449033,-79.3818364&radius=500&limit=100'

In [38]:
#results from above URL
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5df9242d83525f001be45921'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Financial District',
  'headerFullLocation': 'Financial District, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 117,
  'suggestedBounds': {'ne': {'lat': 43.649403304500005,
    'lng': -79.37562936716475},
   'sw': {'lat': 43.6404032955, 'lng': -79.38804343283525}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '55561f8a498e2e5c866ac4c9',
       'name': 'Union Pearson Express',
       'location': {'address': '61 Front St. W',
        'lat': 43.64436200658875,
        'lng': -79.38319927907953,
        'labeledLatLngs': [{'label': 'display',
 

In [46]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [41]:
#Obtain data from json results
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(25)

Unnamed: 0,name,categories,lat,lng
0,Union Pearson Express,Train Station,43.644362,-79.383199
1,WVRST,Beer Bar,43.644968,-79.381376
2,The Fairmont Royal York,Hotel,43.645449,-79.381508
3,Pilot Coffee Roasters,Coffee Shop,43.645018,-79.380415
4,iQ Food Co,Salad Place,43.642851,-79.382081
5,Real Sports Apparel,Sporting Goods Shop,43.64286,-79.380184
6,Delta Hotels by Marriott Toronto,Hotel,43.642882,-79.383949
7,Maple Leaf Square,Plaza,43.642925,-79.380892
8,Balzac's Coffee,Coffee Shop,43.644373,-79.383065
9,Le Germain Hotel,Hotel,43.643125,-79.380918


In [42]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

100 venues were returned by Foursquare.


Exploring all neighbourhoods of Toronto:

In [50]:
#Function to return nearby venues for all neighbourhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print("Neighbourhood names: ") 
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [63]:
#Execute above function for all Toronto neighbourhoods
toronto_venues = getNearbyVenues(names=dataframe_toronto['Neighbourhood'],
                                   latitudes=dataframe_toronto['Latitude'],
                                   longitudes=dataframe_toronto['Longitude']
                                  )
toronto_venues.shape
print("Venues for all Toronto neighbourhoods: ")
toronto_venues.head(50)

Venues for all Toronto neighbourhoods: 


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Cabbagetown, St. James Town",43.668027,-79.369282,Cranberries,43.667843,-79.369407,Diner
1,"Cabbagetown, St. James Town",43.668027,-79.369282,F'Amelia,43.667536,-79.368613,Italian Restaurant
2,"Cabbagetown, St. James Town",43.668027,-79.369282,Kingyo Toronto,43.665895,-79.368415,Japanese Restaurant
3,"Cabbagetown, St. James Town",43.668027,-79.369282,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
4,"Cabbagetown, St. James Town",43.668027,-79.369282,Merryberry Cafe + Bistro,43.66663,-79.368792,Café
5,"Cabbagetown, St. James Town",43.668027,-79.369282,Murgatroid,43.667381,-79.369311,Restaurant
6,"Cabbagetown, St. James Town",43.668027,-79.369282,Cabbagetown Brew,43.666923,-79.369289,Café
7,"Cabbagetown, St. James Town",43.668027,-79.369282,Absolute Bakery & Café,43.667469,-79.369277,Bakery
8,"Cabbagetown, St. James Town",43.668027,-79.369282,Mr. Jerk,43.667328,-79.373389,Caribbean Restaurant
9,"Cabbagetown, St. James Town",43.668027,-79.369282,Fair Trade Jewellery Co.,43.665348,-79.368362,Jewelry Store


In [62]:
print("Count of venues for each Toronto neighbourhood: ")
toronto_venues.groupby('Neighborhood').count()

Count of venues for each Toronto neighbourhood: 


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
"Bathurst Quay, CN Tower, Harbourfront West, Island airport, King and Spadina, Railway Lands, South Niagara",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Brockton, Exhibition Place, Parkdale Village",37,37,37,37,37,37
"Cabbagetown, St. James Town",45,45,45,45,45,45
"Harbourfront East, Toronto Islands, Union Station",100,100,100,100,100,100
"High Park, The Junction South",28,28,28,28,28,28
"Runnymede, Swansea",39,39,39,39,39,39


In [70]:
print("Count of unique categories of Toronto venues:")
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

Count of unique categories of Toronto venues:
There are 149 uniques categories.


Analyzing Toronto Neighbourhoods:

In [92]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
toronto_onehot = toronto_onehot[ ['Neighborhood'] + [ col for col in toronto_onehot.columns if col != 'Neighborhood' ] ]

toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,...,Theater,Thrift / Vintage Store,Tibetan Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Wine Bar,Women's Store,Yoga Studio
0,"Cabbagetown, St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Cabbagetown, St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Cabbagetown, St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Cabbagetown, St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Cabbagetown, St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
print("Shape of table prior to grouping: ", toronto_onehot.shape)

#Group data and get mean of frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print("Shape of table after grouping: ", toronto_grouped.shape)

print("Data grouped for mean of frequency of occurrence of each category")
toronto_grouped.head(10)

Shape of table prior to grouping:  (549, 149)
Shape of table after grouping:  (8, 149)
Data grouped for mean of frequency of occurrence of each category


Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,...,Theater,Thrift / Vintage Store,Tibetan Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Wine Bar,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.03,0.0,0.0,0.01,0.0,0.03,0.03,0.0,0.03,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0
1,"Bathurst Quay, CN Tower, Harbourfront West, Is...",0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.01,...,0.02,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.01,0.02
2,Berczy Park,0.02,0.0,0.05,0.01,0.0,0.0,0.03,0.0,0.03,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
3,"Brockton, Exhibition Place, Parkdale Village",0.027027,0.0,0.0,0.027027,0.0,0.0,0.027027,0.027027,0.054054,...,0.0,0.027027,0.081081,0.027027,0.0,0.0,0.0,0.0,0.0,0.0
4,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.044444,0.022222,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Harbourfront East, Toronto Islands, Union Station",0.0,0.0,0.05,0.01,0.0,0.0,0.0,0.01,0.02,...,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0
6,"High Park, The Junction South",0.0,0.035714,0.0,0.0,0.035714,0.0,0.035714,0.0,0.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Runnymede, Swansea",0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.025641,0.0,...,0.0,0.0,0.0,0.025641,0.0,0.025641,0.0,0.0,0.0,0.0


Top five venues for each neighbourhood: 

In [97]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [124]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Hotel,Steakhouse,Bar
1,"Bathurst Quay, CN Tower, Harbourfront West, Is...",Coffee Shop,Hotel,Restaurant,Italian Restaurant,Café
2,Berczy Park,Coffee Shop,Café,Aquarium,Hotel,Restaurant
3,"Brockton, Exhibition Place, Parkdale Village",Tibetan Restaurant,Café,Restaurant,Diner,Pharmacy
4,"Cabbagetown, St. James Town",Coffee Shop,Pizza Place,Restaurant,Pub,Italian Restaurant


Clustering the neighbourhoods:

In [125]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

#create a new dataframe that includes the cluster as well as the top 5 venues for each neighborhood
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted.head()
toronto_merged = dataframe_toronto
toronto_merged.rename(columns = {'Neighbourhood':'Neighborhood'}, inplace=True)
toronto_merged.head()
# merge to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.shape
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.668027,-79.369282,0,Coffee Shop,Pizza Place,Restaurant,Pub,Italian Restaurant
1,M5E,Downtown Toronto,Berczy Park,43.644903,-79.381836,1,Coffee Shop,Café,Aquarium,Hotel,Restaurant
2,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.649896,-79.382695,3,Coffee Shop,Café,Hotel,Steakhouse,Bar
3,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.639259,-79.38284,1,Coffee Shop,Aquarium,Restaurant,Pizza Place,Café
4,M5V,Downtown Toronto,"Bathurst Quay, CN Tower, Harbourfront West, Is...",43.645634,-79.392989,3,Coffee Shop,Hotel,Restaurant,Italian Restaurant,Café


Visualizing the resulting clusters:

In [122]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Clusters close to Toronto fall into one of two types. Clusters further away from Toronto are different from the ones close to Toronto.