Import libraries

In [5]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

Retrieve the page from Wikipedia and use BeutifulSoup to parse it

In [6]:
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(wikipedia_link)
soup = BeautifulSoup(r.text, 'html.parser')

Prepare empty data frame with the following columns: 'PostalCode','Borough', 'Neighborhood'

In [3]:
columns = ['PostalCode','Borough', 'Neighborhood']
df = pd.DataFrame(columns = columns)

Check if the empty dataframe is correct

In [4]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood


Iterate over the "wikitable" table, parse values: postal code, borough, neighborhood and store them in the dataframe. 
The following condition should be considered:
<ul>
<li>Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
<li>More than one neighborhood can exist in one postal code area. These two rows will be combined into one row with the neighborhoods separated with a comma 
<li>If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
</ul>

In [7]:
table = soup.find("table", { "class" : "wikitable" })
index = 0
for row in table.findAll("tr"):    
    cells = row.findAll("td")
    if len(cells) == 3:
        postcode = cells[0].find(text=True).strip()
        borough = cells[1].find(text=True).strip()
        neighborhood = cells[2].find(text=True).strip()
           
        if borough != 'Not assigned': # Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned   
            if neighborhood == 'Not assigned':
                neighborhood = borough # If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
            row = df.loc[df['PostalCode'] == postcode]
            if row.empty:             
                df.loc[index] = [postcode, borough, neighborhood]
                index = index + 1
            else:
                # More than one neighborhood can exist in one postal code area. These two rows will be combined into one row with the neighborhoods separated with a comma 
                df.loc[df['PostalCode'] == postcode, 'Neighborhood'] = df.loc[df['PostalCode'] == postcode, 'Neighborhood'] + ", " + neighborhood

Check the dataframe

In [8]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [9]:
df.shape

(103, 3)

Load the geographical coordinates of each postal code

In [10]:
geodata = pd.read_csv("https://cocl.us/Geospatial_data")
print("Data read into dataframe!") 

Data read into dataframe!


Fix the column name

In [11]:
geodata_column_names = geodata.columns.values
geodata_column_names[0] = "PostalCode"
geodata.columns = geodata_column_names

Merge data frames and check the result

In [12]:
df_merged = pd.merge(df, geodata)
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [13]:
df_merged.shape

(103, 5)

In [14]:
CLIENT_ID = 'BYYI2T0OESVKZWMHAALNG1HRUKK1GP4IA1XSDTXHX0TBUWBJ' # your Foursquare ID
CLIENT_SECRET = 'WXAHOZU4KN2MY4SK34SY4RO31AWEN32LIO41CFYBK5WRADNS' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius = 500
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BYYI2T0OESVKZWMHAALNG1HRUKK1GP4IA1XSDTXHX0TBUWBJ
CLIENT_SECRET:WXAHOZU4KN2MY4SK34SY4RO31AWEN32LIO41CFYBK5WRADNS


Copy "getNearbyVenues" function from new york analysis

In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):        
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Retrieve toronto venues

In [18]:
toronto_venues = getNearbyVenues(names=df_merged['Neighborhood'],
                                   latitudes=df_merged['Latitude'],
                                   longitudes=df_merged['Longitude']
                                  )


Parkwoods
Victoria Village
Harbourfront, Regent Park
Lawrence Heights, Lawrence Manor
Queen's Park
Islington Avenue
Rouge, Malvern
Don Mills North
Woodbine Gardens, Parkview Hill
Ryerson, Garden District
Glencairn
Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
Highland Creek, Rouge Hill, Port Union
Flemingdon Park, Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights
Thorncliffe Park
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
East Birchmount Park, Ionview, Kennedy Park
Bayview Village
CFB Toronto, Downsview East
The D

### Check data

In [19]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [20]:
toronto_venues.shape

(2233, 7)

### Checking how many venues were returned for each neighborhood

In [21]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",2,2,2,2,2,2
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",11,11,11,11,11,11
"Alderwood, Long Branch",10,10,10,10,10,10
"Bathurst Manor, Downsview North, Wilson Heights",16,16,16,16,16,16
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",26,26,26,26,26,26
Berczy Park,55,55,55,55,55,55
"Birch Cliff, Cliffside West",4,4,4,4,4,4


#### Repeat analysis done for New York

In [22]:
#one hot encoding
df_toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
df_toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [23]:
toronto_grouped = df_toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.shape

(101, 279)

#### Printing each neighborhood along with the top 5 most common venues

In [24]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2      Thai Restaurant  0.04
3           Steakhouse  0.04
4  American Restaurant  0.04


----Agincourt----
            venue  freq
0  Clothing Store  0.25
1    Skating Rink  0.25
2          Lounge  0.25
3  Breakfast Spot  0.25
4   Movie Theater  0.00


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                       venue  freq
0                 Playground   0.5
1                       Park   0.5
2  Middle Eastern Restaurant   0.0
3              Movie Theater   0.0
4                      Motel   0.0


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                  venue  freq
0         Grocery Store  0.18
1   Fried Chicken Joint  0.09
2              Pharmacy  0.09
3           Pizza Place  0.09
4  Fast Food Restaurant  0.09


----Alderwood, Long Branch----
                ven

### Run *k*-means to cluster the neighborhood into 5 clusters.

In [25]:
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

print(kmeans.labels_)
toronto_grouped['Cluster Labels'] = kmeans.labels_ # add 'Cluster Labels' to the dataframe

df_merged_cluster = pd.merge(df_merged, toronto_grouped) # merge dataframes

[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3
 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 0
 0 1 0 0 0 0 1 4 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]


### Generating map to visualize  neighborhoods and how they cluster together.

In [26]:
# create map
map_clusters = folium.Map(location=[df_merged_cluster.loc[0, 'Latitude'], df_merged_cluster.loc[0, 'Longitude']], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged_cluster['Latitude'], df_merged_cluster['Longitude'], df_merged_cluster['Neighborhood'], df_merged_cluster['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters