### Import Standard libraries

In [184]:
import pandas as pd
import numpy as np

## Method 1 using Beautiful Soup

In [185]:
# Import Beautiful Soup and requests library
import requests
from bs4 import BeautifulSoup

# Download the "List of postal codes of Canada: M" Wikipedia page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
# NOTE: despite its name, wiki_url holds the page's HTML text, not a URL.
wiki_url = wiki_response.text
soup = BeautifulSoup(wiki_url,'html.parser')

In [186]:
# Locate the postal-code table and fail with a clear message if it is missing,
# rather than with an AttributeError on None.
postal_table = soup.find('table', {'class': 'wikitable sortable'})
if postal_table is None:
    raise ValueError("No 'wikitable sortable' table found on the page; its layout may have changed.")

# One list of stripped cell texts (header and data cells alike) per table row.
postal_codes = [
    [cell.text.strip() for cell in table_row.find_all(['th', 'td'])]
    for table_row in postal_table.find_all('tr')
]

# The first row holds the column headers; the remaining rows are the data.
df_postal_codes = pd.DataFrame(postal_codes[1:], columns=postal_codes[0])
df_postal_codes.shape

(180, 3)

## Method 2 using wikipedia Python library

In [187]:
#Import Wikipedia library
import wikipedia as wp

# Fetch the rendered page through the wikipedia library and keep it as UTF-8 bytes.
wiki_page = wp.page("List_of_postal_codes_of_Canada:_M")
html = wiki_page.html().encode("UTF-8")

# read_html returns every table it finds on the page; the first one is used here.
# NOTE: this replaces the df_postal_codes built by Method 1.
wiki_tables = pd.read_html(html)
df_postal_codes = wiki_tables[0]
df_postal_codes.shape

(180, 3)

### convert all cells in the dataframe to upper case

In [188]:
def _uppercase_column(column):
    """Return the given column with every string value upper-cased."""
    return column.str.upper()


# Apply the helper column by column; every column is expected to hold strings.
df_postal_codes = df_postal_codes.apply(_uppercase_column)
df_postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,NOT ASSIGNED,NOT ASSIGNED
1,M2A,NOT ASSIGNED,NOT ASSIGNED
2,M3A,NORTH YORK,PARKWOODS
3,M4A,NORTH YORK,VICTORIA VILLAGE
4,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT"


In [189]:
# Count how many postal codes fall under each borough value (including 'NOT ASSIGNED').
df_postal_codes['Borough'].value_counts()

NOT ASSIGNED        77
NORTH YORK          24
DOWNTOWN TORONTO    19
SCARBOROUGH         17
ETOBICOKE           12
CENTRAL TORONTO      9
WEST TORONTO         6
EAST TORONTO         5
EAST YORK            5
YORK                 5
MISSISSAUGA          1
Name: Borough, dtype: int64

### Clean up the Borough and Neighborhood columns as per the instructions
___Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.___


___If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.___

In [190]:
# Keep only postal codes with an assigned borough. The explicit copy makes the
# result an independent frame, so the assignment below cannot hit a view of the
# original (which is what triggers SettingWithCopyWarning).
assigned_codes = df_postal_codes[df_postal_codes['Borough'] != 'NOT ASSIGNED'].copy()

# Where the neighborhood is unassigned, fall back to the borough name.
# No borough check is needed here: unassigned boroughs were just filtered out.
missing_neighborhood = assigned_codes['Neighborhood'] == 'NOT ASSIGNED'
assigned_codes.loc[missing_neighborhood, 'Neighborhood'] = assigned_codes.loc[missing_neighborhood, 'Borough']

df_postal_codes = assigned_codes
df_postal_codes.shape

(103, 3)

In [191]:
# Summary statistics (count / unique / top / freq) for each column.
df_postal_codes.describe()

Unnamed: 0,Postal Code,Borough,Neighborhood
count,103,103,103
unique,103,10,99
top,M4G,NORTH YORK,DOWNSVIEW
freq,1,24,4


In [192]:
# (rows, columns) of the cleaned frame.
df_postal_codes.shape

(103, 3)

## We will be using pgeocode for Geo Location translation

In [193]:
import pgeocode
# Set up a pgeocode Nominatim lookup object for Canada (country code 'ca').
# It is used below to turn postal codes into latitude/longitude.
nomi = pgeocode.Nominatim('ca')

### Look up every postal code in the dataframe with pgeocode and add its Latitude and Longitude

In [194]:
# Query pgeocode once for all postal codes instead of once per row.
# Given a list of codes, query_postal_code returns a DataFrame with one row per
# input code, in input order; codes it cannot resolve come back with NaN
# coordinates (handled by the null check further down).
geo_codes = nomi.query_postal_code(df_postal_codes['Postal Code'].tolist())

# .to_numpy() makes the assignment positional: geo_codes carries its own index,
# which does not line up with the filtered, non-contiguous index of
# df_postal_codes. assign() returns a new frame, so no slice is mutated in place.
df_postal_codes = df_postal_codes.assign(
    Latitude=geo_codes['latitude'].to_numpy(),
    Longitude=geo_codes['longitude'].to_numpy(),
)
df_postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,NORTH YORK,PARKWOODS,43.7545,-79.33
3,M4A,NORTH YORK,VICTORIA VILLAGE,43.7276,-79.3148
4,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626
5,M6A,NORTH YORK,"LAWRENCE MANOR, LAWRENCE HEIGHTS",43.7223,-79.4504
6,M7A,DOWNTOWN TORONTO,"QUEEN'S PARK, ONTARIO PROVINCIAL GOVERNMENT",43.6641,-79.3889


### Check for any probable nulls

In [195]:
# Show every row that has a missing value in at least one column.
df_postal_codes.loc[df_postal_codes.isna().any(axis='columns')]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
114,M7R,MISSISSAUGA,CANADA POST GATEWAY PROCESSING CENTRE,,


### Since there are nulls, we will use the Geospatial file to update the null values with coordinates.

In [196]:
# Load the fallback coordinates file from the working directory.
# The cells below read its 'Postal Code', 'Latitude' and 'Longitude' columns.
geo_data = pd.read_csv('Geospatial_Coordinates.csv')

In [197]:
# Fall back to the Geospatial file for coordinates that are still missing.
# Keeping the first row per postal code mirrors "take the first match" and
# guarantees the unique index that Series.map needs for the lookups below.
fallback_coords = geo_data.drop_duplicates('Postal Code').set_index('Postal Code')
fallback_latitude = df_postal_codes['Postal Code'].map(fallback_coords['Latitude'])
fallback_longitude = df_postal_codes['Postal Code'].map(fallback_coords['Longitude'])

# fillna only touches missing values. A postal code that is absent from the
# file stays null (and shows up in the null check that follows) instead of
# raising IndexError.
df_postal_codes = df_postal_codes.assign(
    Latitude=df_postal_codes['Latitude'].fillna(fallback_latitude),
    Longitude=df_postal_codes['Longitude'].fillna(fallback_longitude),
)

In [198]:
# Re-run the null check; an empty result means every row now has coordinates.
df_postal_codes.loc[df_postal_codes.isna().any(axis='columns')]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude


In [199]:
# Preview the first 20 rows, including the Latitude and Longitude columns.
df_postal_codes.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,NORTH YORK,PARKWOODS,43.7545,-79.33
3,M4A,NORTH YORK,VICTORIA VILLAGE,43.7276,-79.3148
4,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626
5,M6A,NORTH YORK,"LAWRENCE MANOR, LAWRENCE HEIGHTS",43.7223,-79.4504
6,M7A,DOWNTOWN TORONTO,"QUEEN'S PARK, ONTARIO PROVINCIAL GOVERNMENT",43.6641,-79.3889
8,M9A,ETOBICOKE,"ISLINGTON AVENUE, HUMBER VALLEY VILLAGE",43.6662,-79.5282
9,M1B,SCARBOROUGH,"MALVERN, ROUGE",43.8113,-79.193
11,M3B,NORTH YORK,DON MILLS,43.745,-79.359
12,M4B,EAST YORK,"PARKVIEW HILL, WOODBINE GARDENS",43.7063,-79.3094
13,M5B,DOWNTOWN TORONTO,"GARDEN DISTRICT, RYERSON",43.6572,-79.3783


In [200]:
# (rows, columns) after adding the Latitude and Longitude columns.
df_postal_codes.shape

(103, 5)

### Let's start using the Foursquare API to explore the neighborhoods

In [201]:
import credentials  # separate Python file holding my Foursquare credentials
import datetime

# Foursquare credentials come from the local `credentials` module so that they
# never appear in the notebook itself.
CLIENT_ID, CLIENT_SECRET, ACCESS_TOKEN = (
    credentials.CLIENT_ID,
    credentials.CLIENT_SECRET,
    credentials.ACCESS_TOKEN,
)

# Value sent as the `v` query parameter: today's date formatted as YYYYMMDD.
VERSION = format(datetime.date.today(), '%Y%m%d')

# Value sent as the `limit` query parameter of each request.
LIMIT = 500

### Filter the dataframe to include only Toronto boroughs

In [202]:
# Keep the boroughs whose name contains 'TORONTO' and renumber the rows from 0.
toronto_mask = df_postal_codes['Borough'].str.contains('TORONTO')
df_toronto = df_postal_codes.loc[toronto_mask].reset_index(drop=True)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626
1,M7A,DOWNTOWN TORONTO,"QUEEN'S PARK, ONTARIO PROVINCIAL GOVERNMENT",43.6641,-79.3889
2,M5B,DOWNTOWN TORONTO,"GARDEN DISTRICT, RYERSON",43.6572,-79.3783
3,M5C,DOWNTOWN TORONTO,ST. JAMES TOWN,43.6513,-79.3756
4,M4E,EAST TORONTO,THE BEACHES,43.6784,-79.2941


In [203]:
# (rows, columns) of the Toronto-only frame.
df_toronto.shape

(39, 5)

### Call the Foursquare API for each coordinate pair in the df_toronto dataframe and collect the venue details

In [204]:
FOURSQUARE_EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
radius = 500  # sent as the `radius` query parameter of each request

toronto_venues = []

for _, row in df_toronto.iterrows():
    lat = row['Latitude']
    long = row['Longitude']

    # Pass the query as `params` so requests handles the URL encoding and the
    # credentials are not spliced into a hand-built URL string.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }
    # The timeout prevents an indefinite hang; raise_for_status() surfaces HTTP
    # errors here instead of as a confusing KeyError on the JSON below.
    response = requests.get(FOURSQUARE_EXPLORE_URL, params=params, timeout=30)
    response.raise_for_status()

    # A response without any groups is treated as "no venues for this location".
    groups = response.json().get('response', {}).get('groups', [])
    results = groups[0]['items'] if groups else []

    for result in results:
        venue = result['venue']
        # A venue with an empty category list is recorded with category None
        # rather than aborting the whole run with an IndexError.
        categories = venue.get('categories') or []
        category_name = categories[0]['name'] if categories else None
        toronto_venues.append([
            row['Postal Code'],
            row['Borough'],
            row['Neighborhood'],
            lat,
            long,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            category_name])

In [205]:
# Column names, in the same order as the values appended for each venue.
venue_columns = ['Postal Code','Borough','Neighborhood','Latitude','Longitude',
                 'Venue',
                 'Venue Latitude',
                 'Venue Longitude',
                 'Venue Category']

# Passing the names to the constructor also works when no venues were collected:
# it yields an empty frame with these columns, whereas assigning `.columns`
# afterwards raises a length-mismatch ValueError on an empty frame.
df_toronto_venues = pd.DataFrame(toronto_venues, columns=venue_columns)


In [206]:
# (rows, columns) of the venue frame; each row is one venue returned for a postal code.
df_toronto_venues.shape

(1529, 9)

In [207]:
# Preview the first venue rows.
df_toronto_venues.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626,Roselle Desserts,43.653447,-79.362017,Bakery
2,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626,The Yoga Lounge,43.655515,-79.364955,Yoga Studio
4,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626,Body Blitz Spa East,43.654735,-79.359874,Spa


In [208]:
# Number of distinct venue categories (missing values, if any, count as one).
df_toronto_venues['Venue Category'].nunique(dropna=False)

211

###  Apply one hot encoding to flatten the venue categories.

In [209]:
# One indicator column per venue category, named after the category itself.
category_dummies = pd.get_dummies(df_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the postal code and borough back in front of the indicator columns so
# each encoded row can still be traced to its area.
key_columns = ['Postal Code', 'Borough']
df_toronto_venues_onehot = pd.concat([df_toronto_venues[key_columns], category_dummies], axis=1)
fixed_columns = df_toronto_venues_onehot.columns.tolist()

df_toronto_venues_onehot.head()

Unnamed: 0,Postal Code,Borough,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M5A,DOWNTOWN TORONTO,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,DOWNTOWN TORONTO,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,DOWNTOWN TORONTO,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,DOWNTOWN TORONTO,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,M5A,DOWNTOWN TORONTO,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [210]:
# (rows, columns): one row per venue, two key columns plus one column per category.
df_toronto_venues_onehot.shape

(1529, 213)

### Let us group by Postal Codes and Boroughs by taking the mean of the frequency of occurrence of each category

In [211]:
# Average the indicator columns within each (postal code, borough) pair: each
# value becomes the share of that area's venues belonging to the category.
# as_index=False keeps the two keys as ordinary columns.
area_groups = df_toronto_venues_onehot.groupby(['Postal Code','Borough'], as_index=False)
df_toronto_venues_grouped = area_groups.mean()
df_toronto_venues_grouped.head()

Unnamed: 0,Postal Code,Borough,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4E,EAST TORONTO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,EAST TORONTO,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029412
2,M4L,EAST TORONTO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,EAST TORONTO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,CENTRAL TORONTO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [212]:
# (rows, columns): one row per (postal code, borough) group.
df_toronto_venues_grouped.shape

(39, 213)

### Let us find out the top 5 venue categories for each postal code

In [213]:
num_top_venues = 5

# Print the most frequent venue categories for every (postal code, borough) row.
for _, row in df_toronto_venues_grouped.iterrows():
    # Label-based access: positional row[0] / row[1] on a labelled Series is
    # deprecated in recent pandas.
    print("----"+ row['Postal Code'] +'---'+ row['Borough'] +"----")

    # `row` already holds this area's frequencies, so the frame does not need
    # to be filtered again. Drop the two key fields, leaving category -> frequency.
    temp = (
        row.drop(['Postal Code', 'Borough'])
        .astype(float)
        .round(2)
        .rename_axis('venue')
        .reset_index(name='freq')
    )
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M4E---EAST TORONTO----
               venue  freq
0       Neighborhood  0.14
1  Health Food Store  0.14
2                Pub  0.14
3        Cheese Shop  0.14
4             Bakery  0.14


----M4K---EAST TORONTO----
                venue  freq
0    Greek Restaurant  0.21
1      Ice Cream Shop  0.06
2  Italian Restaurant  0.06
3                Café  0.06
4          Restaurant  0.06


----M4L---EAST TORONTO----
              venue  freq
0              Park  0.15
1    Sandwich Place  0.10
2     Movie Theater  0.05
3      Liquor Store  0.05
4  Sushi Restaurant  0.05


----M4M---EAST TORONTO----
             venue  freq
0             Park  0.12
1      Coffee Shop  0.12
2  Coworking Space  0.12
3            Diner  0.12
4   Baseball Field  0.12


----M4N---CENTRAL TORONTO----
                       venue  freq
0         Photography Studio   0.5
1                       Park   0.5
2          Accessories Store   0.0
3               Neighborhood   0.0
4  Middle Eastern Restaurant   0.0


----M4

## Clustering of Neighborhoods

### Setup KMeans Model

In [218]:
from sklearn.cluster import KMeans
clusters = 10

# Cluster on the category-frequency columns only; the two key columns are labels, not features.
df_toronto_venues_clustering = df_toronto_venues_grouped.drop(['Postal Code','Borough'],axis=1)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit makes the result depend on the
# installed version. random_state fixes the centroid initialisation.
model_kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=0).fit(df_toronto_venues_clustering)

print()
# labels_ has one entry per clustered row, so its length is the number of
# labelled rows, not the number of clusters.
print('total labels :', len(model_kmeans.labels_))
print('unique clusters :', set(model_kmeans.labels_))
model_kmeans.labels_


total clusters : 39
unique clusters : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


array([0, 0, 3, 0, 5, 7, 6, 0, 3, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
       1, 2, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 9, 0, 0, 0, 0])

### Add the cluster labels to the dataframe

In [219]:
# Build the label column on the grouped frame's index so rows line up one-to-one,
# then place it in front of the existing columns in a new frame.
cluster_label_column = pd.Series(
    model_kmeans.labels_,
    index=df_toronto_venues_grouped.index,
    name='Cluster Label',
)
df_toronto_merged = pd.concat([cluster_label_column, df_toronto_venues_grouped], axis=1)
df_toronto_merged.head()

Unnamed: 0,Cluster Label,Postal Code,Borough,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,M4E,EAST TORONTO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,M4K,EAST TORONTO,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,...,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029412
2,3,M4L,EAST TORONTO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,M4M,EAST TORONTO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,M4N,CENTRAL TORONTO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Merge the clustered dataframe to the parent dataframe for Toronto for visualization

In [220]:
# Attach each postal code's cluster label to the Toronto frame. The left merge
# keeps every df_toronto row, in its original order.
cluster_lookup = df_toronto_merged[['Postal Code', 'Cluster Label']]
df_toronto_clustered = df_toronto.merge(cluster_lookup, on='Postal Code', how='left')
df_toronto_clustered.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Label
0,M5A,DOWNTOWN TORONTO,"REGENT PARK, HARBOURFRONT",43.6555,-79.3626,0
1,M7A,DOWNTOWN TORONTO,"QUEEN'S PARK, ONTARIO PROVINCIAL GOVERNMENT",43.6641,-79.3889,0
2,M5B,DOWNTOWN TORONTO,"GARDEN DISTRICT, RYERSON",43.6572,-79.3783,0
3,M5C,DOWNTOWN TORONTO,ST. JAMES TOWN,43.6513,-79.3756,0
4,M4E,EAST TORONTO,THE BEACHES,43.6784,-79.2941,0


## Use Folium to create map and visualize clusters

In [225]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors


# Only rows that received a cluster label can be coloured; a postal code with
# no venues would carry NaN after the left join and is skipped here.
plotted_areas = df_toronto_clustered.dropna(subset=['Cluster Label'])

# Centre the map on the markers themselves. The previous hard-coded longitude
# (-79.658) lies well west of the plotted points, whose longitudes are roughly
# -79.29 to -79.39, which pushed the markers to the edge of the initial view.
toronto_lat = plotted_areas['Latitude'].mean()
toronto_long = plotted_areas['Longitude'].mean()
toronto_map = folium.Map(location=[toronto_lat, toronto_long], zoom_start=11)

# One hex colour per cluster label, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per postal code, filled with its cluster's colour.
for lat, lon, poi, cluster in zip(plotted_areas['Latitude'], plotted_areas['Longitude'], plotted_areas['Neighborhood'], plotted_areas['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        # Labels run 0..clusters-1, so index directly; `cluster-1` only worked
        # because index -1 wraps around to the last colour.
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(toronto_map)

toronto_map