# PART 1 - Data Extraction  

## Import neccessary libraries

In [2]:
import numpy as np
import pandas as pd
import lxml
import lxml.etree

## Building Dataframe from webpage

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data = pd.read_html(url)
df = data[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Filtering Dataframe to remove 'Not Assigned' Boroughs

In [4]:
filter_df = df.loc[ df['Borough'] != 'Not assigned']
print(f"Not assigned Boroughs: {(filter_df.Borough == 'Not assigned').sum()}")

Not assigned Boroughs: 0


In [5]:
print(f"Not assigned Neighbourhoods: {(filter_df.Neighbourhood == 'Not assigned').sum()}")

Not assigned Neighbourhoods: 0


In [6]:
filter_df.reset_index(drop=True, inplace=True)
filter_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
print(f"Shape of filtered Dataframe: {filter_df.shape}")

Shape of filtered Dataframe: (103, 3)


# PART 2 - Geospatial data (Latitudes & Longitudes) Extraction

## Import neccessary libraries

In [8]:
#!pip install geocoder --quiet
import geocoder
from geopy.geocoders import Nominatim

## Using Postal codes for Latitudes & Longitudes

In [9]:
#define empty lists for geospatial values and None values
lat = []
long = []
nones = []

#initialize geolocator
geolocator = Nominatim(user_agent = 'toronto_zipcodes_M')

#loop through filter_df['Postal Code'] values and append to the respective lists
for code in filter_df['Postal Code']:
    location = geolocator.geocode(code)
    if location is None:
        nones.append(None)
        nones.append(None)
    else:
        latitude = location.latitude
        longitude = location.longitude
    lat.append(latitude)
    long.append(longitude)


In [10]:
print(f"Total non-None values: {len(lat)}, {len(long)}")
print(f"Total None values: {len(nones)}")

Total non-None values: 103, 103
Total None values: 112


## Adding Latitudes & Longitudes values in filter_df

In [11]:
filter_df['Latitude'] = lat
filter_df['Longitude'] = long
filter_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,49.484606,8.466197
1,M4A,North York,Victoria Village,49.48429,8.467
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",45.440588,28.018025
3,M6A,North York,"Lawrence Manor, Lawrence Heights",53.794164,-1.752006
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",44.427689,26.171308
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",44.427689,26.171308
6,M1B,Scarborough,"Malvern, Rouge",45.255274,-76.28976
7,M3B,North York,Don Mills,45.439301,28.021317
8,M4B,East York,"Parkview Hill, Woodbine Gardens",45.440569,28.019181
9,M5B,Downtown Toronto,"Garden District, Ryerson",45.44082,28.016137


# PART-3 Explore & Cluster Neighbourhoods

## PART-3.1 Explore Neighbourhoods in Boroughs ending with 'Toronto'

### Import necessary libraries

In [12]:
import re
import folium
import requests

### Apply *regex* function to extract Borough names ending with 'Toronto'

In [13]:

string = filter_df['Borough'].values
regex = r"\w*\sToronto"
borough = []
none_vals = []
for row in string:
    val = re.findall(regex, row)
    if val == []:
        none_vals.append(val)
    else:
        borough.append(val)

print(borough)

[['Downtown Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['East Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['West Toronto'], ['Downtown Toronto'], ['West Toronto'], ['East Toronto'], ['Downtown Toronto'], ['West Toronto'], ['East Toronto'], ['Downtown Toronto'], ['East Toronto'], ['Central Toronto'], ['Central Toronto'], ['Central Toronto'], ['Central Toronto'], ['West Toronto'], ['Central Toronto'], ['Central Toronto'], ['West Toronto'], ['Central Toronto'], ['Downtown Toronto'], ['West Toronto'], ['Central Toronto'], ['Downtown Toronto'], ['Central Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['Downtown Toronto'], ['East Toronto']]


In [14]:
chosen_boroughs = np.unique(np.array(borough))
print('Boroughs chosen for exploring venues:\n',chosen_boroughs)

Boroughs chosen for exploring venues:
 ['Central Toronto' 'Downtown Toronto' 'East Toronto' 'West Toronto']


### Subset Dataframe with chosen Boroughs for exploring

In [15]:
explore_df = filter_df.loc[filter_df['Borough'].isin(chosen_boroughs)].reset_index(drop=True)
explore_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",45.440588,28.018025
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",44.427689,26.171308
2,M5B,Downtown Toronto,"Garden District, Ryerson",45.44082,28.016137
3,M5C,Downtown Toronto,St. James Town,-6.173246,39.207869
4,M4E,East Toronto,The Beaches,43.64411,-79.588907
5,M5E,Downtown Toronto,Berczy Park,43.642106,-79.377445
6,M5G,Downtown Toronto,Central Bay Street,39.749568,-86.04008
7,M6G,Downtown Toronto,Christie,39.749568,-86.04008
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.649901,-79.383718
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.649901,-79.383718


In [45]:
print(f"Total unique Boroughs: {explore_df.Borough.nunique()}\nTotal unique Neighborhoods: {explore_df.Neighbourhood.nunique()}")

Total unique Boroughs: 4
Total unique Neighborhoods: 39


### Map chosen Boroughs and their Neighbourhoods in Toronto 

In [16]:
address = 'Toronto, Ontario, Canada'
geolocator = Nominatim(user_agent='Toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(f"Coordinates of Toronto are: {latitude}, {longitude}")

Coordinates of Toronto are: 43.6534817, -79.3839347


In [17]:
toronto_map = folium.Map(location= [latitude, longitude], zoom_start=10)

#add Neighbourhood markers on toronto_map
for lat, long, bor, hood in zip(explore_df['Latitude'], explore_df['Longitude'],explore_df['Borough'],explore_df['Neighbourhood']):
    label = folium.Popup('{}, {}'.format(bor, hood), parse_html=True)
    folium.CircleMarker([lat, long],
        radius=4,
        popup=label,
        color='cyan',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        parse_html=False).add_to(toronto_map)

toronto_map

## PART-3.2 Build & Analyze nearby venues in Neighbourhoods

### Build nearby venues dataframe

In [18]:
#Credentials
CLIENT_ID = 'CXC1D1CNWMCS54XHC3M0VLPRLBCPQQMID0OZC04Z0VYTMSAU' 
CLIENT_SECRET = 'OQRFM1BNLVMREJ3N3VJBAWGKU2ERVDEBC3Q1M2UXHBVNDBN3' 
VERSION = '20201201' 
LIMIT = 100

### Using defined-function from *Neighborhoods-New-York* Lab for exploring Neighborhoods in chosen Boroughs
#### * increased radius from 500 to 1000 of function getNearbyVenues, hence the number of Venues could be more

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, long in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            long, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            long, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [88]:
toronto_venues = getNearbyVenues(explore_df.Neighbourhood, explore_df.Latitude, explore_df.Longitude)
toronto_venues.head(10)

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
R

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",45.440588,28.018025,LIDL,45.439468,28.029159,Supermarket
1,"Regent Park, Harbourfront",45.440588,28.018025,Centru,45.442525,28.017562,Plaza
2,"Regent Park, Harbourfront",45.440588,28.018025,Restaurant HAZARD,45.440675,28.02874,Mediterranean Restaurant
3,"Regent Park, Harbourfront",45.440588,28.018025,Golden Chicken,45.433389,28.025602,Fast Food Restaurant
4,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,Mega Image,44.425392,26.165526,Supermarket
5,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,Parcul Alexandru Ioan Cuza (IOR),44.426685,26.160164,Park
6,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,World Class Romania,44.420916,26.177126,Gym / Fitness Center
7,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,American Ballroom,44.432719,26.179901,Modern European Restaurant
8,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,Restaurant GEDI Titan,44.421173,26.170131,Restaurant
9,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,Emag Showroom Titan,44.424371,26.16151,Electronics Store


In [89]:
print(f"Total Rows:{toronto_venues.shape[0]}, Total Columns:{toronto_venues.shape[1]}")
toronto_venues.head(10)

Total Rows:1972, Total Columns:7


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",45.440588,28.018025,LIDL,45.439468,28.029159,Supermarket
1,"Regent Park, Harbourfront",45.440588,28.018025,Centru,45.442525,28.017562,Plaza
2,"Regent Park, Harbourfront",45.440588,28.018025,Restaurant HAZARD,45.440675,28.02874,Mediterranean Restaurant
3,"Regent Park, Harbourfront",45.440588,28.018025,Golden Chicken,45.433389,28.025602,Fast Food Restaurant
4,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,Mega Image,44.425392,26.165526,Supermarket
5,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,Parcul Alexandru Ioan Cuza (IOR),44.426685,26.160164,Park
6,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,World Class Romania,44.420916,26.177126,Gym / Fitness Center
7,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,American Ballroom,44.432719,26.179901,Modern European Restaurant
8,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,Restaurant GEDI Titan,44.421173,26.170131,Restaurant
9,"Queen's Park, Ontario Provincial Government",44.427689,26.171308,Emag Showroom Titan,44.424371,26.16151,Electronics Store


In [154]:
print(f"Total unique Venue categories: {toronto_venues['Venue Category'].nunique()}")

Total unique Venue categories: 237


#### Important Assumption for Analysis:

* Based on .nunique method, there are 237 unique venue categories
* BUT, one of the Venue Category is _'Neighborhood'_. 
* THIS makes .groupby('Neighborhood') after onehot-encoding error-prone _(ValueError: Grouper for 'Neighborhood' not 1-dimensional)_
* Hence, excluding _'Neighborhood'_ from Venue categories
* Assumed Total unique Venue categories = 236

In [161]:
print("Venues with 'Neighborhood' Venue Category:\n")
toronto_venues.loc[toronto_venues['Venue Category'] == 'Neighborhood']


Venues with 'Neighborhood' Venue Category:



Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
46,"Garden District, Ryerson",45.44082,28.016137,Piața Energiei - General,45.440053,28.021103,Neighborhood
73,Berczy Park,43.642106,-79.377445,Harbourfront,43.639526,-79.380688,Neighborhood
205,"Richmond, Adelaide, King",43.649901,-79.383718,Downtown Toronto,43.653232,-79.385296,Neighborhood
305,"Dufferin, Dovercourt Village",43.649901,-79.383718,Downtown Toronto,43.653232,-79.385296,Neighborhood
394,"Harbourfront East, Union Station, Toronto Islands",43.639259,-79.38284,Harbourfront,43.639526,-79.380688,Neighborhood


### Analyze nearby venues dataframe

#### one-hot encode Venue Category for statistics by Neighbourhood

In [116]:
encode_df = pd.get_dummies(toronto_venues[['Venue Category']], prefix='', prefix_sep='')
encode_df.head()

Unnamed: 0,Airport,Airport Food Court,Airport Lounge,American Restaurant,Antique Shop,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,...,University,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Veterinarian,Video Store,Vietnamese Restaurant,Wine Shop,Wings Joint,Xinjiang Restaurant,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [167]:
encode_df.drop('Neighborhood', axis=1, inplace=True)

In [168]:
encode_df['Neighborhood'] = toronto_venues[['Neighborhood']]

#move Neighborhood column to the '0' position
fixed_columns = ['Neighborhood'] + list(encode_df.columns[encode_df.columns!='Neighborhood'])
encode_df = encode_df[fixed_columns]
encode_df.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,American Restaurant,Antique Shop,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,...,University,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Veterinarian,Video Store,Vietnamese Restaurant,Wine Shop,Wings Joint,Xinjiang Restaurant,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Queen's Park, Ontario Provincial Government",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [169]:
encode_df.shape

(1972, 237)

In [171]:
toronto_grouped = encode_df.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,American Restaurant,Antique Shop,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,...,University,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Veterinarian,Video Store,Vietnamese Restaurant,Wine Shop,Wings Joint,Xinjiang Restaurant,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.01,0.0,0.02,0.0,0.02,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.017241,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Extract Top 10 venues from each Neighborhood