<h1>Question 1</h1>
<h5>I have matched the necessary data with the following Regex: (M\d\w)(.+)\((.*)\)</h5>
<h5>It returns 3 groups: the postal code (a letter 'M' followed by a digit and then a letter), any set of words before an open parentheses and the last group is the info inside the parentheses (the neighborhood itself)</h5>

In [1]:
# Importing the required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from geopy.geocoders import Nominatim
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# regex = '(M\d\w)(.+)\((.*)\)'

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
columns = ['PostalCode', 'Borough', 'Neighborhood']

toronto_df= pd.DataFrame(columns=columns)
toronto_wiki = requests.get(url, 'html5lib').text

soup = BeautifulSoup(toronto_wiki)

tables = soup.find_all("tbody")

for row in tables[0].find_all("tr"):
    cols = row.find_all("td")
    for col in cols:
        info = re.search(r'(M\d\w)(.+)\((.*)\)', col.text)
        if info:
            postalcode = info.groups()[0]
            borough = info.groups()[1]
            neighborhood = info.groups()[2]
            toronto_df = toronto_df.append({"PostalCode":postalcode, "Borough":borough, "Neighborhood":neighborhood}, ignore_index=True)
    
# Replacing the '/' with commas
toronto_df["Neighborhood"] = toronto_df["Neighborhood"].str.replace("/", ",")

# Checking if any Borough does not have an assigned neighborhood
# print(toronto_df[toronto_df["Neighborhood"]=='Not assigned'])
toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [2]:
toronto_df.shape

(103, 3)

<h1>Question 2</h1>
<p>The resultant dataframe from question merged with the geolocation from the csv file</p>

In [3]:
toronto_geodata = pd.read_csv('https://cocl.us/Geospatial_data')

toronto_df = toronto_df.merge(toronto_geodata, left_on='PostalCode', right_on='Postal Code')

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,M7A,43.662301,-79.389494


<h1>Question 3</h1>
<p>Clustering the data with the following assumptions:</p>
<li>All the boroughs must contain the string 'Toronto' as suggested</li>
<li>The neighborhoods will be grouped in clusters by the different types of venues in the Outdoors & Recreation category</li>

In [4]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

torontoMap = folium.Map(location=[latitude, longitude], zoom_start=10)


# Using only the Boroughs that contains the word Toronto!
filtered_df = toronto_df[toronto_df["Borough"].str.contains('Toronto')]

# Outdoors & Recreation category ID from Foursquare
outdoor_category = '4d4b7105d754a06377d81259'

In [5]:
CLIENT_ID = 'CRN2QP54XJ4SSKPST0LYZTLSISLNWRJVAMKSBNP5ULMO5Q0C' # your Foursquare ID
CLIENT_SECRET = 'FGZOZCDZ5LUQVECCPKW3BI2RSWRTOAWRWQW0IVFH2ZZT4T1D' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value


<h3>This function was modified to explore venues in a certain category, specified as a parameter</h3>

In [6]:
def getNearbyCategoryVenues(names, latitudes, longitudes, category, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?categoryId={}&intent=browse&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            category,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
    
        print(url)
            
        # make the GET request
        results = requests.get(url).json()["response"]['venues']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            v['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [7]:
toronto_venues = getNearbyCategoryVenues(filtered_df.Neighborhood, filtered_df.Latitude, filtered_df.Longitude, outdoor_category)

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Regent Park , Harbourfront
https://api.foursquare.com/v2/venues/search?categoryId=4d4b7105d754a06377d81259&intent=browse&client_id=CRN2QP54XJ4SSKPST0LYZTLSISLNWRJVAMKSBNP5ULMO5Q0C&client_secret=FGZOZCDZ5LUQVECCPKW3BI2RSWRTOAWRWQW0IVFH2ZZT4T1D&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100
Garden District, Ryerson
https://api.foursquare.com/v2/venues/search?categoryId=4d4b7105d754a06377d81259&intent=browse&client_id=CRN2QP54XJ4SSKPST0LYZTLSISLNWRJVAMKSBNP5ULMO5Q0C&client_secret=FGZOZCDZ5LUQVECCPKW3BI2RSWRTOAWRWQW0IVFH2ZZT4T1D&v=20180605&ll=43.6571618,-79.37893709999999&radius=500&limit=100
St. James Town
https://api.foursquare.com/v2/venues/search?categoryId=4d4b7105d754a06377d81259&intent=browse&client_id=CRN2QP54XJ4SSKPST0LYZTLSISLNWRJVAMKSBNP5ULMO5Q0C&client_secret=FGZOZCDZ5LUQVECCPKW3BI2RSWRTOAWRWQW0IVFH2ZZT4T1D&v=20180605&ll=43.6514939,-79.3754179&radius=500&limit=100
The Beaches
https://api.foursquare.com/v2/venues/search?categoryId=4d4b7105d754a06377d81259&intent=brows

Unnamed: 0,Yoga Studio,Apres Ski Bar,Art Gallery,Athletics & Sports,Baseball Field,Basketball Court,Basketball Stadium,Bathing Area,Beach,Bike Trail,...,Summer Camp,Surf Spot,Swim School,Tennis Court,Track,Trail,Tree,Vineyard,Weight Loss Center,Well
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [9]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Gym,Other Great Outdoors,Gym / Fitness Center,Harbor / Marina,Athletics & Sports,Dog Run,Park,Pool,Beach,Sculpture Garden
1,"Brockton , Parkdale Village , Exhibition Place",Gym / Fitness Center,Gym,Park,Yoga Studio,Other Great Outdoors,Playground,Field,Scenic Lookout,Plaza,Stadium
2,"CN Tower , King and Spadina , Railway Lands , ...",Harbor / Marina,Tree,Sculpture Garden,Pool,Forest,Cycle Studio,Distribution Center,Dive Spot,Dog Run,Farm
3,Central Bay Street,Gym / Fitness Center,Gym,Garden,Pool,Gym Pool,Park,Martial Arts School,Field,Other Great Outdoors,Yoga Studio
4,Christie,Gym / Fitness Center,Yoga Studio,Park,Athletics & Sports,Baseball Field,Sculpture Garden,Flower Shop,Fountain,Dive Spot,Dog Run


In [10]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Apres Ski Bar,Art Gallery,Athletics & Sports,Baseball Field,Basketball Court,Basketball Stadium,Bathing Area,Beach,...,Summer Camp,Surf Spot,Swim School,Tennis Court,Track,Trail,Tree,Vineyard,Weight Loss Center,Well
0,Berczy Park,0.0,0.0,0.0,0.06,0.0,0.02,0.02,0.0,0.04,...,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0
1,"Brockton , Parkdale Village , Exhibition Place",0.08,0.0,0.0,0.04,0.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower , King and Spadina , Railway Lands , ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0
3,Central Bay Street,0.04,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Christie,0.142857,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.06,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Commerce Court , Victoria Hotel",0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02
7,Davisville,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.04,0.0
9,"Dufferin , Dovercourt Village",0.0,0.0,0.0,0.064516,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 2, 1, 1, 1, 1, 3], dtype=int32)

In [12]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = filtered_df 

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",M5A,43.65426,-79.360636,1,Park,Gym,Gym / Fitness Center,Sculpture Garden,Other Great Outdoors,Dog Run,Yoga Studio,Pool,General Travel,Distribution Center
9,M5B,Downtown Toronto,"Garden District, Ryerson",M5B,43.657162,-79.378937,1,Gym,Plaza,Athletics & Sports,Gym / Fitness Center,Scenic Lookout,Park,Roof Deck,Other Great Outdoors,Yoga Studio,Basketball Court
15,M5C,Downtown Toronto,St. James Town,M5C,43.651494,-79.375418,1,Gym,Gym / Fitness Center,Other Great Outdoors,Park,Athletics & Sports,Scenic Lookout,Plaza,Recreation Center,Garden,Outdoors & Recreation
19,M4E,East Toronto,The Beaches,M4E,43.676357,-79.293031,3,Playground,Park,Trail,Dog Run,Bridge,Other Great Outdoors,Well,Forest,Dive Spot,Farm
20,M5E,Downtown Toronto,Berczy Park,M5E,43.644771,-79.373306,1,Gym,Other Great Outdoors,Gym / Fitness Center,Harbor / Marina,Athletics & Sports,Dog Run,Park,Pool,Beach,Sculpture Garden


In [13]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters