# Applied Data Science Capstone

This notebook will be use for the Capstone Project of the IBM Applied Data Science Specialization.

Author: RUBEN ADAD

### Import necessary Libraries

In [185]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors
import io

## Part 1

### Scrape the Wikipedia page

In [186]:
website_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

### Convert to BeautifulSoup object

In [187]:
soup = BeautifulSoup(website_url,'lxml')
#print(soup.prettify())

### Extract HTML table

In [188]:
postal_table = soup.find('table',{'class':'wikitable sortable'})
#postal_table

### Extract headings

In [189]:
ths = postal_table.find_all('th')
headings = [th.text.strip() for th in ths]
        
print(headings)
postal_df = pd.DataFrame(columns=headings)

['Postcode', 'Borough', 'Neighbourhood']


### Extract table content

In [190]:
tds = postal_table.find_all('td')
values = [td.text.strip() for td in tds]
        
print(values)
print(type(values))
print(len(values))

['M1A', 'Not assigned', 'Not assigned', 'M2A', 'Not assigned', 'Not assigned', 'M3A', 'North York', 'Parkwoods', 'M4A', 'North York', 'Victoria Village', 'M5A', 'Downtown Toronto', 'Harbourfront', 'M5A', 'Downtown Toronto', 'Regent Park', 'M6A', 'North York', 'Lawrence Heights', 'M6A', 'North York', 'Lawrence Manor', 'M7A', "Queen's Park", 'Not assigned', 'M8A', 'Not assigned', 'Not assigned', 'M9A', 'Etobicoke', 'Islington Avenue', 'M1B', 'Scarborough', 'Rouge', 'M1B', 'Scarborough', 'Malvern', 'M2B', 'Not assigned', 'Not assigned', 'M3B', 'North York', 'Don Mills North', 'M4B', 'East York', 'Woodbine Gardens', 'M4B', 'East York', 'Parkview Hill', 'M5B', 'Downtown Toronto', 'Ryerson', 'M5B', 'Downtown Toronto', 'Garden District', 'M6B', 'North York', 'Glencairn', 'M7B', 'Not assigned', 'Not assigned', 'M8B', 'Not assigned', 'Not assigned', 'M9B', 'Etobicoke', 'Cloverdale', 'M9B', 'Etobicoke', 'Islington', 'M9B', 'Etobicoke', 'Martin Grove', 'M9B', 'Etobicoke', 'Princess Gardens', 'M9B

### Convert to Dataframe

During conversion when Neighbourhood is "not assigned" it will be assigned with Borough name.

Resulting dataframe is sorted by Postcode column and has 288 rows and 3 columns.

In [191]:
j = 0
for i in range(0,len(values),3):
    Postcode = values[i]
    Borough = values[i+1]
    Neighbourhood = values[i+2]

    if Neighbourhood == "Not assigned":
        Neighbourhood = Borough


    postal_df.loc[j] = [Postcode,Borough,Neighbourhood]
    j += 1
    
postal_df.sort_values(by=['Postcode'], inplace=True) 
postal_df.reset_index(inplace=True) 
postal_df.drop('index', axis=1, inplace=True)
print(postal_df.shape)
postal_df.head(25)

(288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,Malvern
2,M1B,Scarborough,Rouge
3,M1C,Scarborough,Highland Creek
4,M1C,Scarborough,Rouge Hill
5,M1C,Scarborough,Port Union
6,M1E,Scarborough,West Hill
7,M1E,Scarborough,Morningside
8,M1E,Scarborough,Guildwood
9,M1G,Scarborough,Woburn


### Merge Neighboorhoods for duplicated Postal Codes

In [192]:
# df.set_value(i,'ifor',ifor_val)
last_Postcode = ""
last_Neighbourhood = ""
Neighbourhood = ""

for idx, row in postal_df.iterrows():
    
    if last_Postcode != row['Postcode']:
        j = idx
        Neighbourhood = ""
        #print(last_Postcode, row['Postcode'], j)
    else:
        Neighbourhood = postal_df.at[j, 'Neighbourhood'] + ", " + row['Neighbourhood'] 
        postal_df.at[j, 'Neighbourhood'] = Neighbourhood
        postal_df.drop([idx], inplace=True)
        #print(last_Postcode, row['Postcode'], j, index)

    last_Postcode = row['Postcode']
    last_Neighbourhood = row['Neighbourhood']
    


postal_df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Malvern, Rouge"
3,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
6,M1E,Scarborough,"West Hill, Morningside, Guildwood"
9,M1G,Scarborough,Woburn
10,M1H,Scarborough,Cedarbrae
11,M1J,Scarborough,Scarborough Village
12,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
15,M1L,Scarborough,"Oakridge, Golden Mile, Clairlea"
18,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"


#### Dataframe after merging duplicated Postal Codes has 180 rows.

In [193]:
postal_df.shape

(180, 3)

## Part 2 Geocoding

### Read csv file that has the geographical coordinates of each postal code

In [194]:
url = 'http://cocl.us/Geospatial_data'

urlData = requests.get(url).content
codes_df = pd.read_csv(io.StringIO(urlData.decode('utf-8')))

codes_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Join geographical coordinates DF with Postal Codes DF

I am using INNER join to avoid rows without geographical coordinates.

Resulting dataframe has 103 rows.

In [195]:
latlon_df = postal_df.set_index('Postcode').join(codes_df.set_index('Postal Code'), how="inner")
latlon_df.reset_index(inplace=True) 
latlon_df.rename(columns={'index': 'Postcode'}, inplace=True)
print(latlon_df.shape)
latlon_df.head()

(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Visualize Postal Codes in a map

In [196]:
import folium
toronto_map = folium.Map(location=[43.653908, -79.384293], zoom_start=10) # generate map centred around Toronto


# add popular spots to the map as blue circle markers
for lat, lng, label in zip(latlon_df['Latitude'], latlon_df['Longitude'], latlon_df['Postcode']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        color='blue',
        fill_color='blue',
        fill_opacity=0.6
        ).add_to(toronto_map)

# display map
toronto_map

## Part 3 Clustering

Import needed libraries:

In [197]:
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

Provide Foursqaure API credentials:

In [198]:
CLIENT_ID = 'LZ0PKCF0CXYYTZFP3U4VY0RG4MBURBC0WHZ5TTWHECUMIEPC' # your Foursquare ID
CLIENT_SECRET = 'MWUOBK3YN4Y2LPQSDFG4SMWMGWUKO2J3FZ452OUVECNJDSOU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: LZ0PKCF0CXYYTZFP3U4VY0RG4MBURBC0WHZ5TTWHECUMIEPC
CLIENT_SECRET:MWUOBK3YN4Y2LPQSDFG4SMWMGWUKO2J3FZ452OUVECNJDSOU


Get Toronto latitude and longitud:

In [199]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(location, latitude, longitude)

Toronto, Ontario, M6K 1X9, Canada 43.653963 -79.387207


Function that extracts the category of the venue:

In [200]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Function to get information for all the neighborhoods in Toronto:

In [201]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Execute the getNerbyVenues function:

In [203]:
radius = 500
LIMIT = 100

toronto_venues = getNearbyVenues(names=latlon_df['Postcode'],
                                   latitudes=latlon_df['Latitude'],
                                   longitudes=latlon_df['Longitude']
                                  )

M1B
M1C
M1E
M1G
M1H
M1J
M1K
M1L
M1M
M1N
M1P
M1R
M1S
M1T
M1V
M1W
M1X
M2H
M2J
M2K
M2L
M2M
M2N
M2P
M2R
M3A
M3B
M3C
M3H
M3J
M3K
M3L
M3M
M3N
M4A
M4B
M4C
M4E
M4G
M4H
M4J
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5M
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6A
M6B
M6C
M6E
M6G
M6H
M6J
M6K
M6L
M6M
M6N
M6P
M6R
M6S
M7A
M7R
M7Y
M8V
M8W
M8X
M8Y
M8Z
M9A
M9B
M9C
M9L
M9M
M9N
M9P
M9R
M9V
M9W


Print dataframe shape and first rows:

In [204]:
print(toronto_venues.shape)
toronto_venues.head()

(2238, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1B,43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1E,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,M1E,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


Keep only neighborhoods with more than 10 venues (new dataframe has 1988 rows compared with 2238 in the original dataframe):

In [205]:
toronto_venues10 = toronto_venues.groupby('Neighborhood').filter(lambda x : len(x)>10)
toronto_venues10.shape

(1988, 7)

Print the number of venues for each neighborhood:

In [206]:
toronto_venues10.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1W,15,15,15,15,15,15
M2J,62,62,62,62,62,62
M2N,35,35,35,35,35,35
M3C,22,22,22,22,22,22
M3H,17,17,17,17,17,17
M4B,13,13,13,13,13,13
M4G,32,32,32,32,32,32
M4H,16,16,16,16,16,16
M4K,44,44,44,44,44,44
M4L,19,19,19,19,19,19


In [207]:
print('There are {} uniques categories.'.format(len(toronto_venues10['Venue Category'].unique())))

There are 254 uniques categories.


Dummy coding of "venue category" attribute. We get 1 column for each of the 254 categories. We add the "neighborhood" column and move it as the first column of the dataframe:

In [208]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues10[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Nborhood'] = toronto_venues10['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Nborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
71,M1W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72,M1W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73,M1W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74,M1W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75,M1W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group the dataframe by the "Nborhood" column using the mean of each "venue category":

In [209]:
toronto_grouped = toronto_onehot.groupby('Nborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Nborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M2J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.016129,0.0,0.0,0.0,0.0,0.016129,0.016129,0.0
2,M2N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0
3,M3C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M3H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0


Function to return the most frequent categories en each row of the dataframe:

In [210]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Build a new dataframe with the top 10 categories for each neighborhood:

In [211]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Nborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1W,Fast Food Restaurant,Chinese Restaurant,Pizza Place,American Restaurant,Breakfast Spot,Sandwich Place,Electronics Store,Nail Salon,Camera Store,Gym
1,M2J,Clothing Store,Fast Food Restaurant,Coffee Shop,Shoe Store,Food Court,Toy / Game Store,Bakery,Restaurant,Japanese Restaurant,Burrito Place
2,M2N,Coffee Shop,Restaurant,Ramen Restaurant,Sandwich Place,Shopping Mall,Café,Sushi Restaurant,Indonesian Restaurant,Lounge,Ice Cream Shop
3,M3C,Gym,Asian Restaurant,Coffee Shop,Beer Store,Clothing Store,Chinese Restaurant,Dim Sum Restaurant,Restaurant,Bus Line,Discount Store
4,M3H,Coffee Shop,Pharmacy,Frozen Yogurt Shop,Fast Food Restaurant,Sandwich Place,Diner,Bridal Shop,Bank,Restaurant,Supermarket


Apply KMEANS to cluster the toronto_grouped dataframe. We have to delete the "Nborhood" column because KMEANS only accepts numerical data. 

We tried with several number of clusters and finally decided to use 6 clusters.

In [212]:
# set number of clusters
kclusters = 6

toronto_grouped_clustering = toronto_grouped.drop('Nborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 3, 3, 1, 1, 1, 3, 1, 3, 1], dtype=int32)

Add cluster number to neighborhoods_venues_sorted dataframe and merge it with the latlon_df dataframe which was created in "part 2" of this notebook.

We use "inner" join since we filtered the neighborhoods to keep only those with more than 10 venues.

In [213]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = latlon_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.set_index('Postcode').join(neighborhoods_venues_sorted.set_index('Neighborhood'), how="inner")

toronto_merged.reset_index(inplace=True)
toronto_merged.head() 

Unnamed: 0,index,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1W,Scarborough,L'Amoreaux West,43.799525,-79.318389,1,Fast Food Restaurant,Chinese Restaurant,Pizza Place,American Restaurant,Breakfast Spot,Sandwich Place,Electronics Store,Nail Salon,Camera Store,Gym
1,M2J,North York,"Oriole, Henry Farm, Fairview",43.778517,-79.346556,3,Clothing Store,Fast Food Restaurant,Coffee Shop,Shoe Store,Food Court,Toy / Game Store,Bakery,Restaurant,Japanese Restaurant,Burrito Place
2,M2N,North York,Willowdale South,43.77012,-79.408493,3,Coffee Shop,Restaurant,Ramen Restaurant,Sandwich Place,Shopping Mall,Café,Sushi Restaurant,Indonesian Restaurant,Lounge,Ice Cream Shop
3,M3C,North York,"Don Mills South, Flemingdon Park",43.7259,-79.340923,1,Gym,Asian Restaurant,Coffee Shop,Beer Store,Clothing Store,Chinese Restaurant,Dim Sum Restaurant,Restaurant,Bus Line,Discount Store
4,M3H,North York,"Downsview North, Bathurst Manor, Wilson Heights",43.754328,-79.442259,1,Coffee Shop,Pharmacy,Frozen Yogurt Shop,Fast Food Restaurant,Sandwich Place,Diner,Bridal Shop,Bank,Restaurant,Supermarket


Finally, we display a map with the neighborhoods colored by the cluster to which they belong.

In [214]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['index'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters