# COURSERA CAPSTONE

In [27]:
""" This notebook will be used for the coursera capstone project. """
import numpy as np
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
#!pip install geocoder
import geocoder
#!pip install geopy
import geopy
import folium

In [28]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


## 1. Fetch the data from Wikipedia ##

In [29]:
html = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(html, 'lxml')

# Find the only table of that URL:
table = soup.find('table', {'class': 'wikitable sortable'})

In [30]:
# Inspect the table
#table # Output is too long to display

## 2. Clean the data

In [31]:
# Extract all <td> tags.
#table.findAll('td')  # Output is too long to display

In [32]:
# Put each column into a dataframe
t = pd.DataFrame()

i=0
target_column = {0: 'PostalCode', 1: 'Borough', 2: 'Neighborhood'}
for td in list(table.findAll('td')):
    col = target_column[i%3]
    value = str(td).replace('<td>', "").replace('</td>','').replace('\n', "")
    try:
        value = re.findall( r">(.*)</a>",value)[0]
    except:
       pass 
    t.loc[i//3, col] = value
    i+=1
t.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [33]:
### Clean the data to remove 'Not assigned' in Borough ###
df = t
df = df[~df['Borough'].str.contains('Not assigned')]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [34]:
### Combine Neighboroods within the same postal code ###
list_of_multiple_neighborhoods = df['PostalCode'].value_counts().index[df['PostalCode'].value_counts()>1]
dfs = df.copy()

def combine_neighborhood():
    global dfs
    i=0
    while True:
        #print(i)
        dfs.reset_index(inplace=True, drop=True)
        maxrow = dfs.shape[0]
        if i == maxrow-1:
            break
        else:
            #print(i,dfs.shape[0])
            if dfs.iloc[i,0] == dfs.iloc[i+1,0]:
                dfs.iloc[i,2] = dfs.iloc[i,2] + ', ' + dfs.iloc[i+1,2]
                dfs.drop(i+1, axis=0, inplace=True)
            else:
                i+=1

combine_neighborhood()

   

In [35]:
### Clean the data to replace 'Not assigned' in Neighborhood to the Borough ###
filter = dfs['Neighborhood'].str.contains('Not assigned')
#pd.set_option('mode.chained_assignment', None) # Turn off a warning that is not relevant.
dfs.loc[filter,'Neighborhood'] = dfs.loc[filter, 'Borough']
dfs.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North


## 3. Summary after cleaning

In [36]:
print('>>> The shape of the dataframe is {}. <<<'.format(dfs.shape))

dfs.head(5)

>>> The shape of the dataframe is (103, 3). <<<


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


## 4. Obtain geolocalisaiton data

### Try using geocoder.google [does not work]

In [37]:
### Use coursera link

coords = pd.read_csv('https://cocl.us/Geospatial_data')
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [38]:
# Fill the dataframe witht he proper coordinates.
dfs['Latitude']=0.
dfs['Longitude']=0.

def latlong(zipcode):
    pos = coords.loc[coords['Postal Code'] == zipcode, ['Latitude', 'Longitude']]
    return pos.iloc[0,0], pos.iloc[0,1] 

for i in range(len(dfs)):
    dfs.iloc[i,3], dfs.iloc[i,4] = latlong(dfs.iloc[i,0])
    
dfs.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


## 5. Visualize the data

In [170]:
m = folium.Map(location=(43.7,-79.4) , zoom_start = 10)

for i in dfs.index:
    folium.Marker((dfs.loc[i,'Latitude'], dfs.loc[i,'Longitude']), popup=dfs.loc[i,'PostalCode']).add_to(m)
    
m



## 6. Prepare the Foursquare Credentials


In [40]:
# Client ID and password are read form local text files for convenience and privacy.

CLIENT_ID = open('client.txt', 'r').readlines()[0].strip() # your Foursquare ID
CLIENT_SECRET = open('pass.txt', 'r').readlines()[0].strip() # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

In [41]:
# Test with one postal code
i = 0
zipcode = dfs.iloc[i,0]
print(zipcode)

lat = dfs.iloc[i,3]
long = dfs.iloc[i,4]

radius_in_meters = 1500
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&LIMIT={}'.format( \
                    CLIENT_ID, CLIENT_SECRET, VERSION, 
                    lat, long, radius_in_meters, LIMIT)
d = requests.get(url).json()

M3A


In [171]:
venues= d['response']['groups'][0]['items']

In [44]:
from pandas.io.json import json_normalize
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

nearby_venues


Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Allwyn's Bakery,"[{'id': '4bf58dd8d48988d144941735', 'name': 'C...",43.75984,-79.324719
1,Brookbanks Park,"[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",43.751976,-79.33214
2,Donalda Golf & Country Club,"[{'id': '4bf58dd8d48988d1e6941735', 'name': 'G...",43.752816,-79.342741
3,Tim Hortons,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",43.760668,-79.326368
4,LCBO,"[{'id': '4bf58dd8d48988d186941735', 'name': 'L...",43.757774,-79.314257
5,A&W Canada,"[{'id': '4bf58dd8d48988d16e941735', 'name': 'F...",43.760643,-79.326865
6,Tim Hortons,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",43.752814,-79.314067
7,Dollarama,"[{'id': '52dea92d3cf9994f4e043dbb', 'name': 'D...",43.757317,-79.312578
8,Food Basics,"[{'id': '52f2ab2ebcbc57f1066b8b46', 'name': 'S...",43.760865,-79.326015
9,Bruno's valu-mart,"[{'id': '4bf58dd8d48988d118951735', 'name': 'G...",43.746086,-79.324978


In [45]:
nearby_venues['venue.categories'] = nearby_venues['venue.categories'].apply(lambda x: x[0]['name'])

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Allwyn's Bakery,Caribbean Restaurant,43.75984,-79.324719
1,Brookbanks Park,Park,43.751976,-79.33214
2,Donalda Golf & Country Club,Golf Course,43.752816,-79.342741
3,Tim Hortons,Café,43.760668,-79.326368
4,LCBO,Liquor Store,43.757774,-79.314257


#### Let's create a function to repeat the same process to all the neighborhoods in Greater Toronto

In [115]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, max_iter=99999999):
    
    venues_list=[]
    i=-1
    for name, lat, lng in zip(names, latitudes, longitudes):
        i+=1
        if i > max_iter:
                    break
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [116]:
LIMIT = 100
max_iter = 99999 # limit the number of results at first
radius=750

toronto_venues = getNearbyVenues(names=dfs['Neighborhood'], latitudes=dfs['Latitude'], longitudes=dfs['Longitude'], radius=radius, max_iter=max_iter) 



Parkwoods
Victoria Village
Harbourfront, Regent Park
Lawrence Heights, Lawrence Manor
Queen's Park
Islington Avenue
Rouge, Malvern
Don Mills North
Woodbine Gardens, Parkview Hill
Ryerson, Garden District
Glencairn
Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
Highland Creek, Rouge Hill, Port Union
Flemingdon Park, Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights
Thorncliffe Park
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
East Birchmount Park, Ionview, Kennedy Park
Bayview Village
CFB Toronto, Downsview East
The D

In [117]:
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,DVP at York Mills,43.758899,-79.334099,Road
3,Parkwoods,43.753259,-79.329656,TTC Stop #09083,43.759655,-79.332223,Bus Stop
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
5,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
6,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
7,Victoria Village,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.313620,Intersection
8,Victoria Village,43.725882,-79.315572,Pizza Nova,43.725824,-79.312860,Pizza Place
9,Victoria Village,43.725882,-79.315572,Extreme Fun Indoor Playground,43.720914,-79.312516,Playground


In [118]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,15,15,15,15,15,15
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",19,19,19,19,19,19
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",13,13,13,13,13,13
"Alderwood, Long Branch",14,14,14,14,14,14
"Bathurst Manor, Downsview North, Wilson Heights",21,21,21,21,21,21
Bayview Village,10,10,10,10,10,10
"Bedford Park, Lawrence Manor East",36,36,36,36,36,36
Berczy Park,100,100,100,100,100,100
"Birch Cliff, Cliffside West",8,8,8,8,8,8


In [119]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 325 uniques categories.


In [120]:
# Onehot enoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [121]:
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [122]:
# Group rows of same neighborhoor
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.010000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.010000,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
1,Agincourt,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.00,0.047619,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
6,Bayview Village,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
7,"Bedford Park, Lawrence Manor East",0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.00,0.027778,0.000000,0.000000,0.00,0.000000,0.027778,0.000000
8,Berczy Park,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.010000,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
9,"Birch Cliff, Cliffside West",0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000


In [163]:
# Display the first n venues per neighborhood
n = 3

for i in toronto_grouped.index:
    print('~~~~~~{}~~~~~~'.format(toronto_grouped.iloc[i,0]))
    temp = toronto_grouped.iloc[i,:].T.drop('Neighborhood', axis=0)
    temp.sort_values(axis=0, ascending=False, inplace=True)
    temp = temp.iloc[0:n]
    print(temp)
    
    print('\n')

~~~~~~Adelaide, King, Richmond~~~~~~
Coffee Shop            0.07
Café                   0.06
American Restaurant    0.04
Name: 0, dtype: object


~~~~~~Agincourt~~~~~~
Lounge              0.0666667
Shopping Mall       0.0666667
Sushi Restaurant    0.0666667
Name: 1, dtype: object


~~~~~~Agincourt North, L'Amoreaux East, Milliken, Steeles East~~~~~~
Pizza Place             0.105263
Fast Food Restaurant    0.105263
Pharmacy                0.105263
Name: 2, dtype: object


~~~~~~Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown~~~~~~
Grocery Store      0.230769
Pizza Place        0.230769
Hardware Store    0.0769231
Name: 3, dtype: object


~~~~~~Alderwood, Long Branch~~~~~~
Pizza Place     0.142857
Pharmacy       0.0714286
Park           0.0714286
Name: 4, dtype: object


~~~~~~Bathurst Manor, Downsview North, Wilson Heights~~~~~~
Pizza Place         0.0952381
Coffee Shop         0.0952381
Community Center     0.047619
Name: 5,

Name: 86, dtype: object


~~~~~~The Beaches~~~~~~
Pub            0.0833333
Coffee Shop       0.0625
Bar               0.0625
Name: 87, dtype: object


~~~~~~The Beaches West, India Bazaar~~~~~~
Indian Restaurant    0.0980392
Sandwich Place       0.0588235
Park                 0.0392157
Name: 88, dtype: object


~~~~~~The Danforth West, Riverdale~~~~~~
Greek Restaurant    0.13
Coffee Shop         0.07
Pub                 0.05
Name: 89, dtype: object


~~~~~~The Junction North, Runnymede~~~~~~
Brewery               0.15
Athletics & Sports     0.1
Beer Store            0.05
Name: 90, dtype: object


~~~~~~The Kingsway, Montgomery Road, Old Mill North~~~~~~
Park                0.0740741
Breakfast Spot      0.0740741
Sushi Restaurant    0.0740741
Name: 91, dtype: object


~~~~~~Thorncliffe Park~~~~~~
Indian Restaurant     0.115385
Afghan Restaurant    0.0769231
Coffee Shop          0.0769231
Name: 92, dtype: object


~~~~~~Victoria Village~~~~~~
Intersection    0.111111
Pizza Place     0.11

In [172]:
# Function to return the top 3 venues:

# Display the first n venues per neighborhood
n = 3

def return_top3(i):
    temp = toronto_grouped.iloc[i,:].T.drop('Neighborhood', axis=0)
    temp.sort_values(axis=0, ascending=False, inplace=True)
    temp = temp.iloc[0:n]
    return list(temp.index)

## 8. Cluster neighborhoods

In [165]:
# Doing a KMeans clustering with K = 7

from sklearn.cluster import KMeans
n = 10
clf = KMeans(n_clusters = n)

dataset = toronto_grouped.iloc[:,1:]
clf.fit(dataset)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [174]:
# Display the clusters on a map. For each cluster, the label is the neighborhood(s) and the top 3 business types.

m = folium.Map(location=(43.7,-79.4) , zoom_start = 11)

for i in toronto_grouped.index:
    label = str(toronto_grouped.iloc[i,0]) 
    label2=  ' // ' +  ', '.join(return_top3(i)) 
    lat = dfs[dfs['Neighborhood'] == label].iloc[0,3]
    lng = dfs[dfs['Neighborhood'] == label].iloc[0,4]
    label +=label2
    folium.CircleMarker(
    [lat, lng],
    radius=5,
    popup=label,
    color=[
    'red',
    'blue',
    'gray',
    'darkred',
    'lightred',
    'orange',
    'beige',
    'green',
    'darkgreen',
    'lightgreen',
    'darkblue',
    'lightblue',
    'purple',
    'darkpurple',
    'pink',
    'cadetblue',
    'lightgray',
    'black'][clf.labels_[i]],
    fill=True,
    fill_color='white',
    fill_opacity=0.7,
    parse_html=False).add_to(m)  
        
m

## 9. Obervations

Most of the downtown toronto zip codes hare similar features. Away from the center, the clusters become more diverse.