# Web Scraping

---

### Import necessary libraries

In [1]:
import json
import os
from urllib.request import urlopen

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

### Fetch the HTML page from the URL

In [2]:
#url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641"
#url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050." 
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

html = urlopen(url)
soup = BeautifulSoup(html,'lxml')

### Extract the table data from html and create a dataframe

In [3]:
rows = soup.find_all('tr')
headers = rows[0].find_all('th')


list_rows = []
for row in rows:
    if len(row.find_all('td')) == 3:
        row_td = row.find_all('td')
        headers = rows[0].find_all('th')
    
        for i,v in enumerate(headers):
            cleantext = BeautifulSoup(str(v),"lxml").get_text()
            headers[i] = cleantext.rstrip('\n')
        
        for i,v in enumerate(row_td):
            cleantext = BeautifulSoup(str(v),"lxml").get_text()
            row_td[i] = cleantext.rstrip('\n')
        
        list_rows.append(row_td)
        
df = pd.DataFrame(list_rows,columns=headers)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Correct rows where [Borough] or [Neighborhood] are empty

In [4]:
# Remove rows where Borough is unassigned
df['Borough'].replace('Not assigned',float("NaN"),inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [5]:
# Check to see if any Neighborhood cells are empty and correct them with borough name
for i,n in enumerate(df['Neighborhood'] == ""):
    if n is True:
        print(str(i) + ": " + df['Neighbourhood'].iloc[i] + " , " + df['Borough'].iloc[i])

### Construct a new DataFrame 

The goal here is to have exactly one row per unique postal code, with all of its neighborhoods joined into a single comma-separated string.

In [6]:
boroughs = []
neighborhoods = []
codes = df['Postal Code'].unique()

for i,code in enumerate(codes):
    #print(code)
    #print(df['Borough'].iloc[i])
    boroughs.append(df['Borough'].iloc[i])
    post_group = df.groupby(['Postal Code']).get_group(code)
    hoods = post_group[['Neighborhood']].values

    hood_list = ""
    for n in hoods:
        hood_list += n + ", "
    hood_list = hood_list[0].rstrip(', ')
    #print(hood_list)
    #print()
    boroughs[0].rstrip(", ")
    
    neighborhoods.append(hood_list)
    

new_dic = {"PostalCode" : codes,"Borough" : boroughs, "Neighborhood" : neighborhoods}

df_final = pd.DataFrame(new_dic)
df_final

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
num_data = np.shape(df_final)[0]
print("%d rows of data" % num_data)

103 rows of data


### Add location data

In [8]:
# Read csv data into a dataframe
geo = pd.read_csv('Geospatial_coordinates.csv')
geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [9]:
# Sort neighborhood information by PostalCode before merging with coordinates
df_final.sort_values('PostalCode',inplace=True)
df_final.reset_index(drop=True,inplace=True)
df_final

df_final[['Latitude','Longitude']] = geo[['Latitude','Longitude']]
df_final

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


# Processing and Analysis

---

## Step 1: Retrieve venue data

In [10]:
# Count how many rows of df_final belong to each borough, most frequent first.
df_final['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
East Toronto         5
York                 5
Mississauga          1
Name: Borough, dtype: int64

Restrict attention to three Toronto boroughs: Downtown Toronto, West Toronto, and East Toronto.

In [11]:
dt_data = df_final[df_final['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
west_data = df_final[df_final['Borough'] == 'West Toronto'].reset_index(drop=True)
east_data = df_final[df_final['Borough'] == 'East Toronto'].reset_index(drop=True)

toronto_data = pd.concat([dt_data,west_data,east_data])
df_final = toronto_data

In [12]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_final['Latitude'], df_final['Longitude'], df_final['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [13]:
CLIENT_ID = 'ZQFK2A1YUACY2KB5JKHCJTDBAGERBSTBWA2NNWYIWEDDQDYS' # your Foursquare ID
CLIENT_SECRET = 'CXKA4WPR0FXJCF5BU2SEA0E5WBPPXGO44ZZ2TF1GZTCVSUK3' # your Foursquare Secret
VERSION = '20200508' # Foursquare API version

In [14]:
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
toronto_venues = getNearbyVenues(names=df_final['Neighborhood'],
                                 latitudes=df_final['Latitude'],
                                 longitudes=df_final['Longitude'])

Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Queen's Park, Ontario Provincial Government
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High Park, The Junction South
Parkdale, Roncesvalles
Runnymede, Swansea
The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Business reply mail Processing Centre


**Check the dimensions so they can be compared against later processing steps**

In [16]:
print("Records : {}".format(toronto_venues.shape[0]))
print("Categories: {}".format(len(toronto_venues['Venue Category'].unique())))
print("Neighborhoods: {}".format(len(toronto_venues['Neighborhood'].unique())))

Records : 1507
Categories: 229
Neighborhoods: 30


In [17]:
toronto_venues[toronto_venues['Neighborhood']=='Upper Rouge']

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category


## Step 2: Process data for clustering algorithm

**Construct one-hot matrix for venue categories**

In [18]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']],prefix="",prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

col_loc = toronto_onehot.columns.get_loc('Neighborhood')
fixed_columns = [toronto_onehot.columns[col_loc]] + list(toronto_onehot.columns[:col_loc]) + list(toronto_onehot.columns[(col_loc+1):])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing Centre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016129,0.0,0.0,0.0,0.0,0.016129
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.012821,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025641
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"First Canadian Place, Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.01,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0


In [19]:
toronto_grouped[toronto_grouped['Neighborhood']=='Upper Rouge']

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio


**Organize the top venues in each neighborhood for clustering features**

In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

(30, 11)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Cheese Shop,Restaurant,Café,Beer Bar,Seafood Restaurant,Bistro,Butcher
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Climbing Gym,Restaurant,Burrito Place,Italian Restaurant,Intersection,Bar,Stadium
2,Business reply mail Processing Centre,Yoga Studio,Restaurant,Spa,Light Rail Station,Fast Food Restaurant,Auto Workshop,Farmers Market,Recording Studio,Pizza Place,Butcher
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Harbor / Marina,Sculpture Garden,Rental Car Location,Coffee Shop,Plane,Boutique
4,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Bubble Tea Shop,Bar,Ice Cream Shop,Thai Restaurant,Salad Place,Japanese Restaurant


## Step 3: Cluster and Map

In [22]:
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10] 

array([5, 5, 7, 4, 0, 8, 5, 5, 2, 5])

In [23]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_final
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,3,Park,Trail,Playground,Cuban Restaurant,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,5,Coffee Shop,Restaurant,Bakery,Italian Restaurant,Café,Pizza Place,Pharmacy,Pet Store,Park,Chinese Restaurant
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,5,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Restaurant,Yoga Studio,Men's Store,Mediterranean Restaurant,Hotel,Smoke Shop,Gay Bar
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,5,Coffee Shop,Pub,Park,Bakery,Café,Restaurant,Breakfast Spot,Theater,Dessert Shop,Spa
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,5,Clothing Store,Coffee Shop,Café,Restaurant,Middle Eastern Restaurant,Italian Restaurant,Cosmetics Shop,Japanese Restaurant,Bubble Tea Shop,Ramen Restaurant
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,5,Café,Coffee Shop,Gastropub,Cocktail Bar,American Restaurant,Clothing Store,Italian Restaurant,Seafood Restaurant,Beer Bar,Moroccan Restaurant
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,5,Coffee Shop,Cocktail Bar,Bakery,Cheese Shop,Restaurant,Café,Beer Bar,Seafood Restaurant,Bistro,Butcher
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Bubble Tea Shop,Bar,Ice Cream Shop,Thai Restaurant,Salad Place,Japanese Restaurant
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,5,Coffee Shop,Café,Restaurant,Clothing Store,Gym,Thai Restaurant,Hotel,Deli / Bodega,Steakhouse,Bookstore
9,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,5,Coffee Shop,Aquarium,Café,Hotel,Fried Chicken Joint,Restaurant,Sporting Goods Shop,Brewery,Italian Restaurant,Scenic Lookout


In [24]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters