# Web Scraping

---

### Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
    
import json
import requests
from pandas.io.json import json_normalize

from sklearn.cluster import KMeans

import folium
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

from urllib.request import urlopen
from bs4 import BeautifulSoup

### Import HTML script from url

In [2]:
#url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641"
#url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050." 
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

html = urlopen(url)
soup = BeautifulSoup(html,'lxml')

### Extract the table data from html and create a dataframe

In [3]:
rows = soup.find_all('tr')
headers = rows[0].find_all('th')


list_rows = []
for row in rows:
    if len(row.find_all('td')) == 3:
        row_td = row.find_all('td')
        headers = rows[0].find_all('th')
    
        for i,v in enumerate(headers):
            cleantext = BeautifulSoup(str(v),"lxml").get_text()
            headers[i] = cleantext.rstrip('\n')
        
        for i,v in enumerate(row_td):
            cleantext = BeautifulSoup(str(v),"lxml").get_text()
            row_td[i] = cleantext.rstrip('\n')
        
        list_rows.append(row_td)
        
df = pd.DataFrame(list_rows,columns=headers)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Correct rows where [Borough] or [Neighborhood] are empty

In [4]:
# Remove rows where Borough is unassigned
df['Borough'].replace('Not assigned',float("NaN"),inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [5]:
# Check to see if any Neighborhood cells are empty and correct them with borough name
for i,n in enumerate(df['Neighborhood'] == ""):
    if n is True:
        print(str(i) + ": " + df['Neighbourhood'].iloc[i] + " , " + df['Borough'].iloc[i])

### Construct a new DataFrame 

Goal here is to only have unique postal code rows with neighborhoods appended together

In [6]:
boroughs = []
neighborhoods = []
codes = df['Postal Code'].unique()

for i,code in enumerate(codes):
    #print(code)
    #print(df['Borough'].iloc[i])
    boroughs.append(df['Borough'].iloc[i])
    post_group = df.groupby(['Postal Code']).get_group(code)
    hoods = post_group[['Neighborhood']].values

    hood_list = ""
    for n in hoods:
        hood_list += n + ", "
    hood_list = hood_list[0].rstrip(', ')
    #print(hood_list)
    #print()
    boroughs[0].rstrip(", ")
    
    neighborhoods.append(hood_list)
    

new_dic = {"PostalCode" : codes,"Borough" : boroughs, "Neighborhood" : neighborhoods}

df_final = pd.DataFrame(new_dic)
df_final

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
num_data = np.shape(df_final)[0]
print("%d rows of data" % num_data)

103 rows of data


### Add location data

In [8]:
# Read csv data into a dataframe
geo = pd.read_csv('Geospatial_coordinates.csv')
geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [9]:
# Sort neighborhood information by PostalCode before merging with coordinates
df_final.sort_values('PostalCode',inplace=True)
df_final.reset_index(drop=True,inplace=True)
df_final

df_final[['Latitude','Longitude']] = geo[['Latitude','Longitude']]
df_final

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


# Processing and Analysis

---

## Step 1: Retrieve venue data

In [10]:
df_final['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

Restrict attention to boroughs in Toronto

In [11]:
dt_data = df_final[df_final['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
west_data = df_final[df_final['Borough'] == 'West Toronto'].reset_index(drop=True)
east_data = df_final[df_final['Borough'] == 'East Toronto'].reset_index(drop=True)

toronto_data = pd.concat([dt_data,west_data,east_data])
df_final = toronto_data

In [12]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_final['Latitude'], df_final['Longitude'], df_final['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [13]:
CLIENT_ID = 'ZQFK2A1YUACY2KB5JKHCJTDBAGERBSTBWA2NNWYIWEDDQDYS' # your Foursquare ID
CLIENT_SECRET = 'CXKA4WPR0FXJCF5BU2SEA0E5WBPPXGO44ZZ2TF1GZTCVSUK3' # your Foursquare Secret
VERSION = '20200508' # Foursquare API version

In [14]:
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
toronto_venues = getNearbyVenues(names=df_final['Neighborhood'],
                                 latitudes=df_final['Latitude'],
                                 longitudes=df_final['Longitude'])

Rosedale
St. James Town, Cabbagetown


KeyError: 'groups'

**Check dimensions for comparison to further processing**

In [None]:
print("Records : {}".format(toronto_venues.shape[0]))
print("Categories: {}".format(len(toronto_venues['Venue Category'].unique())))
print("Neighborhoods: {}".format(len(toronto_venues['Neighborhood'].unique())))

In [None]:
toronto_venues[toronto_venues['Neighborhood']=='Upper Rouge']

## Step 2: Process data for clustering algorithm

**Construct one-hot matrix for venue categories**

In [None]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']],prefix="",prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

col_loc = toronto_onehot.columns.get_loc('Neighborhood')
fixed_columns = [toronto_onehot.columns[col_loc]] + list(toronto_onehot.columns[:col_loc]) + list(toronto_onehot.columns[(col_loc+1):])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

In [None]:
toronto_grouped[toronto_grouped['Neighborhood']=='Upper Rouge']

**Organize the top venues in each neighborhood for clustering features**

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

## Step 3: Cluster and Map

In [None]:
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10] 

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_final
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters