Converting the Wikipedia page into a dataframe

In [1]:
!pip install BeautifulSoup4
!pip install requests

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 5.7MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.1 soupsieve-2.0.1


In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize 

import folium 
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.0.0                |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-2.0.0-pyh9f0ad1d_0



Downloading and Extracting Packages
geopy-2.0.0          | 63 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ################################

In [14]:
!pip install lxml



In [60]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(source.content, features="html")

table = soup.find("table")
table_rows = table.tbody.find_all("tr")

res = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [tr.text for tr in td]
    
    # Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
    if row != [] and row[1] != "Not assigned":
        # If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.
        if "Not assigned" in row[2]: 
            row[2] = row[1]
        res.append(row)

# Dataframe with 3 columns
df = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [61]:
#Remove the /n from data - cleanup
df["Neighborhood"] = df["Neighborhood"].str.replace("\n","")
df["Borough"] = df["Borough"].str.replace("\n","")
df["PostalCode"] = df["PostalCode"].str.replace("\n","")
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [64]:
df = df[df['PostalCode'] != 'Not assigned']
df = df[df['Borough'] != 'Not assigned']
df = df[df['Neighborhood'] != 'Not assigned']

In [71]:
df = df.groupby(["PostalCode", "Borough"])["Neighborhood"].apply(", ".join).reset_index()
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [70]:
df.shape

(103, 3)

**Get the latitude and the longitude coordinates of each neighborhood**

In [73]:
#Using csv file as geospatial wasn't working
df_geo_coor = pd.read_csv("https://cocl.us/Geospatial_data")
df_geo_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [74]:
df_toronto = pd.merge(df, df_geo_coor, how='left', left_on = 'PostalCode', right_on = 'Postal Code')
# remove the "Postal Code" column
df_toronto.drop("Postal Code", axis=1, inplace=True)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


**Explore and create clusters of neighbourhoods in Toronto**

In [75]:
address = "Toronto, ON"

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto city are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto city are 43.6534817, -79.3839347.


In [76]:
#Use folium to map the above, to test it shows Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
map_toronto

In [77]:
#Insert markers
for lat, lng, borough, neighborhood in zip(
        df_toronto['Latitude'], 
        df_toronto['Longitude'], 
        df_toronto['Borough'], 
        df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

In [88]:
# Going to filter out the boroughs of East York and East Toronto
df_toronto_east = df_toronto[df_toronto['Borough'].str.contains("East")].reset_index(drop=True)
df_toronto_east

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M4E,East Toronto,The Beaches,43.676357,-79.293031
3,M4G,East York,Leaside,43.70906,-79.363452
4,M4H,East York,Thorncliffe Park,43.705369,-79.349372
5,M4J,East York,"East Toronto, Broadview North (Old East York)",43.685347,-79.338106
6,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
7,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
8,M4M,East Toronto,Studio District,43.659526,-79.340923
9,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558


In [90]:
map_toronto_east = folium.Map(location=[latitude, longitude], zoom_start=12)
for lat, lng, borough, neighborhood in zip(
        df_toronto_east['Latitude'], 
        df_toronto_east['Longitude'], 
        df_toronto_east['Borough'], 
        df_toronto_east['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_east)  

map_toronto_east

**Insert Foursquare Credentials**

In [91]:
CLIENT_ID = 'N3UR2II2GGZX23CYO25LX2CAHN0FGREWWGQJXIEBHQYIRO5U'
CLIENT_SECRET = 'GNPJR24XLZ4MU4TPX5YWGZA5NYFCQ0PISDL4ARQBNCQDNOUY'
VERSION = '20180605'

**Explore neighbourhoods**

In [96]:
#Pick a neighbourhood to explore; using first neighborhood in 'East' list created
neighborhood_name = df_toronto_east.loc[0, 'Neighborhood']
print(f"The first neighborhood's name is '{neighborhood_name}'.")

The first neighborhood's name is 'Parkview Hill, Woodbine Gardens'.


In [97]:
#Get latitude and longitude of neighbourhood
neighborhood_latitude = df_toronto_east.loc[0, 'Latitude'] 
neighborhood_longitude = df_toronto_east.loc[0, 'Longitude'] 

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkview Hill, Woodbine Gardens are 43.7063972, -79.309937.


**Get 100 venues within a radius of 500m**

In [98]:
LIMIT = 100 
radius = 500 
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

results = requests.get(url).json()

#Get the cateogry of the venue

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [100]:
#Cleanup data in the json file
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Jawny Bakers,Gastropub,43.705783,-79.312913
1,East York Gymnastics,Gym / Fitness Center,43.710654,-79.309279
2,TD Canada Trust,Bank,43.70574,-79.31227
3,Shoppers Drug Mart,Pharmacy,43.705933,-79.312825
4,Pizza Pizza,Pizza Place,43.705159,-79.31313
5,Rise & Dine Eatery,Breakfast Spot,43.705769,-79.311638
6,Nostalgia,Café,43.706833,-79.311783
7,St. Clair Ave E & O'Connor Dr,Intersection,43.705233,-79.313274
8,Venice Pizza,Pizza Place,43.705921,-79.313957
9,Harvey's,Fast Food Restaurant,43.708136,-79.314105


In [105]:
#Explore all neighborhoods of Parkview Hill

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']

        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#assign the above to ParkviewHill
toronto_east_venues = getNearbyVenues(names=df_toronto_east ['Neighborhood'],
                                   latitudes=df_toronto_east['Latitude'],
                                   longitudes=df_toronto_east['Longitude']
                                  )
toronto_east_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Jawny Bakers,43.705783,-79.312913,Gastropub
1,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,East York Gymnastics,43.710654,-79.309279,Gym / Fitness Center
2,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,TD Canada Trust,43.70574,-79.31227,Bank
3,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Shoppers Drug Mart,43.705933,-79.312825,Pharmacy
4,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Pizza Pizza,43.705159,-79.31313,Pizza Place


In [107]:
#count of number of neighbourhoods
toronto_east_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",17,17,17,17,17,17
"East Toronto, Broadview North (Old East York)",3,3,3,3,3,3
"India Bazaar, The Beaches West",23,23,23,23,23,23
Leaside,33,33,33,33,33,33
"Parkview Hill, Woodbine Gardens",11,11,11,11,11,11
Studio District,40,40,40,40,40,40
The Beaches,4,4,4,4,4,4
"The Danforth West, Riverdale",43,43,43,43,43,43
Thorncliffe Park,23,23,23,23,23,23
Woodbine Heights,6,6,6,6,6,6


In [108]:
#Analyse neighborhoods

toronto_east_one = pd.get_dummies(toronto_east_venues[['Venue Category']], prefix="", prefix_sep="")

toronto_east_one ['Neighborhood'] = toronto_east_venues['Neighborhood'] 
fixed_columns = [toronto_east_one.columns[-1]] + list(toronto_east_one.columns[:-1])
toronto_east_one = toronto_east_one[fixed_columns]

toronto_east_one.head()

Unnamed: 0,Yoga Studio,American Restaurant,Athletics & Sports,Auto Workshop,Bagel Shop,Bakery,Bank,Bar,Beer Store,Bike Shop,...,Sporting Goods Shop,Sports Bar,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Trail,Warehouse Store,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [110]:
#Group by neighbourhood
toronto_east_grouped = toronto_east_one.groupby('Neighborhood').mean().reset_index()
toronto_east_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,American Restaurant,Athletics & Sports,Auto Workshop,Bagel Shop,Bakery,Bank,Bar,Beer Store,...,Sporting Goods Shop,Sports Bar,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Trail,Warehouse Store,Wine Bar
0,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"East Toronto, Broadview North (Old East York)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"India Bazaar, The Beaches West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0
3,Leaside,0.0,0.0,0.0,0.0,0.030303,0.0,0.060606,0.0,0.030303,...,0.060606,0.030303,0.0,0.0,0.030303,0.030303,0.0,0.0,0.0,0.0
4,"Parkview Hill, Woodbine Gardens",0.0,0.0,0.090909,0.0,0.0,0.0,0.090909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
#Most common venues
def most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_east_grouped['Neighborhood']

for ind in np.arange(toronto_east_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = most_common_venues(toronto_east_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Business reply mail Processing Centre, South C...",Garden Center,Smoke Shop,Butcher,Comic Shop,Park,Burrito Place,Pizza Place,Restaurant,Brewery,Skate Park
1,"East Toronto, Broadview North (Old East York)",Park,Coffee Shop,Convenience Store,Dessert Shop,Clothing Store,Comfort Food Restaurant,Comic Shop,Cosmetics Shop,Coworking Space,Curling Ice
2,"India Bazaar, The Beaches West",Park,Fast Food Restaurant,Brewery,Food & Drink Shop,Gym,Coffee Shop,Ice Cream Shop,Italian Restaurant,Liquor Store,Fish & Chips Shop
3,Leaside,Coffee Shop,Burger Joint,Sporting Goods Shop,Furniture / Home Store,Bank,Mexican Restaurant,Gym,Grocery Store,Liquor Store,Fish & Chips Shop
4,"Parkview Hill, Woodbine Gardens",Pizza Place,Breakfast Spot,Athletics & Sports,Gastropub,Café,Bank,Fast Food Restaurant,Pharmacy,Gym / Fitness Center,Intersection


**Now we have a dataset that we can cluster**

In [114]:
kclusters = 5

toronto_east_grouped_clustering = toronto_east_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_east_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 0, 0, 4, 0, 3, 0, 0, 1], dtype=int32)

In [115]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_east_merged = df_toronto_east
toronto_east_merged = toronto_east_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_east_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,4,Pizza Place,Breakfast Spot,Athletics & Sports,Gastropub,Café,Bank,Fast Food Restaurant,Pharmacy,Gym / Fitness Center,Intersection
1,M4C,East York,Woodbine Heights,43.695344,-79.318389,1,Park,Skating Rink,Pharmacy,Bus Stop,Beer Store,Curling Ice,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Trail,Pub,Health Food Store,Wine Bar,Dessert Shop,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop
3,M4G,East York,Leaside,43.70906,-79.363452,0,Coffee Shop,Burger Joint,Sporting Goods Shop,Furniture / Home Store,Bank,Mexican Restaurant,Gym,Grocery Store,Liquor Store,Fish & Chips Shop
4,M4H,East York,Thorncliffe Park,43.705369,-79.349372,0,Sandwich Place,Indian Restaurant,Coffee Shop,Burger Joint,Pizza Place,Pharmacy,Park,Bus Line,Middle Eastern Restaurant,Yoga Studio


**MAP CLUSTERS**

In [122]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []
for lat, lon, poi, cluster in zip(
        toronto_east_merged['Latitude'], 
        toronto_east_merged['Longitude'], 
        toronto_east_merged['Neighborhood'], 
        toronto_east_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters