# Scraping Toronto Data

### Start of Part 1

#### Load postal code data from Wikipedia into a DataFrame

All necessary imports:

In [53]:
import pandas as pd
import requests
!pip install folium
import folium
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans


  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


Get the HTML from Wikipedia:

In [54]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
result = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were the table.
result.raise_for_status()
soup = BeautifulSoup(result.content, 'html.parser')

Extract the postal code, borough, and neighborhood from the HTML:

In [55]:
# Build one record per assigned postal code from the <td> cells of the first table.
table_contents = []
table = soup.find('table')
if table is None:
    # Without this check the loop below fails with an opaque AttributeError on None.
    raise ValueError('No <table> element found in the downloaded page')

for row in table.find_all('td'):
    span_text = row.span.text
    if span_text == 'Not assigned':
        # Unassigned postal codes are left out of the result.
        continue
    # The first three characters of the <p> text are taken as the postal code.
    postal_code = row.p.text[:3]
    # Text before the first "(" is used as the borough; text after it as the neighborhoods.
    pieces = span_text.split('(')
    neighborhood = pieces[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    table_contents.append({
        'PostalCode': postal_code,
        'Borough': pieces[0],
        'Neighborhood': neighborhood,
    })

Convert to a dataframe:

In [56]:
# Borough values that came out of the scrape with extra text fused on,
# mapped to the cleaned-up name that replaces each of them.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# One row per scraped record; only Borough values that exactly equal a key are rewritten.
df = pd.DataFrame(table_contents)
df['Borough'] = df['Borough'].replace(borough_fixes)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


Shape of resulting dataframe:

In [57]:
# (rows, columns) of the scraped table; a bare last expression uses the notebook's own display.
df.shape

(103, 3)


### End of Part 1

# Adding Latitude and Longitude Columns

### Start of Part 2


I was unable to get the geocoder to accept my requests, so the latitude/longitude values are loaded from a CSV file instead. The hidden cell below contains credential information.

In [58]:
# The code was removed by Watson Studio for sharing.

In [59]:
# `body` is not defined in any visible cell — presumably the hidden cell above creates it
# as a handle to the latitude/longitude CSV; confirm before re-running on a fresh kernel.
lldf = pd.read_csv(body)
# Preview the first rows of the coordinates table.
lldf.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Add Latitude and Longitude to wikipedia data:

In [60]:
# Left-join the coordinates onto the scraped table by postal code, then discard the
# right-hand key column, which only repeats PostalCode under a slightly different name.
fulldf = (
    df.merge(lldf, how='left', left_on='PostalCode', right_on='Postal Code')
      .drop(columns='Postal Code')
)
fulldf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


### End of Part 2

### Start of Part 3

Limit Toronto data to boroughs with "Toronto" in the name

In [61]:
# Keep only the rows whose borough name contains "Toronto";
# na=False makes a missing borough count as "no match".
is_toronto_borough = fulldf["Borough"].str.contains("Toronto", na=False)
t_df = fulldf.loc[is_toronto_borough]
t_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


#### Empty map of Toronto

In [62]:
# Centre the map on the coordinates of the first row of the Toronto subset.
latitude = t_df['Latitude'].iat[0]
longitude = t_df['Longitude'].iat[0]

map_toronto = folium.Map(zoom_start=10, location=[latitude, longitude])
map_toronto

#### Add Borough markers

In [63]:
# Add one blue circle per row of t_df to the Toronto map; each circle's popup
# reads "<neighborhood>, <borough>".
marker_rows = zip(t_df['Latitude'], t_df['Longitude'], t_df['Borough'], t_df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    # NOTE(review): parse_html is passed to CircleMarker as in the original cell;
    # it looks like a Popup option rather than a marker option — confirm it is needed.
    circle = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    circle.add_to(map_toronto)

map_toronto


#### Use Foursquare to find 20 nearby Italian restaurant names

The request URL contains the client credentials, so the cell that builds the URL is hidden.

In [65]:
# The code was removed by Watson Studio for sharing.

Get the venue names of 20 Italian restaurants within 200 meters

In [66]:
# `url` here is presumably the Foursquare request URL assigned in the hidden cell above
# (it is not the Wikipedia URL from Part 1) — confirm before re-running on a fresh kernel.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP errors (bad credentials, quota, ...) here rather than as a KeyError below.
response.raise_for_status()
results = response.json()

# The venue records sit under response -> groups[0] -> items in the returned JSON.
venues = results['response']['groups'][0]['items']
# Flatten the nested records so the venue name becomes the 'venue.name' column.
venue_df = pd.json_normalize(venues)
names_df = venue_df['venue.name']
names_df

0                           Allwyn's Bakery
1                               Tim Hortons
2                        Graydon Hall Manor
3               Donalda Golf & Country Club
4                      Galleria Supermarket
5                        Darband Restaurant
6                           Brookbanks Park
7             VIA CIBO | italian streetfood
8                        Naan & Kabob Halal
9                              What a Bagel
10                             Island Foods
11                    Starbucks Reserve Bar
12                 Me Va Me Kitchen Express
13    North Beach Indoor Volleyball Academy
14                        Mindset by Design
15                                     LCBO
16               Ghadir Mid-Eastern Grocery
17                       Kostas Meat Market
18                    CF Shops at Don Mills
19                          Shawarma Empire
Name: venue.name, dtype: object

#### Clustering

In [67]:
# Cluster on the coordinates only; copy so later edits to num_df cannot touch t_df.
num_df = t_df[['Latitude', 'Longitude']].copy()

# Run k-means clustering.
kclusters = 4
# n_init is pinned because scikit-learn's default changed from 10 to 'auto' in newer
# releases; together with random_state this keeps the labels stable across versions.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(num_df)

# Check the cluster labels generated for the first 20 rows.
kmeans.labels_[0:20]

array([3, 3, 3, 0, 3, 3, 1, 3, 1, 0, 3, 1, 0, 3, 1, 0, 3, 0, 2, 2],
      dtype=int32)

In [68]:
# Add the clustering labels.
cluster_labels = kmeans.labels_

# Keep num_df labelled as before, but without raising when this cell is re-run
# (DataFrame.insert refuses to insert a column that already exists).
if 'Cluster Labels' in num_df.columns:
    num_df['Cluster Labels'] = cluster_labels
else:
    num_df.insert(0, 'Cluster Labels', cluster_labels)

# kmeans was fitted on the rows of t_df in order, so the labels line up positionally.
# Attaching them directly avoids joining on float Latitude/Longitude, which would
# duplicate rows whenever two postal codes share the same coordinates.
merged_df = t_df.reset_index(drop=True).assign(**{'Cluster Labels': cluster_labels})
merged_df.head()  # check the last column

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,3
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3


#### Map of Clustered Neighborhoods

In [69]:
# Create the map, centred on the first clustered row (positional, not label-based, lookup).
newlat = merged_df['Latitude'].iloc[0]
newlong = merged_df['Longitude'].iloc[0]
map_clusters = folium.Map(location=[newlat, newlong], zoom_start=12)

# Set the color scheme: one evenly spaced rainbow color per cluster.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one marker per neighborhood, colored by its cluster.
for lat, lon, poi, cluster in zip(merged_df['Latitude'], merged_df['Longitude'],
                                  merged_df['Neighborhood'], merged_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run 0..kclusters-1, so they index the palette directly; the previous
    # `cluster - 1` only worked for cluster 0 through negative-index wraparound.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters


### End of Part 3